Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-05-13 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 12 May 2023 at 00:37, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c 
> > b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> > new file mode 100644
> > index 000..598a51f17c6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> > @@ -0,0 +1,20 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +
> > +#include 
> > +
> > +int16x8_t foo(int16_t x, int y)
> > +{
> > +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y};
> > +  return v;
> > +}
> > +
> > +int16x8_t foo2(int16_t x)
> > +{
> > +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
> > +  return v;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4h, w[0-9]+} 3 } } */
> > +/* { dg-final { scan-assembler {\tmovi\tv[0-9]+\.4h, 0x1} } } */
> > +/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.8h, v[0-9]+\.8h, 
> > v[0-9]+\.8h} } } */
>
> Would be good to make this a scan-assembler-times ... 2.
>
> OK with that change.  Thanks for doing this.
Thanks, committed the patch in:
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=8b18714fbb1ca9812b33b3de75fe6ba4a57d4946
after bootstrap+test on aarch64-linux-gnu, and verifying bootstrap
passes on aarch64-linux-gnu with --enable-checking=all.

Thanks,
Prathamesh
>
> Richard


Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-05-11 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c 
> b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> new file mode 100644
> index 000..598a51f17c6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +
> +#include 
> +
> +int16x8_t foo(int16_t x, int y)
> +{
> +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y}; 
> +  return v;
> +}
> +
> +int16x8_t foo2(int16_t x) 
> +{
> +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1}; 
> +  return v;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4h, w[0-9]+} 3 } } */
> +/* { dg-final { scan-assembler {\tmovi\tv[0-9]+\.4h, 0x1} } } */
> +/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.8h, v[0-9]+\.8h, 
> v[0-9]+\.8h} } } */

Would be good to make this a scan-assembler-times ... 2.

OK with that change.  Thanks for doing this.

Richard


Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-05-04 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 24 Apr 2023 at 15:00, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > [aarch64] Recursively initialize even and odd sub-parts and merge with zip1.
> >
> > gcc/ChangeLog:
> >   * config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback): 
> > Rename
> >   aarch64_expand_vector_init to this, and remove  interleaving case.
> >   Recursively call aarch64_expand_vector_init_fallback, instead of
> >   aarch64_expand_vector_init.
> >   (aarch64_unzip_vector_init): New function.
> >   (aarch64_expand_vector_init): Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >   * gcc.target/aarch64/ldp_stp_16.c (cons2_8_float): Adjust for new
> >   code-gen.
> >   * gcc.target/aarch64/sve/acle/general/dupq_5.c: Likewise.
> >   * gcc.target/aarch64/sve/acle/general/dupq_6.c: Likewise.
> >   * gcc.target/aarch64/vec-init-18.c: Rename interleave-init-1.c to
> >   this.
> >   * gcc.target/aarch64/vec-init-19.c: New test.
> >   * gcc.target/aarch64/vec-init-20.c: Likewise.
> >   * gcc.target/aarch64/vec-init-21.c: Likewise.
> >   * gcc.target/aarch64/vec-init-22-size.c: Likewise.
> >   * gcc.target/aarch64/vec-init-22-speed.c: Likewise.
> >   * gcc.target/aarch64/vec-init-22.h: New header.
> >
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index d7e895f8d34..416e062829c 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -22026,11 +22026,12 @@ aarch64_simd_make_constant (rtx vals)
> >  return NULL_RTX;
> >  }
> >
> > -/* Expand a vector initialisation sequence, such that TARGET is
> > -   initialised to contain VALS.  */
> > +/* A subroutine of aarch64_expand_vector_init, with the same interface.
> > +   The caller has already tried a divide-and-conquer approach, so do
> > +   not consider that case here.  */
> >
> >  void
> > -aarch64_expand_vector_init (rtx target, rtx vals)
> > +aarch64_expand_vector_init_fallback (rtx target, rtx vals)
> >  {
> >machine_mode mode = GET_MODE (target);
> >scalar_mode inner_mode = GET_MODE_INNER (mode);
> > @@ -22090,38 +22091,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >return;
> >  }
> >
> > -  /* Check for interleaving case.
> > - For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> > - Generate following code:
> > - dup v0.h, x
> > - dup v1.h, y
> > - zip1 v0.h, v0.h, v1.h
> > - for "large enough" initializer.  */
> > -
> > -  if (n_elts >= 8)
> > -{
> > -  int i;
> > -  for (i = 2; i < n_elts; i++)
> > - if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> > -   break;
> > -
> > -  if (i == n_elts)
> > - {
> > -   machine_mode mode = GET_MODE (target);
> > -   rtx dest[2];
> > -
> > -   for (int i = 0; i < 2; i++)
> > - {
> > -   rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> > -   dest[i] = force_reg (mode, x);
> > - }
> > -
> > -   rtvec v = gen_rtvec (2, dest[0], dest[1]);
> > -   emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> > -   return;
> > - }
> > -}
> > -
> >enum insn_code icode = optab_handler (vec_set_optab, mode);
> >gcc_assert (icode != CODE_FOR_nothing);
> >
> > @@ -22243,7 +22212,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >   }
> > XVECEXP (copy, 0, i) = subst;
> >   }
> > -  aarch64_expand_vector_init (target, copy);
> > +  aarch64_expand_vector_init_fallback (target, copy);
> >  }
> >
> >/* Insert the variable lanes directly.  */
> > @@ -22257,6 +6,81 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >  }
> >  }
> >
> > +/* Return even or odd half of VALS depending on EVEN_P.  */
> > +
> > +static rtx
> > +aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
> > +{
> > +  int n = XVECLEN (vals, 0);
> > +  machine_mode new_mode
> > += aarch64_simd_container_mode (GET_MODE_INNER (mode),
> > +GET_MODE_BITSIZE (mode).to_constant () / 
> > 2);
> > +  rtvec vec = rtvec_alloc (n / 2);
> > +  for (int i = 0; i < n/2; i++)
>
> Formatting nit: n / 2
>
> > +RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
> > +   : XVECEXP (vals, 0, 2 * i + 1);
> > +  return gen_rtx_PARALLEL (new_mode, vec);
> > +}
> > +
> > +/* Expand a vector initialisation sequence, such that TARGET is
>
> initialization
>
> > +   initialized to contain VALS.  */
> > +
> > +void
> > +aarch64_expand_vector_init (rtx target, rtx vals)
> > +{
> > +  /* Try decomposing the initializer into even and odd halves and
> > + then ZIP them together.  Use the resulting sequence if it is
> > + strictly cheaper than loading VALS directly.
> > +
> > + Prefer the fallback sequence in the event of a tie, since it
> > + will tend to use fewer registers.  */
> > +
> > +  machi

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-04-24 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> [aarch64] Recursively initialize even and odd sub-parts and merge with zip1.
>
> gcc/ChangeLog:
>   * config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback): 
> Rename
>   aarch64_expand_vector_init to this, and remove  interleaving case.
>   Recursively call aarch64_expand_vector_init_fallback, instead of
>   aarch64_expand_vector_init.
>   (aarch64_unzip_vector_init): New function.
>   (aarch64_expand_vector_init): Likewise.
>
> gcc/testsuite/ChangeLog:
>   * gcc.target/aarch64/ldp_stp_16.c (cons2_8_float): Adjust for new
>   code-gen.
>   * gcc.target/aarch64/sve/acle/general/dupq_5.c: Likewise.
>   * gcc.target/aarch64/sve/acle/general/dupq_6.c: Likewise.
>   * gcc.target/aarch64/vec-init-18.c: Rename interleave-init-1.c to
>   this.
>   * gcc.target/aarch64/vec-init-19.c: New test.
>   * gcc.target/aarch64/vec-init-20.c: Likewise.
>   * gcc.target/aarch64/vec-init-21.c: Likewise.
>   * gcc.target/aarch64/vec-init-22-size.c: Likewise.
>   * gcc.target/aarch64/vec-init-22-speed.c: Likewise.
>   * gcc.target/aarch64/vec-init-22.h: New header.
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index d7e895f8d34..416e062829c 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -22026,11 +22026,12 @@ aarch64_simd_make_constant (rtx vals)
>  return NULL_RTX;
>  }
>  
> -/* Expand a vector initialisation sequence, such that TARGET is
> -   initialised to contain VALS.  */
> +/* A subroutine of aarch64_expand_vector_init, with the same interface.
> +   The caller has already tried a divide-and-conquer approach, so do
> +   not consider that case here.  */
>  
>  void
> -aarch64_expand_vector_init (rtx target, rtx vals)
> +aarch64_expand_vector_init_fallback (rtx target, rtx vals)
>  {
>machine_mode mode = GET_MODE (target);
>scalar_mode inner_mode = GET_MODE_INNER (mode);
> @@ -22090,38 +22091,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>return;
>  }
>  
> -  /* Check for interleaving case.
> - For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> - Generate following code:
> - dup v0.h, x
> - dup v1.h, y
> - zip1 v0.h, v0.h, v1.h
> - for "large enough" initializer.  */
> -
> -  if (n_elts >= 8)
> -{
> -  int i;
> -  for (i = 2; i < n_elts; i++)
> - if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> -   break;
> -
> -  if (i == n_elts)
> - {
> -   machine_mode mode = GET_MODE (target);
> -   rtx dest[2];
> -
> -   for (int i = 0; i < 2; i++)
> - {
> -   rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> -   dest[i] = force_reg (mode, x);
> - }
> -
> -   rtvec v = gen_rtvec (2, dest[0], dest[1]);
> -   emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> -   return;
> - }
> -}
> -
>enum insn_code icode = optab_handler (vec_set_optab, mode);
>gcc_assert (icode != CODE_FOR_nothing);
>  
> @@ -22243,7 +22212,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>   }
> XVECEXP (copy, 0, i) = subst;
>   }
> -  aarch64_expand_vector_init (target, copy);
> +  aarch64_expand_vector_init_fallback (target, copy);
>  }
>  
>/* Insert the variable lanes directly.  */
> @@ -22257,6 +6,81 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>  }
>  }
>  
> +/* Return even or odd half of VALS depending on EVEN_P.  */
> +
> +static rtx
> +aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
> +{
> +  int n = XVECLEN (vals, 0);
> +  machine_mode new_mode
> += aarch64_simd_container_mode (GET_MODE_INNER (mode),
> +GET_MODE_BITSIZE (mode).to_constant () / 2);
> +  rtvec vec = rtvec_alloc (n / 2);
> +  for (int i = 0; i < n/2; i++)

Formatting nit: n / 2

> +RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
> +   : XVECEXP (vals, 0, 2 * i + 1);
> +  return gen_rtx_PARALLEL (new_mode, vec);
> +}
> +
> +/* Expand a vector initialisation sequence, such that TARGET is

initialization

> +   initialized to contain VALS.  */
> +
> +void
> +aarch64_expand_vector_init (rtx target, rtx vals)
> +{
> +  /* Try decomposing the initializer into even and odd halves and
> + then ZIP them together.  Use the resulting sequence if it is
> + strictly cheaper than loading VALS directly.
> +
> + Prefer the fallback sequence in the event of a tie, since it
> + will tend to use fewer registers.  */
> +
> +  machine_mode mode = GET_MODE (target);
> +  int n_elts = XVECLEN (vals, 0);
> +
> +  if (n_elts < 4
> +  || maybe_ne (GET_MODE_BITSIZE (mode), 128))
> +{
> +  aarch64_expand_vector_init_fallback (target, vals);
> +  return;
> +}
> +
> +  start_sequence ();
> +  rtx halves[2];
> +  unsigned costs[2];

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-04-22 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 21 Apr 2023 at 20:45, Prathamesh Kulkarni
 wrote:
>
> On Fri, 21 Apr 2023 at 14:47, Richard Sandiford
>  wrote:
> >
> > Prathamesh Kulkarni  writes:
> > > Hi,
> > > I tested the interleave+zip1 for vector init patch and it segfaulted
> > > during bootstrap while trying to build
> > > libgfortran/generated/matmul_i2.c.
> > > Rebuilding with --enable-checking=rtl showed out of bounds access in
> > > aarch64_unzip_vector_init in following hunk:
> > >
> > > +  rtvec vec = rtvec_alloc (n / 2);
> > > +  for (int i = 0; i < n; i++)
> > > +RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
> > > + : XVECEXP (vals, 0, 2 * i + 1);
> > >
> > > which is incorrect since it allocates n/2 but iterates and stores up to n.
> > > The attached patch fixes the issue, which passed bootstrap, however
> > > resulted in following fallout during testsuite run:
> > >
> > > 1] sve/acle/general/dupq_[1-4].c tests fail.
> > > For the following test:
> > > int32x4_t f(int32_t x)
> > > {
> > >   return (int32x4_t) { x, 1, 2, 3 };
> > > }
> > >
> > > Code-gen without patch:
> > > f:
> > > adrpx1, .LC0
> > > ldr q0, [x1, #:lo12:.LC0]
> > > ins v0.s[0], w0
> > > ret
> > >
> > > Code-gen with patch:
> > > f:
> > > moviv0.2s, 0x2
> > > adrpx1, .LC0
> > > ldr d1, [x1, #:lo12:.LC0]
> > > ins v0.s[0], w0
> > > zip1v0.4s, v0.4s, v1.4s
> > > ret
> > >
> > > It shows, fallback_seq_cost = 20, seq_total_cost = 16
> > > where seq_total_cost determines the cost for interleave+zip1 sequence
> > > and fallback_seq_cost is the cost for fallback sequence.
> > > Although it shows a lesser cost, I am not sure if the interleave+zip1
> > > sequence is better in this case ?
> >
> > Debugging the patch, it looks like this is because the fallback sequence
> > contains a redundant pseudo-to-pseudo move, which is costed as 1
> > instruction (4 units).  The RTL equivalent of the:
> >
> >  moviv0.2s, 0x2
> >  ins v0.s[0], w0
> >
> > has a similar redundant move, but the cost of that move is subsumed by
> > the cost of the other arm (the load from LC0), which is costed as 3
> > instructions (12 units).  So we have 12 + 4 for the parallel version
> > (correct) but 12 + 4 + 4 for the serial version (one instruction too
> > many).
> >
> > The reason we have redundant moves is that the expansion code uses
> > copy_to_mode_reg to force a value into a register.  This creates a
> > new pseudo even if the original value was already a register.
> > Using force_reg removes the moves and makes the test pass.
> >
> > So I think the first step is to use force_reg instead of
> > copy_to_mode_reg in aarch64_simd_dup_constant and
> > aarch64_expand_vector_init (as a preparatory patch).
> Thanks for the clarification!
> >
> > > 2] sve/acle/general/dupq_[5-6].c tests fail:
> > > int32x4_t f(int32_t x0, int32_t x1, int32_t x2, int32_t x3)
> > > {
> > >   return (int32x4_t) { x0, x1, x2, x3 };
> > > }
> > >
> > > code-gen without patch:
> > > f:
> > > fmovs0, w0
> > > ins v0.s[1], w1
> > > ins v0.s[2], w2
> > > ins v0.s[3], w3
> > > ret
> > >
> > > code-gen with patch:
> > > f:
> > > fmovs0, w0
> > > fmovs1, w1
> > > ins v0.s[1], w2
> > > ins v1.s[1], w3
> > > zip1v0.4s, v0.4s, v1.4s
> > > ret
> > >
> > > It shows fallback_seq_cost = 28, seq_total_cost = 16
> >
> > The zip version still wins after the fix above, but by a lesser amount.
> > It seems like a borderline case.
> >
> > >
> > > 3] aarch64/ldp_stp_16.c's cons2_8_float test fails.
> > > Test case:
> > > void cons2_8_float(float *x, float val0, float val1)
> > > {
> > > #pragma GCC unroll(8)
> > >   for (int i = 0; i < 8 * 2; i += 2) {
> > > x[i + 0] = val0;
> > > x[i + 1] = val1;
> > >   }
> > > }
> > >
> > > which is lowered to:
> > > void cons2_8_float (float * x, float val0, float val1)
> > > {
> > >   vector(4) float _86;
> > >
> > >[local count: 119292720]:
> > >   _86 = {val0_11(D), val1_13(D), val0_11(D), val1_13(D)};
> > >   MEM  [(float *)x_10(D)] = _86;
> > >   MEM  [(float *)x_10(D) + 16B] = _86;
> > >   MEM  [(float *)x_10(D) + 32B] = _86;
> > >   MEM  [(float *)x_10(D) + 48B] = _86;
> > >   return;
> > > }
> > >
> > > code-gen without patch:
> > > cons2_8_float:
> > > dup v0.4s, v0.s[0]
> > > ins v0.s[1], v1.s[0]
> > > ins v0.s[3], v1.s[0]
> > > stp q0, q0, [x0]
> > > stp q0, q0, [x0, 32]
> > > ret
> > >
> > > code-gen with patch:
> > > cons2_8_float:
> > > dup v1.2s, v1.s[0]
> > > dup v0.2s, v0.s[0]
> > > zip1v0.4s, v0.4s, v1.4s
> > > stp q0, q0, [x0]
> > > stp q0, q0, [x0, 32]
> > > ret
> > >
> > > It shows fallback_seq_cost = 28, seq_total_cost = 16
> > >
> > > I think the test 

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-04-21 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 21 Apr 2023 at 14:47, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > Hi,
> > I tested the interleave+zip1 for vector init patch and it segfaulted
> > during bootstrap while trying to build
> > libgfortran/generated/matmul_i2.c.
> > Rebuilding with --enable-checking=rtl showed out of bounds access in
> > aarch64_unzip_vector_init in following hunk:
> >
> > +  rtvec vec = rtvec_alloc (n / 2);
> > +  for (int i = 0; i < n; i++)
> > +RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
> > + : XVECEXP (vals, 0, 2 * i + 1);
> >
> > which is incorrect since it allocates n/2 but iterates and stores up to n.
> > The attached patch fixes the issue, which passed bootstrap, however
> > resulted in following fallout during testsuite run:
> >
> > 1] sve/acle/general/dupq_[1-4].c tests fail.
> > For the following test:
> > int32x4_t f(int32_t x)
> > {
> >   return (int32x4_t) { x, 1, 2, 3 };
> > }
> >
> > Code-gen without patch:
> > f:
> > adrpx1, .LC0
> > ldr q0, [x1, #:lo12:.LC0]
> > ins v0.s[0], w0
> > ret
> >
> > Code-gen with patch:
> > f:
> > moviv0.2s, 0x2
> > adrpx1, .LC0
> > ldr d1, [x1, #:lo12:.LC0]
> > ins v0.s[0], w0
> > zip1v0.4s, v0.4s, v1.4s
> > ret
> >
> > It shows, fallback_seq_cost = 20, seq_total_cost = 16
> > where seq_total_cost determines the cost for interleave+zip1 sequence
> > and fallback_seq_cost is the cost for fallback sequence.
> > Although it shows a lesser cost, I am not sure if the interleave+zip1
> > sequence is better in this case ?
>
> Debugging the patch, it looks like this is because the fallback sequence
> contains a redundant pseudo-to-pseudo move, which is costed as 1
> instruction (4 units).  The RTL equivalent of the:
>
>  moviv0.2s, 0x2
>  ins v0.s[0], w0
>
> has a similar redundant move, but the cost of that move is subsumed by
> the cost of the other arm (the load from LC0), which is costed as 3
> instructions (12 units).  So we have 12 + 4 for the parallel version
> (correct) but 12 + 4 + 4 for the serial version (one instruction too
> many).
>
> The reason we have redundant moves is that the expansion code uses
> copy_to_mode_reg to force a value into a register.  This creates a
> new pseudo even if the original value was already a register.
> Using force_reg removes the moves and makes the test pass.
>
> So I think the first step is to use force_reg instead of
> copy_to_mode_reg in aarch64_simd_dup_constant and
> aarch64_expand_vector_init (as a preparatory patch).
Thanks for the clarification!
>
> > 2] sve/acle/general/dupq_[5-6].c tests fail:
> > int32x4_t f(int32_t x0, int32_t x1, int32_t x2, int32_t x3)
> > {
> >   return (int32x4_t) { x0, x1, x2, x3 };
> > }
> >
> > code-gen without patch:
> > f:
> > fmovs0, w0
> > ins v0.s[1], w1
> > ins v0.s[2], w2
> > ins v0.s[3], w3
> > ret
> >
> > code-gen with patch:
> > f:
> > fmovs0, w0
> > fmovs1, w1
> > ins v0.s[1], w2
> > ins v1.s[1], w3
> > zip1v0.4s, v0.4s, v1.4s
> > ret
> >
> > It shows fallback_seq_cost = 28, seq_total_cost = 16
>
> The zip version still wins after the fix above, but by a lesser amount.
> It seems like a borderline case.
>
> >
> > 3] aarch64/ldp_stp_16.c's cons2_8_float test fails.
> > Test case:
> > void cons2_8_float(float *x, float val0, float val1)
> > {
> > #pragma GCC unroll(8)
> >   for (int i = 0; i < 8 * 2; i += 2) {
> > x[i + 0] = val0;
> > x[i + 1] = val1;
> >   }
> > }
> >
> > which is lowered to:
> > void cons2_8_float (float * x, float val0, float val1)
> > {
> >   vector(4) float _86;
> >
> >[local count: 119292720]:
> >   _86 = {val0_11(D), val1_13(D), val0_11(D), val1_13(D)};
> >   MEM  [(float *)x_10(D)] = _86;
> >   MEM  [(float *)x_10(D) + 16B] = _86;
> >   MEM  [(float *)x_10(D) + 32B] = _86;
> >   MEM  [(float *)x_10(D) + 48B] = _86;
> >   return;
> > }
> >
> > code-gen without patch:
> > cons2_8_float:
> > dup v0.4s, v0.s[0]
> > ins v0.s[1], v1.s[0]
> > ins v0.s[3], v1.s[0]
> > stp q0, q0, [x0]
> > stp q0, q0, [x0, 32]
> > ret
> >
> > code-gen with patch:
> > cons2_8_float:
> > dup v1.2s, v1.s[0]
> > dup v0.2s, v0.s[0]
> > zip1v0.4s, v0.4s, v1.4s
> > stp q0, q0, [x0]
> > stp q0, q0, [x0, 32]
> > ret
> >
> > It shows fallback_seq_cost = 28, seq_total_cost = 16
> >
> > I think the test fails because it doesn't match:
> > **  dup v([0-9]+)\.4s, .*
> >
> > Shall it be OK to amend the test assuming code-gen with patch is better ?
>
> Yeah, the new code seems like an improvement.
>
> > 4] aarch64/pr109072_1.c s32x4_3 test fails:
> > For the following test:
> > int32x4_t s32x4_3 (int32_t x, int32_t y)
> > {
> >   int32_t arr[] = {

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-04-21 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> Hi,
> I tested the interleave+zip1 for vector init patch and it segfaulted
> during bootstrap while trying to build
> libgfortran/generated/matmul_i2.c.
> Rebuilding with --enable-checking=rtl showed out of bounds access in
> aarch64_unzip_vector_init in following hunk:
>
> +  rtvec vec = rtvec_alloc (n / 2);
> +  for (int i = 0; i < n; i++)
> +RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
> + : XVECEXP (vals, 0, 2 * i + 1);
>
> which is incorrect since it allocates n/2 but iterates and stores up to n.
> The attached patch fixes the issue, which passed bootstrap, however
> resulted in following fallout during testsuite run:
>
> 1] sve/acle/general/dupq_[1-4].c tests fail.
> For the following test:
> int32x4_t f(int32_t x)
> {
>   return (int32x4_t) { x, 1, 2, 3 };
> }
>
> Code-gen without patch:
> f:
> adrpx1, .LC0
> ldr q0, [x1, #:lo12:.LC0]
> ins v0.s[0], w0
> ret
>
> Code-gen with patch:
> f:
> moviv0.2s, 0x2
> adrpx1, .LC0
> ldr d1, [x1, #:lo12:.LC0]
> ins v0.s[0], w0
> zip1v0.4s, v0.4s, v1.4s
> ret
>
> It shows, fallback_seq_cost = 20, seq_total_cost = 16
> where seq_total_cost determines the cost for interleave+zip1 sequence
> and fallback_seq_cost is the cost for fallback sequence.
> Although it shows a lesser cost, I am not sure if the interleave+zip1
> sequence is better in this case ?

Debugging the patch, it looks like this is because the fallback sequence
contains a redundant pseudo-to-pseudo move, which is costed as 1
instruction (4 units).  The RTL equivalent of the:

 moviv0.2s, 0x2
 ins v0.s[0], w0

has a similar redundant move, but the cost of that move is subsumed by
the cost of the other arm (the load from LC0), which is costed as 3
instructions (12 units).  So we have 12 + 4 for the parallel version
(correct) but 12 + 4 + 4 for the serial version (one instruction too
many).

The reason we have redundant moves is that the expansion code uses
copy_to_mode_reg to force a value into a register.  This creates a
new pseudo even if the original value was already a register.
Using force_reg removes the moves and makes the test pass.

So I think the first step is to use force_reg instead of
copy_to_mode_reg in aarch64_simd_dup_constant and
aarch64_expand_vector_init (as a preparatory patch).

> 2] sve/acle/general/dupq_[5-6].c tests fail:
> int32x4_t f(int32_t x0, int32_t x1, int32_t x2, int32_t x3)
> {
>   return (int32x4_t) { x0, x1, x2, x3 };
> }
>
> code-gen without patch:
> f:
> fmovs0, w0
> ins v0.s[1], w1
> ins v0.s[2], w2
> ins v0.s[3], w3
> ret
>
> code-gen with patch:
> f:
> fmovs0, w0
> fmovs1, w1
> ins v0.s[1], w2
> ins v1.s[1], w3
> zip1v0.4s, v0.4s, v1.4s
> ret
>
> It shows fallback_seq_cost = 28, seq_total_cost = 16

The zip verson still wins after the fix above, but by a lesser amount.
It seems like a borderline case.

>
> 3] aarch64/ldp_stp_16.c's cons2_8_float test fails.
> Test case:
> void cons2_8_float(float *x, float val0, float val1)
> {
> #pragma GCC unroll(8)
>   for (int i = 0; i < 8 * 2; i += 2) {
> x[i + 0] = val0;
> x[i + 1] = val1;
>   }
> }
>
> which is lowered to:
> void cons2_8_float (float * x, float val0, float val1)
> {
>   vector(4) float _86;
>
>[local count: 119292720]:
>   _86 = {val0_11(D), val1_13(D), val0_11(D), val1_13(D)};
>   MEM  [(float *)x_10(D)] = _86;
>   MEM  [(float *)x_10(D) + 16B] = _86;
>   MEM  [(float *)x_10(D) + 32B] = _86;
>   MEM  [(float *)x_10(D) + 48B] = _86;
>   return;
> }
>
> code-gen without patch:
> cons2_8_float:
> dup v0.4s, v0.s[0]
> ins v0.s[1], v1.s[0]
> ins v0.s[3], v1.s[0]
> stp q0, q0, [x0]
> stp q0, q0, [x0, 32]
> ret
>
> code-gen with patch:
> cons2_8_float:
> dup v1.2s, v1.s[0]
> dup v0.2s, v0.s[0]
> zip1v0.4s, v0.4s, v1.4s
> stp q0, q0, [x0]
> stp q0, q0, [x0, 32]
> ret
>
> It shows fallback_seq_cost = 28, seq_total_cost = 16
>
> I think the test fails because it doesn't match:
> **  dup v([0-9]+)\.4s, .*
>
> Shall it be OK to amend the test assuming code-gen with patch is better ?

Yeah, the new code seems like an improvement.

> 4] aarch64/pr109072_1.c s32x4_3 test fails:
> For the following test:
> int32x4_t s32x4_3 (int32_t x, int32_t y)
> {
>   int32_t arr[] = { x, y, y, y };
>   return vld1q_s32 (arr);
> }
>
> code-gen without patch:
> s32x4_3:
> dup v0.4s, w1
> ins v0.s[0], w0
> ret
>
> code-gen with patch:
> s32x4_3:
> fmovs1, w1
> fmovs0, w0
> ins v0.s[1], v1.s[0]
> dup v1.2s, v1.s[0]
> zip1v0.4s, v0.4s, v1.4s
> ret
>
> It shows fallback_

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-04-21 Thread Prathamesh Kulkarni via Gcc-patches
On Wed, 12 Apr 2023 at 14:29, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Thu, 6 Apr 2023 at 16:05, Richard Sandiford
> >  wrote:
> >>
> >> Prathamesh Kulkarni  writes:
> >> > On Tue, 4 Apr 2023 at 23:35, Richard Sandiford
> >> >  wrote:
> >> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
> >> >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> > index cd9cace3c9b..3de79060619 100644
> >> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> > @@ -817,6 +817,62 @@ public:
> >> >> >
> >> >> >  class svdupq_impl : public quiet
> >> >> >  {
> >> >> > +private:
> >> >> > +  gimple *
> >> >> > +  fold_nonconst_dupq (gimple_folder &f, unsigned factor) const
> >> >> > +  {
> >> >> > +/* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
> >> >> > +   tmp = {arg0, arg1, ..., arg}
> >> >> > +   lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
> >> >> > +
> >> >> > +/* TODO: Revisit to handle factor by padding zeros.  */
> >> >> > +if (factor > 1)
> >> >> > +  return NULL;
> >> >>
> >> >> Isn't the key thing here predicate vs. vector rather than factor == 1 
> >> >> vs.
> >> >> factor != 1?  Do we generate good code for b8, where factor should be 1?
> >> > Hi,
> >> > It generates the following code for svdup_n_b8:
> >> > https://pastebin.com/ypYt590c
> >>
> >> Hmm, yeah, not pretty :-)  But it's not pretty without either.
> >>
> >> > I suppose lowering to ctor+vec_perm_expr is not really useful
> >> > for this case because it won't simplify ctor, unlike the above case of
> >> > svdupq_s32 (x[0], x[1], x[2], x[3]);
> >> > However I wonder if it's still a good idea to lower svdupq for 
> >> > predicates, for
> >> > representing svdupq (or other intrinsics) using GIMPLE constructs as
> >> > far as possible ?
> >>
> >> It's possible, but I think we'd need an example in which it's a clear
> >> benefit.
> > Sorry I posted for wrong test case above.
> > For the following test:
> > svbool_t f(uint8x16_t x)
> > {
> >   return svdupq_n_b8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
> > x[8], x[9], x[10], x[11], x[12],
> > x[13], x[14], x[15]);
> > }
> >
> > Code-gen:
> > https://pastebin.com/maexgeJn
> >
> > I suppose it's equivalent to following ?
> >
> > svbool_t f2(uint8x16_t x)
> > {
> >   svuint8_t tmp = svdupq_n_u8 ((bool) x[0], (bool) x[1], (bool) x[2],
> > (bool) x[3],
> >(bool) x[4], (bool) x[5], (bool) x[6],
> > (bool) x[7],
> >(bool) x[8], (bool) x[9], (bool) x[10],
> > (bool) x[11],
> >(bool) x[12], (bool) x[13], (bool)
> > x[14], (bool) x[15]);
> >   return svcmpne_n_u8 (svptrue_b8 (), tmp, 0);
> > }
>
> Yeah, this is essentially the transformation that the svdupq rtl
> expander uses.  It would probably be a good idea to do that in
> gimple too.
Hi,
I tested the interleave+zip1 for vector init patch and it segfaulted
during bootstrap while trying to build
libgfortran/generated/matmul_i2.c.
Rebuilding with --enable-checking=rtl showed out of bounds access in
aarch64_unzip_vector_init in following hunk:

+  rtvec vec = rtvec_alloc (n / 2);
+  for (int i = 0; i < n; i++)
+RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
+ : XVECEXP (vals, 0, 2 * i + 1);

which is incorrect since it allocates n/2 but iterates and stores up to n.
The attached patch fixes the issue, which passed bootstrap, however
resulted in following fallout during testsuite run:

1] sve/acle/general/dupq_[1-4].c tests fail.
For the following test:
int32x4_t f(int32_t x)
{
  return (int32x4_t) { x, 1, 2, 3 };
}

Code-gen without patch:
f:
adrpx1, .LC0
ldr q0, [x1, #:lo12:.LC0]
ins v0.s[0], w0
ret

Code-gen with patch:
f:
moviv0.2s, 0x2
adrpx1, .LC0
ldr d1, [x1, #:lo12:.LC0]
ins v0.s[0], w0
zip1v0.4s, v0.4s, v1.4s
ret

It shows, fallback_seq_cost = 20, seq_total_cost = 16
where seq_total_cost determines the cost for interleave+zip1 sequence
and fallback_seq_cost is the cost for fallback sequence.
Although it shows a lesser cost, I am not sure if the interleave+zip1
sequence is better in this case ?

2] sve/acle/general/dupq_[5-6].c tests fail:
int32x4_t f(int32_t x0, int32_t x1, int32_t x2, int32_t x3)
{
  return (int32x4_t) { x0, x1, x2, x3 };
}

code-gen without patch:
f:
fmovs0, w0
ins v0.s[1], w1
ins v0.s[2], w2
ins v0.s[3], w3
ret

code-gen with patch:
f:
fmovs0, w0
fmovs1, w1
ins v0.s[1], w2
ins v1.s[1], w3
zip1v0.4s, v0.4s, v1.4s
ret

It shows fallback_seq_cost = 28, seq_total_cost = 16

3] aarch64/ldp_stp_16.c's cons2_8_float test fails.
Test case:
void cons2_8_float(float *x, float va

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-04-12 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> On Thu, 6 Apr 2023 at 16:05, Richard Sandiford
>  wrote:
>>
>> Prathamesh Kulkarni  writes:
>> > On Tue, 4 Apr 2023 at 23:35, Richard Sandiford
>> >  wrote:
>> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
>> >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> > index cd9cace3c9b..3de79060619 100644
>> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> > @@ -817,6 +817,62 @@ public:
>> >> >
>> >> >  class svdupq_impl : public quiet
>> >> >  {
>> >> > +private:
>> >> > +  gimple *
>> >> > +  fold_nonconst_dupq (gimple_folder &f, unsigned factor) const
>> >> > +  {
>> >> > +/* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
>> >> > +   tmp = {arg0, arg1, ..., arg}
>> >> > +   lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
>> >> > +
>> >> > +/* TODO: Revisit to handle factor by padding zeros.  */
>> >> > +if (factor > 1)
>> >> > +  return NULL;
>> >>
>> >> Isn't the key thing here predicate vs. vector rather than factor == 1 vs.
>> >> factor != 1?  Do we generate good code for b8, where factor should be 1?
>> > Hi,
>> > It generates the following code for svdup_n_b8:
>> > https://pastebin.com/ypYt590c
>>
>> Hmm, yeah, not pretty :-)  But it's not pretty without either.
>>
>> > I suppose lowering to ctor+vec_perm_expr is not really useful
>> > for this case because it won't simplify ctor, unlike the above case of
>> > svdupq_s32 (x[0], x[1], x[2], x[3]);
>> > However I wonder if it's still a good idea to lower svdupq for predicates, 
>> > for
>> > representing svdupq (or other intrinsics) using GIMPLE constructs as
>> > far as possible ?
>>
>> It's possible, but I think we'd need an example in which its a clear
>> benefit.
> Sorry I posted for wrong test case above.
> For the following test:
> svbool_t f(uint8x16_t x)
> {
>   return svdupq_n_b8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
> x[8], x[9], x[10], x[11], x[12],
> x[13], x[14], x[15]);
> }
>
> Code-gen:
> https://pastebin.com/maexgeJn
>
> I suppose it's equivalent to following ?
>
> svbool_t f2(uint8x16_t x)
> {
>   svuint8_t tmp = svdupq_n_u8 ((bool) x[0], (bool) x[1], (bool) x[2],
> (bool) x[3],
>(bool) x[4], (bool) x[5], (bool) x[6],
> (bool) x[7],
>(bool) x[8], (bool) x[9], (bool) x[10],
> (bool) x[11],
>(bool) x[12], (bool) x[13], (bool)
> x[14], (bool) x[15]);
>   return svcmpne_n_u8 (svptrue_b8 (), tmp, 0);
> }

Yeah, this is essentially the transformation that the svdupq rtl
expander uses.  It would probably be a good idea to do that in
gimple too.

Thanks,
Richard

>
> which generates:
> f2:
> .LFB3901:
> .cfi_startproc
> moviv1.16b, 0x1
> ptrue   p0.b, all
> cmeqv0.16b, v0.16b, #0
> bic v0.16b, v1.16b, v0.16b
> dup z0.q, z0.q[0]
> cmpne   p0.b, p0/z, z0.b, #0
> ret
>
> Thanks,
> Prathamesh


Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-04-06 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 6 Apr 2023 at 16:05, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Tue, 4 Apr 2023 at 23:35, Richard Sandiford
> >  wrote:
> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
> >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> > index cd9cace3c9b..3de79060619 100644
> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> > @@ -817,6 +817,62 @@ public:
> >> >
> >> >  class svdupq_impl : public quiet
> >> >  {
> >> > +private:
> >> > +  gimple *
> >> > +  fold_nonconst_dupq (gimple_folder &f, unsigned factor) const
> >> > +  {
> >> > +/* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
> >> > +   tmp = {arg0, arg1, ..., arg}
> >> > +   lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
> >> > +
> >> > +/* TODO: Revisit to handle factor by padding zeros.  */
> >> > +if (factor > 1)
> >> > +  return NULL;
> >>
> >> Isn't the key thing here predicate vs. vector rather than factor == 1 vs.
> >> factor != 1?  Do we generate good code for b8, where factor should be 1?
> > Hi,
> > It generates the following code for svdup_n_b8:
> > https://pastebin.com/ypYt590c
>
> Hmm, yeah, not pretty :-)  But it's not pretty without either.
>
> > I suppose lowering to ctor+vec_perm_expr is not really useful
> > for this case because it won't simplify ctor, unlike the above case of
> > svdupq_s32 (x[0], x[1], x[2], x[3]);
> > However I wonder if it's still a good idea to lower svdupq for predicates, 
> > for
> > representing svdupq (or other intrinsics) using GIMPLE constructs as
> > far as possible ?
>
> It's possible, but I think we'd need an example in which its a clear
> benefit.
Sorry I posted for wrong test case above.
For the following test:
svbool_t f(uint8x16_t x)
{
  return svdupq_n_b8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
x[8], x[9], x[10], x[11], x[12],
x[13], x[14], x[15]);
}

Code-gen:
https://pastebin.com/maexgeJn

I suppose it's equivalent to following ?

svbool_t f2(uint8x16_t x)
{
  svuint8_t tmp = svdupq_n_u8 ((bool) x[0], (bool) x[1], (bool) x[2],
(bool) x[3],
   (bool) x[4], (bool) x[5], (bool) x[6],
(bool) x[7],
   (bool) x[8], (bool) x[9], (bool) x[10],
(bool) x[11],
   (bool) x[12], (bool) x[13], (bool)
x[14], (bool) x[15]);
  return svcmpne_n_u8 (svptrue_b8 (), tmp, 0);
}

which generates:
f2:
.LFB3901:
.cfi_startproc
moviv1.16b, 0x1
ptrue   p0.b, all
cmeqv0.16b, v0.16b, #0
bic v0.16b, v1.16b, v0.16b
dup z0.q, z0.q[0]
cmpne   p0.b, p0/z, z0.b, #0
ret

Thanks,
Prathamesh
>
> > In the attached patch, it simply punts if the type
> > suffix is b,
> > and doesn't try to fold the call.
>
> Yeah, think that's best for now.
>
> >> > +
> >> > +if (BYTES_BIG_ENDIAN)
> >> > +  return NULL;
> >> > +
> >> > +tree lhs = gimple_call_lhs (f.call);
> >> > +if (TREE_CODE (lhs) != SSA_NAME)
> >> > +  return NULL;
> >>
> >> Why is this check needed?
> > This was a left-over from something else I was doing wrongly. Sorry I
> > forgot to remove it.
> >>
> >> > +tree lhs_type = TREE_TYPE (lhs);
> >> > +tree elt_type = TREE_TYPE (lhs_type);
> >> > +scalar_mode elt_mode = GET_MODE_INNER (TYPE_MODE (elt_type));
> >>
> >> Aren't we already dealing with a scalar type here?  I'd have expected
> >> SCALAR_TYPE_MODE rather than GET_MODE_INNER (TYPE_MODE ...).
> > Ugh, sorry, I had most of the code copied over from svld1rq_impl for
> > building VEC_PERM_EXPR with VLA mask and adjusted it,
> > but overlooked this :/
> >>
> >> > +machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
> >> > +tree vq_type = build_vector_type_for_mode (elt_type, vq_mode);
> >> > +
> >> > +unsigned nargs = gimple_call_num_args (f.call);
> >> > +vec *v;
> >> > +vec_alloc (v, nargs);
> >> > +for (unsigned i = 0; i < nargs; i++)
> >> > +  CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_call_arg (f.call, 
> >> > i));
> >> > +tree vec = build_constructor (vq_type, v);
> >> > +
> >> > +tree access_type
> >> > +  = build_aligned_type (vq_type, TYPE_ALIGN (elt_type));
> >>
> >> Nit: seems to fit on one line.  But do we need this?  We're not accessing
> >> memory, so I'd have expected vq_type to be OK as-is.
> >>
> >> > +tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> >> > +gimple *g = gimple_build_assign (tmp, vec);
> >> > +
> >> > +gimple_seq stmts = NULL;
> >> > +gimple_seq_add_stmt_without_update (&stmts, g);
> >> > +
> >> > +int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant 
> >> > ();
> >>
> >> Looks like we should be able to use nargs instead of source_nelts.
> > Does the attached patch look OK ?
> >
> > Thanks,
> > Prathamesh
> >>
> >
> >> Thanks,
> >> Richard
> >>
> >> > +poly_ui

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-04-06 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> On Tue, 4 Apr 2023 at 23:35, Richard Sandiford
>  wrote:
>> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
>> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > index cd9cace3c9b..3de79060619 100644
>> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > @@ -817,6 +817,62 @@ public:
>> >
>> >  class svdupq_impl : public quiet
>> >  {
>> > +private:
>> > +  gimple *
>> > +  fold_nonconst_dupq (gimple_folder &f, unsigned factor) const
>> > +  {
>> > +/* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
>> > +   tmp = {arg0, arg1, ..., arg}
>> > +   lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
>> > +
>> > +/* TODO: Revisit to handle factor by padding zeros.  */
>> > +if (factor > 1)
>> > +  return NULL;
>>
>> Isn't the key thing here predicate vs. vector rather than factor == 1 vs.
>> factor != 1?  Do we generate good code for b8, where factor should be 1?
> Hi,
> It generates the following code for svdup_n_b8:
> https://pastebin.com/ypYt590c

Hmm, yeah, not pretty :-)  But it's not pretty without either.

> I suppose lowering to ctor+vec_perm_expr is not really useful
> for this case because it won't simplify ctor, unlike the above case of
> svdupq_s32 (x[0], x[1], x[2], x[3]);
> However I wonder if it's still a good idea to lower svdupq for predicates, for
> representing svdupq (or other intrinsics) using GIMPLE constructs as
> far as possible ?

It's possible, but I think we'd need an example in which it's a clear
benefit.

> In the attached patch, it simply punts if the type
> suffix is b,
> and doesn't try to fold the call.

Yeah, think that's best for now.

>> > +
>> > +if (BYTES_BIG_ENDIAN)
>> > +  return NULL;
>> > +
>> > +tree lhs = gimple_call_lhs (f.call);
>> > +if (TREE_CODE (lhs) != SSA_NAME)
>> > +  return NULL;
>>
>> Why is this check needed?
> This was a left-over from something else I was doing wrongly. Sorry I
> forgot to remove it.
>>
>> > +tree lhs_type = TREE_TYPE (lhs);
>> > +tree elt_type = TREE_TYPE (lhs_type);
>> > +scalar_mode elt_mode = GET_MODE_INNER (TYPE_MODE (elt_type));
>>
>> Aren't we already dealing with a scalar type here?  I'd have expected
>> SCALAR_TYPE_MODE rather than GET_MODE_INNER (TYPE_MODE ...).
> Ugh, sorry, I had most of the code copied over from svld1rq_impl for
> building VEC_PERM_EXPR with VLA mask and adjusted it,
> but overlooked this :/
>>
>> > +machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
>> > +tree vq_type = build_vector_type_for_mode (elt_type, vq_mode);
>> > +
>> > +unsigned nargs = gimple_call_num_args (f.call);
>> > +vec *v;
>> > +vec_alloc (v, nargs);
>> > +for (unsigned i = 0; i < nargs; i++)
>> > +  CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_call_arg (f.call, i));
>> > +tree vec = build_constructor (vq_type, v);
>> > +
>> > +tree access_type
>> > +  = build_aligned_type (vq_type, TYPE_ALIGN (elt_type));
>>
>> Nit: seems to fit on one line.  But do we need this?  We're not accessing
>> memory, so I'd have expected vq_type to be OK as-is.
>>
>> > +tree tmp = make_ssa_name_fn (cfun, access_type, 0);
>> > +gimple *g = gimple_build_assign (tmp, vec);
>> > +
>> > +gimple_seq stmts = NULL;
>> > +gimple_seq_add_stmt_without_update (&stmts, g);
>> > +
>> > +int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
>>
>> Looks like we should be able to use nargs instead of source_nelts.
> Does the attached patch look OK ?
>
> Thanks,
> Prathamesh
>>
>
>> Thanks,
>> Richard
>>
>> > +poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
>> > +vec_perm_builder sel (lhs_len, source_nelts, 1);
>> > +for (int i = 0; i < source_nelts; i++)
>> > +  sel.quick_push (i);
>> > +
>> > +vec_perm_indices indices (sel, 1, source_nelts);
>> > +tree mask_type = build_vector_type (ssizetype, lhs_len);
>> > +tree mask = vec_perm_indices_to_tree (mask_type, indices);
>> > +
>> > +gimple *g2 = gimple_build_assign (lhs, VEC_PERM_EXPR, tmp, tmp, mask);
>> > +gimple_seq_add_stmt_without_update (&stmts, g2);
>> > +gsi_replace_with_seq (f.gsi, stmts, false);
>> > +return g2;
>> > +  }
>> > +
>> >  public:
>> >gimple *
>> >fold (gimple_folder &f) const override
>> > @@ -832,7 +888,7 @@ public:
>> >{
>> >   tree elt = gimple_call_arg (f.call, i);
>> >   if (!CONSTANT_CLASS_P (elt))
>> > -   return NULL;
>> > +   return fold_nonconst_dupq (f, factor);
>> >   builder.quick_push (elt);
>> >   for (unsigned int j = 1; j < factor; ++j)
>> > builder.quick_push (build_zero_cst (TREE_TYPE (vec_type)));
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c 
>> > b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
>> > new file mode 100644
>> > index 000..f19f8deb1e5
>> > --- /dev/null
>> > ++

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-04-06 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 4 Apr 2023 at 23:35, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Mon, 13 Mar 2023 at 13:03, Richard Biener  wrote:
> >> On GIMPLE it would be
> >>
> >>  _1 = { a, ... }; // (a)
> >>  _2 = { _1, ... }; // (b)
> >>
> >> but I'm not sure if (b), a VL CTOR of fixed len(?) sub-vectors is
> >> possible?  But at least a CTOR of vectors is what we use to
> >> concat vectors.
> >>
> >> With the recent relaxing of VEC_PERM inputs it's also possible to
> >> express (b) with a VEC_PERM:
> >>
> >>  _2 = VEC_PERM <_1, _1, { 0, 1, 2, 3, 0, 1, 2, 3, ... }>
> >>
> >> but again I'm not sure if that repeating 0, 1, 2, 3 is expressible
> >> for VL vectors (maybe we'd allow "wrapping" here, I'm not sure).
> >>
> > Hi,
> > Thanks for the suggestions and sorry for late response in turn.
> > The attached patch tries to fix the issue by explicitly constructing a CTOR
> > from svdupq's arguments and then using VEC_PERM_EXPR with VL mask
> > having encoded elements {0, 1, ... nargs-1},
> > npatterns == nargs, and nelts_per_pattern == 1, to replicate the base 
> > vector.
> >
> > So for example, for the above case,
> > svint32_t f_32(int32x4_t x)
> > {
> >   return svdupq_s32 (x[0], x[1], x[2], x[3]);
> > }
> >
> > forwprop1 lowers it to:
> >   svint32_t _6;
> >   vector(4) int _8;
> >   :
> >   _1 = BIT_FIELD_REF ;
> >   _2 = BIT_FIELD_REF ;
> >   _3 = BIT_FIELD_REF ;
> >   _4 = BIT_FIELD_REF ;
> >   _8 = {_1, _2, _3, _4};
> >   _6 = VEC_PERM_EXPR <_8, _8, { 0, 1, 2, 3, ... }>;
> >   return _6;
> >
> > which is then eventually optimized to:
> >   svint32_t _6;
> >[local count: 1073741824]:
> >   _6 = VEC_PERM_EXPR ;
> >   return _6;
> >
> > code-gen:
> > f_32:
> > dup z0.q, z0.q[0]
> > ret
>
> Nice!
>
> > Does it look OK ?
> >
> > Thanks,
> > Prathamesh
> >> Richard.
> >>
> >> > We're planning to implement the ACLE's Neon-SVE bridge:
> >> > https://github.com/ARM-software/acle/blob/main/main/acle.md#neon-sve-bridge
> >> > and so we'll need (b) to implement the svdup_neonq functions.
> >> >
> >> > Thanks,
> >> > Richard
> >> >
> >>
> >> --
> >> Richard Biener 
> >> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
> >> Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
> >> HRB 36809 (AG Nuernberg)
> >
> > [SVE] Fold svld1rq to VEC_PERM_EXPR if elements are not constant.
> >
> > gcc/ChangeLog:
> >   * config/aarch64/aarch64-sve-builtins-base.cc
> >   (svdupq_impl::fold_nonconst_dupq): New method.
> >   (svdupq_impl::fold): Call fold_nonconst_dupq.
> >
> > gcc/testsuite/ChangeLog:
> >   * gcc.target/aarch64/sve/acle/general/dupq_11.c: New test.
> >
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > index cd9cace3c9b..3de79060619 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > @@ -817,6 +817,62 @@ public:
> >
> >  class svdupq_impl : public quiet
> >  {
> > +private:
> > +  gimple *
> > +  fold_nonconst_dupq (gimple_folder &f, unsigned factor) const
> > +  {
> > +/* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
> > +   tmp = {arg0, arg1, ..., arg}
> > +   lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
> > +
> > +/* TODO: Revisit to handle factor by padding zeros.  */
> > +if (factor > 1)
> > +  return NULL;
>
> Isn't the key thing here predicate vs. vector rather than factor == 1 vs.
> factor != 1?  Do we generate good code for b8, where factor should be 1?
Hi,
It generates the following code for svdup_n_b8:
https://pastebin.com/ypYt590c

I suppose lowering to ctor+vec_perm_expr is not really useful
for this case because it won't simplify ctor, unlike the above case of
svdupq_s32 (x[0], x[1], x[2], x[3]);
However I wonder if it's still a good idea to lower svdupq for predicates, for
representing svdupq (or other intrinsics) using GIMPLE constructs as
far as possible ? In the attached patch, it simply punts if the type
suffix is b,
and doesn't try to fold the call.
>
> > +
> > +if (BYTES_BIG_ENDIAN)
> > +  return NULL;
> > +
> > +tree lhs = gimple_call_lhs (f.call);
> > +if (TREE_CODE (lhs) != SSA_NAME)
> > +  return NULL;
>
> Why is this check needed?
This was a left-over from something else I was doing wrongly. Sorry I
forgot to remove it.
>
> > +tree lhs_type = TREE_TYPE (lhs);
> > +tree elt_type = TREE_TYPE (lhs_type);
> > +scalar_mode elt_mode = GET_MODE_INNER (TYPE_MODE (elt_type));
>
> Aren't we already dealing with a scalar type here?  I'd have expected
> SCALAR_TYPE_MODE rather than GET_MODE_INNER (TYPE_MODE ...).
Ugh, sorry, I had most of the code copied over from svld1rq_impl for
building VEC_PERM_EXPR with VLA mask and adjusted it,
but overlooked this :/
>
> > +machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
> > +tree vq_type = build_vector_type_for_mode (elt_type, vq_mode)

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-04-04 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> On Mon, 13 Mar 2023 at 13:03, Richard Biener  wrote:
>> On GIMPLE it would be
>>
>>  _1 = { a, ... }; // (a)
>>  _2 = { _1, ... }; // (b)
>>
>> but I'm not sure if (b), a VL CTOR of fixed len(?) sub-vectors is
>> possible?  But at least a CTOR of vectors is what we use to
>> concat vectors.
>>
>> With the recent relaxing of VEC_PERM inputs it's also possible to
>> express (b) with a VEC_PERM:
>>
>>  _2 = VEC_PERM <_1, _1, { 0, 1, 2, 3, 0, 1, 2, 3, ... }>
>>
>> but again I'm not sure if that repeating 0, 1, 2, 3 is expressible
>> for VL vectors (maybe we'd allow "wrapping" here, I'm not sure).
>>
> Hi,
> Thanks for the suggestions and sorry for late response in turn.
> The attached patch tries to fix the issue by explicitly constructing a CTOR
> from svdupq's arguments and then using VEC_PERM_EXPR with VL mask
> having encoded elements {0, 1, ... nargs-1},
> npatterns == nargs, and nelts_per_pattern == 1, to replicate the base vector.
>
> So for example, for the above case,
> svint32_t f_32(int32x4_t x)
> {
>   return svdupq_s32 (x[0], x[1], x[2], x[3]);
> }
>
> forwprop1 lowers it to:
>   svint32_t _6;
>   vector(4) int _8;
>   :
>   _1 = BIT_FIELD_REF ;
>   _2 = BIT_FIELD_REF ;
>   _3 = BIT_FIELD_REF ;
>   _4 = BIT_FIELD_REF ;
>   _8 = {_1, _2, _3, _4};
>   _6 = VEC_PERM_EXPR <_8, _8, { 0, 1, 2, 3, ... }>;
>   return _6;
>
> which is then eventually optimized to:
>   svint32_t _6;
>[local count: 1073741824]:
>   _6 = VEC_PERM_EXPR ;
>   return _6;
>
> code-gen:
> f_32:
> dup z0.q, z0.q[0]
> ret

Nice!

> Does it look OK ?
>
> Thanks,
> Prathamesh
>> Richard.
>>
>> > We're planning to implement the ACLE's Neon-SVE bridge:
>> > https://github.com/ARM-software/acle/blob/main/main/acle.md#neon-sve-bridge
>> > and so we'll need (b) to implement the svdup_neonq functions.
>> >
>> > Thanks,
>> > Richard
>> >
>>
>> --
>> Richard Biener 
>> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
>> Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
>> HRB 36809 (AG Nuernberg)
>
> [SVE] Fold svld1rq to VEC_PERM_EXPR if elements are not constant.
>
> gcc/ChangeLog:
>   * config/aarch64/aarch64-sve-builtins-base.cc
>   (svdupq_impl::fold_nonconst_dupq): New method.
>   (svdupq_impl::fold): Call fold_nonconst_dupq.
>
> gcc/testsuite/ChangeLog:
>   * gcc.target/aarch64/sve/acle/general/dupq_11.c: New test.
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
> b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index cd9cace3c9b..3de79060619 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -817,6 +817,62 @@ public:
>  
>  class svdupq_impl : public quiet
>  {
> +private:
> +  gimple *
> +  fold_nonconst_dupq (gimple_folder &f, unsigned factor) const
> +  {
> +/* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
> +   tmp = {arg0, arg1, ..., arg}
> +   lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
> +
> +/* TODO: Revisit to handle factor by padding zeros.  */
> +if (factor > 1)
> +  return NULL;

Isn't the key thing here predicate vs. vector rather than factor == 1 vs.
factor != 1?  Do we generate good code for b8, where factor should be 1?

> +
> +if (BYTES_BIG_ENDIAN)
> +  return NULL;
> +
> +tree lhs = gimple_call_lhs (f.call);
> +if (TREE_CODE (lhs) != SSA_NAME)
> +  return NULL;

Why is this check needed?

> +tree lhs_type = TREE_TYPE (lhs);
> +tree elt_type = TREE_TYPE (lhs_type);
> +scalar_mode elt_mode = GET_MODE_INNER (TYPE_MODE (elt_type));

Aren't we already dealing with a scalar type here?  I'd have expected
SCALAR_TYPE_MODE rather than GET_MODE_INNER (TYPE_MODE ...).

> +machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
> +tree vq_type = build_vector_type_for_mode (elt_type, vq_mode);
> +
> +unsigned nargs = gimple_call_num_args (f.call);
> +vec *v;
> +vec_alloc (v, nargs);
> +for (unsigned i = 0; i < nargs; i++)
> +  CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_call_arg (f.call, i));
> +tree vec = build_constructor (vq_type, v);
> +
> +tree access_type
> +  = build_aligned_type (vq_type, TYPE_ALIGN (elt_type));

Nit: seems to fit on one line.  But do we need this?  We're not accessing
memory, so I'd have expected vq_type to be OK as-is.

> +tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> +gimple *g = gimple_build_assign (tmp, vec);
> +
> +gimple_seq stmts = NULL;
> +gimple_seq_add_stmt_without_update (&stmts, g);
> +
> +int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();

Looks like we should be able to use nargs instead of source_nelts.

Thanks,
Richard

> +poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
> +vec_perm_builder sel (lhs_len, source_nelts, 1);
> +for (int i = 0; i < source_nelts; i++)
> +  sel.quick_push (i);
> +
> + 

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-04-03 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 13 Mar 2023 at 13:03, Richard Biener  wrote:
>
> On Fri, 10 Mar 2023, Richard Sandiford wrote:
>
> > Sorry for the slow reply.
> >
> > Prathamesh Kulkarni  writes:
> > > Unfortunately it regresses code-gen for the following case:
> > >
> > > svint32_t f(int32x4_t x)
> > > {
> > >   return svdupq_s32 (x[0], x[1], x[2], x[3]);
> > > }
> > >
> > > -O2 code-gen with trunk:
> > > f:
> > > dup z0.q, z0.q[0]
> > > ret
> > >
> > > -O2 code-gen with patch:
> > > f:
> > > dup s1, v0.s[1]
> > > movv2.8b, v0.8b
> > > ins v1.s[1], v0.s[3]
> > > ins v2.s[1], v0.s[2]
> > > zip1v0.4s, v2.4s, v1.4s
> > > dup z0.q, z0.q[0]
> > > ret
> > >
> > > IIUC, svdupq_impl::expand uses aarch64_expand_vector_init
> > > to initialize the "base 128-bit vector" and then use dupq to replicate it.
> > >
> > > Without patch, aarch64_expand_vector_init generates fallback code, and 
> > > then
> > > combine optimizes a sequence of vec_merge/vec_select pairs into an 
> > > assignment:
> > >
> > > (insn 7 3 8 2 (set (reg:SI 99)
> > > (vec_select:SI (reg/v:V4SI 97 [ x ])
> > > (parallel [
> > > (const_int 1 [0x1])
> > > ]))) "bar.c":6:10 2592 {aarch64_get_lanev4si}
> > >  (nil))
> > >
> > > (insn 13 9 15 2 (set (reg:V4SI 102)
> > > (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI 99))
> > > (reg/v:V4SI 97 [ x ])
> > > (const_int 2 [0x2]))) "bar.c":6:10 1794 
> > > {aarch64_simd_vec_setv4si}
> > >  (expr_list:REG_DEAD (reg:SI 99)
> > > (expr_list:REG_DEAD (reg/v:V4SI 97 [ x ])
> > > (nil
> > >
> > > into:
> > > Trying 7 -> 13:
> > > 7: r99:SI=vec_select(r97:V4SI,parallel)
> > >13: r102:V4SI=vec_merge(vec_duplicate(r99:SI),r97:V4SI,0x2)
> > >   REG_DEAD r99:SI
> > >   REG_DEAD r97:V4SI
> > > Successfully matched this instruction:
> > > (set (reg:V4SI 102)
> > > (reg/v:V4SI 97 [ x ]))
> > >
> > > which eventually results into:
> > > (note 2 25 3 2 NOTE_INSN_DELETED)
> > > (note 3 2 7 2 NOTE_INSN_FUNCTION_BEG)
> > > (note 7 3 8 2 NOTE_INSN_DELETED)
> > > (note 8 7 9 2 NOTE_INSN_DELETED)
> > > (note 9 8 13 2 NOTE_INSN_DELETED)
> > > (note 13 9 15 2 NOTE_INSN_DELETED)
> > > (note 15 13 17 2 NOTE_INSN_DELETED)
> > > (note 17 15 18 2 NOTE_INSN_DELETED)
> > > (note 18 17 22 2 NOTE_INSN_DELETED)
> > > (insn 22 18 23 2 (parallel [
> > > (set (reg/i:VNx4SI 32 v0)
> > > (vec_duplicate:VNx4SI (reg:V4SI 108)))
> > > (clobber (scratch:VNx16BI))
> > > ]) "bar.c":7:1 5202 {aarch64_vec_duplicate_vqvnx4si_le}
> > >  (expr_list:REG_DEAD (reg:V4SI 108)
> > > (nil)))
> > > (insn 23 22 0 2 (use (reg/i:VNx4SI 32 v0)) "bar.c":7:1 -1
> > >  (nil))
> > >
> > > I was wondering if we should add the above special case, of assigning
> > > target = vec in aarch64_expand_vector_init, if initializer is {
> > > vec[0], vec[1], ... } ?
> >
> > I'm not sure it will be easy to detect that.  Won't the inputs to
> > aarch64_expand_vector_init just be plain registers?  It's not a
> > good idea in general to search for definitions of registers
> > during expansion.
> >
> > It would be nice to fix this by lowering svdupq into:
> >
> > (a) a constructor for a 128-bit vector
> > (b) a duplication of the 128-bit vector to fill an SVE vector
> >
> > But I'm not sure what the best way of doing (b) would be.
> > In RTL we can use vec_duplicate, but I don't think gimple
> > has an equivalent construct.  Maybe Richi has some ideas.
>
> On GIMPLE it would be
>
>  _1 = { a, ... }; // (a)
>  _2 = { _1, ... }; // (b)
>
> but I'm not sure if (b), a VL CTOR of fixed len(?) sub-vectors is
> possible?  But at least a CTOR of vectors is what we use to
> concat vectors.
>
> With the recent relaxing of VEC_PERM inputs it's also possible to
> express (b) with a VEC_PERM:
>
>  _2 = VEC_PERM <_1, _1, { 0, 1, 2, 3, 0, 1, 2, 3, ... }>
>
> but again I'm not sure if that repeating 0, 1, 2, 3 is expressible
> for VL vectors (maybe we'd allow "wrapping" here, I'm not sure).
>
Hi,
Thanks for the suggestions and sorry for late response in turn.
The attached patch tries to fix the issue by explicitly constructing a CTOR
from svdupq's arguments and then using VEC_PERM_EXPR with VL mask
having encoded elements {0, 1, ... nargs-1},
npatterns == nargs, and nelts_per_pattern == 1, to replicate the base vector.

So for example, for the above case,
svint32_t f_32(int32x4_t x)
{
  return svdupq_s32 (x[0], x[1], x[2], x[3]);
}

forwprop1 lowers it to:
  svint32_t _6;
  vector(4) int _8;
  :
  _1 = BIT_FIELD_REF <x, 32, 0>;
  _2 = BIT_FIELD_REF <x, 32, 32>;
  _3 = BIT_FIELD_REF <x, 32, 64>;
  _4 = BIT_FIELD_REF <x, 32, 96>;
  _8 = {_1, _2, _3, _4};
  _6 = VEC_PERM_EXPR <_8, _8, { 0, 1, 2, 3, ... }>;
  return _6;

which is then eventually optimized to:
  svint32_t _6;
  <bb 2> [local count: 1073741824]:
  _6 = VEC_PERM_EXPR <x, x, { 0, 1, 2, 3, ... }>;
  return _6;

code-gen:
f_32:
        dup     z0.q, z0.q[0]

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-03-13 Thread Richard Biener via Gcc-patches
On Fri, 10 Mar 2023, Richard Sandiford wrote:

> Sorry for the slow reply.
> 
> Prathamesh Kulkarni  writes:
> > Unfortunately it regresses code-gen for the following case:
> >
> > svint32_t f(int32x4_t x)
> > {
> >   return svdupq_s32 (x[0], x[1], x[2], x[3]);
> > }
> >
> > -O2 code-gen with trunk:
> > f:
> > dup z0.q, z0.q[0]
> > ret
> >
> > -O2 code-gen with patch:
> > f:
> > dup s1, v0.s[1]
> > movv2.8b, v0.8b
> > ins v1.s[1], v0.s[3]
> > ins v2.s[1], v0.s[2]
> > zip1v0.4s, v2.4s, v1.4s
> > dup z0.q, z0.q[0]
> > ret
> >
> > IIUC, svdupq_impl::expand uses aarch64_expand_vector_init
> > to initialize the "base 128-bit vector" and then use dupq to replicate it.
> >
> > Without patch, aarch64_expand_vector_init generates fallback code, and then
> > combine optimizes a sequence of vec_merge/vec_select pairs into an 
> > assignment:
> >
> > (insn 7 3 8 2 (set (reg:SI 99)
> > (vec_select:SI (reg/v:V4SI 97 [ x ])
> > (parallel [
> > (const_int 1 [0x1])
> > ]))) "bar.c":6:10 2592 {aarch64_get_lanev4si}
> >  (nil))
> >
> > (insn 13 9 15 2 (set (reg:V4SI 102)
> > (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI 99))
> > (reg/v:V4SI 97 [ x ])
> > (const_int 2 [0x2]))) "bar.c":6:10 1794 
> > {aarch64_simd_vec_setv4si}
> >  (expr_list:REG_DEAD (reg:SI 99)
> > (expr_list:REG_DEAD (reg/v:V4SI 97 [ x ])
> > (nil
> >
> > into:
> > Trying 7 -> 13:
> > 7: r99:SI=vec_select(r97:V4SI,parallel)
> >13: r102:V4SI=vec_merge(vec_duplicate(r99:SI),r97:V4SI,0x2)
> >   REG_DEAD r99:SI
> >   REG_DEAD r97:V4SI
> > Successfully matched this instruction:
> > (set (reg:V4SI 102)
> > (reg/v:V4SI 97 [ x ]))
> >
> > which eventually results into:
> > (note 2 25 3 2 NOTE_INSN_DELETED)
> > (note 3 2 7 2 NOTE_INSN_FUNCTION_BEG)
> > (note 7 3 8 2 NOTE_INSN_DELETED)
> > (note 8 7 9 2 NOTE_INSN_DELETED)
> > (note 9 8 13 2 NOTE_INSN_DELETED)
> > (note 13 9 15 2 NOTE_INSN_DELETED)
> > (note 15 13 17 2 NOTE_INSN_DELETED)
> > (note 17 15 18 2 NOTE_INSN_DELETED)
> > (note 18 17 22 2 NOTE_INSN_DELETED)
> > (insn 22 18 23 2 (parallel [
> > (set (reg/i:VNx4SI 32 v0)
> > (vec_duplicate:VNx4SI (reg:V4SI 108)))
> > (clobber (scratch:VNx16BI))
> > ]) "bar.c":7:1 5202 {aarch64_vec_duplicate_vqvnx4si_le}
> >  (expr_list:REG_DEAD (reg:V4SI 108)
> > (nil)))
> > (insn 23 22 0 2 (use (reg/i:VNx4SI 32 v0)) "bar.c":7:1 -1
> >  (nil))
> >
> > I was wondering if we should add the above special case, of assigning
> > target = vec in aarch64_expand_vector_init, if initializer is {
> > vec[0], vec[1], ... } ?
> 
> I'm not sure it will be easy to detect that.  Won't the inputs to
> aarch64_expand_vector_init just be plain registers?  It's not a
> good idea in general to search for definitions of registers
> during expansion.
> 
> It would be nice to fix this by lowering svdupq into:
> 
> (a) a constructor for a 128-bit vector
> (b) a duplication of the 128-bit vector to fill an SVE vector
> 
> But I'm not sure what the best way of doing (b) would be.
> In RTL we can use vec_duplicate, but I don't think gimple
> has an equivalent construct.  Maybe Richi has some ideas.

On GIMPLE it would be

 _1 = { a, ... }; // (a)
 _2 = { _1, ... }; // (b)

but I'm not sure if (b), a VL CTOR of fixed len(?) sub-vectors is
possible?  But at least a CTOR of vectors is what we use to
concat vectors.

With the recent relaxing of VEC_PERM inputs it's also possible to
express (b) with a VEC_PERM:

 _2 = VEC_PERM <_1, _1, { 0, 1, 2, 3, 0, 1, 2, 3, ... }>

but again I'm not sure if that repeating 0, 1, 2, 3 is expressible
for VL vectors (maybe we'd allow "wrapping" here, I'm not sure).

Richard.

> We're planning to implement the ACLE's Neon-SVE bridge:
> https://github.com/ARM-software/acle/blob/main/main/acle.md#neon-sve-bridge
> and so we'll need (b) to implement the svdup_neonq functions.
> 
> Thanks,
> Richard
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)


Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-03-10 Thread Richard Sandiford via Gcc-patches
Sorry for the slow reply.

Prathamesh Kulkarni  writes:
> Unfortunately it regresses code-gen for the following case:
>
> svint32_t f(int32x4_t x)
> {
>   return svdupq_s32 (x[0], x[1], x[2], x[3]);
> }
>
> -O2 code-gen with trunk:
> f:
> dup z0.q, z0.q[0]
> ret
>
> -O2 code-gen with patch:
> f:
> dup s1, v0.s[1]
> movv2.8b, v0.8b
> ins v1.s[1], v0.s[3]
> ins v2.s[1], v0.s[2]
> zip1v0.4s, v2.4s, v1.4s
> dup z0.q, z0.q[0]
> ret
>
> IIUC, svdupq_impl::expand uses aarch64_expand_vector_init
> to initialize the "base 128-bit vector" and then use dupq to replicate it.
>
> Without patch, aarch64_expand_vector_init generates fallback code, and then
> combine optimizes a sequence of vec_merge/vec_select pairs into an assignment:
>
> (insn 7 3 8 2 (set (reg:SI 99)
> (vec_select:SI (reg/v:V4SI 97 [ x ])
> (parallel [
> (const_int 1 [0x1])
> ]))) "bar.c":6:10 2592 {aarch64_get_lanev4si}
>  (nil))
>
> (insn 13 9 15 2 (set (reg:V4SI 102)
> (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI 99))
> (reg/v:V4SI 97 [ x ])
> (const_int 2 [0x2]))) "bar.c":6:10 1794 {aarch64_simd_vec_setv4si}
>  (expr_list:REG_DEAD (reg:SI 99)
> (expr_list:REG_DEAD (reg/v:V4SI 97 [ x ])
> (nil
>
> into:
> Trying 7 -> 13:
> 7: r99:SI=vec_select(r97:V4SI,parallel)
>13: r102:V4SI=vec_merge(vec_duplicate(r99:SI),r97:V4SI,0x2)
>   REG_DEAD r99:SI
>   REG_DEAD r97:V4SI
> Successfully matched this instruction:
> (set (reg:V4SI 102)
> (reg/v:V4SI 97 [ x ]))
>
> which eventually results into:
> (note 2 25 3 2 NOTE_INSN_DELETED)
> (note 3 2 7 2 NOTE_INSN_FUNCTION_BEG)
> (note 7 3 8 2 NOTE_INSN_DELETED)
> (note 8 7 9 2 NOTE_INSN_DELETED)
> (note 9 8 13 2 NOTE_INSN_DELETED)
> (note 13 9 15 2 NOTE_INSN_DELETED)
> (note 15 13 17 2 NOTE_INSN_DELETED)
> (note 17 15 18 2 NOTE_INSN_DELETED)
> (note 18 17 22 2 NOTE_INSN_DELETED)
> (insn 22 18 23 2 (parallel [
> (set (reg/i:VNx4SI 32 v0)
> (vec_duplicate:VNx4SI (reg:V4SI 108)))
> (clobber (scratch:VNx16BI))
> ]) "bar.c":7:1 5202 {aarch64_vec_duplicate_vqvnx4si_le}
>  (expr_list:REG_DEAD (reg:V4SI 108)
> (nil)))
> (insn 23 22 0 2 (use (reg/i:VNx4SI 32 v0)) "bar.c":7:1 -1
>  (nil))
>
> I was wondering if we should add the above special case, of assigning
> target = vec in aarch64_expand_vector_init, if initializer is {
> vec[0], vec[1], ... } ?

I'm not sure it will be easy to detect that.  Won't the inputs to
aarch64_expand_vector_init just be plain registers?  It's not a
good idea in general to search for definitions of registers
during expansion.

It would be nice to fix this by lowering svdupq into:

(a) a constructor for a 128-bit vector
(b) a duplication of the 128-bit vector to fill an SVE vector

But I'm not sure what the best way of doing (b) would be.
In RTL we can use vec_duplicate, but I don't think gimple
has an equivalent construct.  Maybe Richi has some ideas.

We're planning to implement the ACLE's Neon-SVE bridge:
https://github.com/ARM-software/acle/blob/main/main/acle.md#neon-sve-bridge
and so we'll need (b) to implement the svdup_neonq functions.

Thanks,
Richard


Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-02-11 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 6 Feb 2023 at 17:43, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Fri, 3 Feb 2023 at 20:47, Richard Sandiford
> >  wrote:
> >>
> >> Prathamesh Kulkarni  writes:
> >> > On Fri, 3 Feb 2023 at 07:10, Prathamesh Kulkarni
> >> >  wrote:
> >> >>
> >> >> On Thu, 2 Feb 2023 at 20:50, Richard Sandiford
> >> >>  wrote:
> >> >> >
> >> >> > Prathamesh Kulkarni  writes:
> >> >> > >> >> > I have attached a patch that extends the transform if one 
> >> >> > >> >> > half is dup
> >> >> > >> >> > and other is set of constants.
> >> >> > >> >> > For eg:
> >> >> > >> >> > int8x16_t f(int8_t x)
> >> >> > >> >> > {
> >> >> > >> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 
> >> >> > >> >> > 7, x, 8 };
> >> >> > >> >> > }
> >> >> > >> >> >
> >> >> > >> >> > code-gen trunk:
> >> >> > >> >> > f:
> >> >> > >> >> > adrpx1, .LC0
> >> >> > >> >> > ldr q0, [x1, #:lo12:.LC0]
> >> >> > >> >> > ins v0.b[0], w0
> >> >> > >> >> > ins v0.b[2], w0
> >> >> > >> >> > ins v0.b[4], w0
> >> >> > >> >> > ins v0.b[6], w0
> >> >> > >> >> > ins v0.b[8], w0
> >> >> > >> >> > ins v0.b[10], w0
> >> >> > >> >> > ins v0.b[12], w0
> >> >> > >> >> > ins v0.b[14], w0
> >> >> > >> >> > ret
> >> >> > >> >> >
> >> >> > >> >> > code-gen with patch:
> >> >> > >> >> > f:
> >> >> > >> >> > dup v0.16b, w0
> >> >> > >> >> > adrpx0, .LC0
> >> >> > >> >> > ldr q1, [x0, #:lo12:.LC0]
> >> >> > >> >> > zip1v0.16b, v0.16b, v1.16b
> >> >> > >> >> > ret
> >> >> > >> >> >
> >> >> > >> >> > Bootstrapped+tested on aarch64-linux-gnu.
> >> >> > >> >> > Does it look OK ?
> >> >> > >> >>
> >> >> > >> >> Looks like a nice improvement.  It'll need to wait for GCC 14 
> >> >> > >> >> now though.
> >> >> > >> >>
> >> >> > >> >> However, rather than handle this case specially, I think we 
> >> >> > >> >> should instead
> >> >> > >> >> take a divide-and-conquer approach: split the initialiser into 
> >> >> > >> >> even and
> >> >> > >> >> odd elements, find the best way of loading each part, then 
> >> >> > >> >> compare the
> >> >> > >> >> cost of these sequences + ZIP with the cost of the fallback 
> >> >> > >> >> code (the code
> >> >> > >> >> later in aarch64_expand_vector_init).
> >> >> > >> >>
> >> >> > >> >> For example, doing that would allow:
> >> >> > >> >>
> >> >> > >> >>   { x, y, 0, y, 0, y, 0, y, 0, y }
> >> >> > >> >>
> >> >> > >> >> to be loaded more easily, even though the even elements aren't 
> >> >> > >> >> wholly
> >> >> > >> >> constant.
> >> >> > >> > Hi Richard,
> >> >> > >> > I have attached a prototype patch based on the above approach.
> >> >> > >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case 
> >> >> > >> > by generating
> >> >> > >> > same sequence, thus I removed that hunk, and improves the 
> >> >> > >> > following cases:
> >> >> > >> >
> >> >> > >> > (a)
> >> >> > >> > int8x16_t f_s16(int8_t x)
> >> >> > >> > {
> >> >> > >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> >> >> > >> >  x, 5, x, 6, x, 7, x, 8 };
> >> >> > >> > }
> >> >> > >> >
> >> >> > >> > code-gen trunk:
> >> >> > >> > f_s16:
> >> >> > >> > adrpx1, .LC0
> >> >> > >> > ldr q0, [x1, #:lo12:.LC0]
> >> >> > >> > ins v0.b[0], w0
> >> >> > >> > ins v0.b[2], w0
> >> >> > >> > ins v0.b[4], w0
> >> >> > >> > ins v0.b[6], w0
> >> >> > >> > ins v0.b[8], w0
> >> >> > >> > ins v0.b[10], w0
> >> >> > >> > ins v0.b[12], w0
> >> >> > >> > ins v0.b[14], w0
> >> >> > >> > ret
> >> >> > >> >
> >> >> > >> > code-gen with patch:
> >> >> > >> > f_s16:
> >> >> > >> > dup v0.16b, w0
> >> >> > >> > adrpx0, .LC0
> >> >> > >> > ldr q1, [x0, #:lo12:.LC0]
> >> >> > >> > zip1v0.16b, v0.16b, v1.16b
> >> >> > >> > ret
> >> >> > >> >
> >> >> > >> > (b)
> >> >> > >> > int8x16_t f_s16(int8_t x, int8_t y)
> >> >> > >> > {
> >> >> > >> >   return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
> >> >> > >> > 4, y, 5, y, 6, y, 7, y };
> >> >> > >> > }
> >> >> > >> >
> >> >> > >> > code-gen trunk:
> >> >> > >> > f_s16:
> >> >> > >> > adrpx2, .LC0
> >> >> > >> > ldr q0, [x2, #:lo12:.LC0]
> >> >> > >> > ins v0.b[0], w0
> >> >> > >> > ins v0.b[1], w1
> >> >> > >> > ins v0.b[3], w1
> >> >> > >> > ins v0.b[5], w1
> >> >> > >> > ins v0.b[7], w1
> >> >> > >> > ins v0.b[9], w1
> >> >> > >> > ins v0.b[11], w1
> >> >> > >> > ins v0.b[13], w1
> >> >> > >> > ins v0.b[15], w1
> >> >> > >> > ret
> >> >> > >> >
> >> >> > >> > code-gen patch:
> >> >> > >> > f_s16:
> >> >> > >> > adrpx2, .LC0
> >> >> > >> 

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-02-06 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> On Fri, 3 Feb 2023 at 20:47, Richard Sandiford
>  wrote:
>>
>> Prathamesh Kulkarni  writes:
>> > On Fri, 3 Feb 2023 at 07:10, Prathamesh Kulkarni
>> >  wrote:
>> >>
>> >> On Thu, 2 Feb 2023 at 20:50, Richard Sandiford
>> >>  wrote:
>> >> >
>> >> > Prathamesh Kulkarni  writes:
>> >> > >> >> > I have attached a patch that extends the transform if one half 
>> >> > >> >> > is dup
>> >> > >> >> > and other is set of constants.
>> >> > >> >> > For eg:
>> >> > >> >> > int8x16_t f(int8_t x)
>> >> > >> >> > {
>> >> > >> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 
>> >> > >> >> > 7, x, 8 };
>> >> > >> >> > }
>> >> > >> >> >
>> >> > >> >> > code-gen trunk:
>> >> > >> >> > f:
>> >> > >> >> > adrpx1, .LC0
>> >> > >> >> > ldr q0, [x1, #:lo12:.LC0]
>> >> > >> >> > ins v0.b[0], w0
>> >> > >> >> > ins v0.b[2], w0
>> >> > >> >> > ins v0.b[4], w0
>> >> > >> >> > ins v0.b[6], w0
>> >> > >> >> > ins v0.b[8], w0
>> >> > >> >> > ins v0.b[10], w0
>> >> > >> >> > ins v0.b[12], w0
>> >> > >> >> > ins v0.b[14], w0
>> >> > >> >> > ret
>> >> > >> >> >
>> >> > >> >> > code-gen with patch:
>> >> > >> >> > f:
>> >> > >> >> > dup v0.16b, w0
>> >> > >> >> > adrpx0, .LC0
>> >> > >> >> > ldr q1, [x0, #:lo12:.LC0]
>> >> > >> >> > zip1v0.16b, v0.16b, v1.16b
>> >> > >> >> > ret
>> >> > >> >> >
>> >> > >> >> > Bootstrapped+tested on aarch64-linux-gnu.
>> >> > >> >> > Does it look OK ?
>> >> > >> >>
>> >> > >> >> Looks like a nice improvement.  It'll need to wait for GCC 14 now 
>> >> > >> >> though.
>> >> > >> >>
>> >> > >> >> However, rather than handle this case specially, I think we 
>> >> > >> >> should instead
>> >> > >> >> take a divide-and-conquer approach: split the initialiser into 
>> >> > >> >> even and
>> >> > >> >> odd elements, find the best way of loading each part, then 
>> >> > >> >> compare the
>> >> > >> >> cost of these sequences + ZIP with the cost of the fallback code 
>> >> > >> >> (the code
>> >> > >> >> later in aarch64_expand_vector_init).
>> >> > >> >>
>> >> > >> >> For example, doing that would allow:
>> >> > >> >>
>> >> > >> >>   { x, y, 0, y, 0, y, 0, y, 0, y }
>> >> > >> >>
>> >> > >> >> to be loaded more easily, even though the even elements aren't 
>> >> > >> >> wholly
>> >> > >> >> constant.
>> >> > >> > Hi Richard,
>> >> > >> > I have attached a prototype patch based on the above approach.
>> >> > >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case 
>> >> > >> > by generating
>> >> > >> > same sequence, thus I removed that hunk, and improves the 
>> >> > >> > following cases:
>> >> > >> >
>> >> > >> > (a)
>> >> > >> > int8x16_t f_s16(int8_t x)
>> >> > >> > {
>> >> > >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
>> >> > >> >  x, 5, x, 6, x, 7, x, 8 };
>> >> > >> > }
>> >> > >> >
>> >> > >> > code-gen trunk:
>> >> > >> > f_s16:
>> >> > >> > adrpx1, .LC0
>> >> > >> > ldr q0, [x1, #:lo12:.LC0]
>> >> > >> > ins v0.b[0], w0
>> >> > >> > ins v0.b[2], w0
>> >> > >> > ins v0.b[4], w0
>> >> > >> > ins v0.b[6], w0
>> >> > >> > ins v0.b[8], w0
>> >> > >> > ins v0.b[10], w0
>> >> > >> > ins v0.b[12], w0
>> >> > >> > ins v0.b[14], w0
>> >> > >> > ret
>> >> > >> >
>> >> > >> > code-gen with patch:
>> >> > >> > f_s16:
>> >> > >> > dup v0.16b, w0
>> >> > >> > adrpx0, .LC0
>> >> > >> > ldr q1, [x0, #:lo12:.LC0]
>> >> > >> > zip1v0.16b, v0.16b, v1.16b
>> >> > >> > ret
>> >> > >> >
>> >> > >> > (b)
>> >> > >> > int8x16_t f_s16(int8_t x, int8_t y)
>> >> > >> > {
>> >> > >> >   return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
>> >> > >> > 4, y, 5, y, 6, y, 7, y };
>> >> > >> > }
>> >> > >> >
>> >> > >> > code-gen trunk:
>> >> > >> > f_s16:
>> >> > >> > adrpx2, .LC0
>> >> > >> > ldr q0, [x2, #:lo12:.LC0]
>> >> > >> > ins v0.b[0], w0
>> >> > >> > ins v0.b[1], w1
>> >> > >> > ins v0.b[3], w1
>> >> > >> > ins v0.b[5], w1
>> >> > >> > ins v0.b[7], w1
>> >> > >> > ins v0.b[9], w1
>> >> > >> > ins v0.b[11], w1
>> >> > >> > ins v0.b[13], w1
>> >> > >> > ins v0.b[15], w1
>> >> > >> > ret
>> >> > >> >
>> >> > >> > code-gen patch:
>> >> > >> > f_s16:
>> >> > >> > adrpx2, .LC0
>> >> > >> > dup v1.16b, w1
>> >> > >> > ldr q0, [x2, #:lo12:.LC0]
>> >> > >> > ins v0.b[0], w0
>> >> > >> > zip1v0.16b, v0.16b, v1.16b
>> >> > >> > ret
>> >> > >>
>> >> > >> Nice.
>> >> > >>
>> >> > >> > There are a couple of issues I have come across:
>> >> > >> > (1)

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-02-03 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 3 Feb 2023 at 20:47, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Fri, 3 Feb 2023 at 07:10, Prathamesh Kulkarni
> >  wrote:
> >>
> >> On Thu, 2 Feb 2023 at 20:50, Richard Sandiford
> >>  wrote:
> >> >
> >> > Prathamesh Kulkarni  writes:
> >> > >> >> > I have attached a patch that extends the transform if one half 
> >> > >> >> > is dup
> >> > >> >> > and other is set of constants.
> >> > >> >> > For eg:
> >> > >> >> > int8x16_t f(int8_t x)
> >> > >> >> > {
> >> > >> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, 
> >> > >> >> > x, 8 };
> >> > >> >> > }
> >> > >> >> >
> >> > >> >> > code-gen trunk:
> >> > >> >> > f:
> >> > >> >> > adrpx1, .LC0
> >> > >> >> > ldr q0, [x1, #:lo12:.LC0]
> >> > >> >> > ins v0.b[0], w0
> >> > >> >> > ins v0.b[2], w0
> >> > >> >> > ins v0.b[4], w0
> >> > >> >> > ins v0.b[6], w0
> >> > >> >> > ins v0.b[8], w0
> >> > >> >> > ins v0.b[10], w0
> >> > >> >> > ins v0.b[12], w0
> >> > >> >> > ins v0.b[14], w0
> >> > >> >> > ret
> >> > >> >> >
> >> > >> >> > code-gen with patch:
> >> > >> >> > f:
> >> > >> >> > dup v0.16b, w0
> >> > >> >> > adrpx0, .LC0
> >> > >> >> > ldr q1, [x0, #:lo12:.LC0]
> >> > >> >> > zip1v0.16b, v0.16b, v1.16b
> >> > >> >> > ret
> >> > >> >> >
> >> > >> >> > Bootstrapped+tested on aarch64-linux-gnu.
> >> > >> >> > Does it look OK ?
> >> > >> >>
> >> > >> >> Looks like a nice improvement.  It'll need to wait for GCC 14 now 
> >> > >> >> though.
> >> > >> >>
> >> > >> >> However, rather than handle this case specially, I think we should 
> >> > >> >> instead
> >> > >> >> take a divide-and-conquer approach: split the initialiser into 
> >> > >> >> even and
> >> > >> >> odd elements, find the best way of loading each part, then compare 
> >> > >> >> the
> >> > >> >> cost of these sequences + ZIP with the cost of the fallback code 
> >> > >> >> (the code
> >> > >> >> later in aarch64_expand_vector_init).
> >> > >> >>
> >> > >> >> For example, doing that would allow:
> >> > >> >>
> >> > >> >>   { x, y, 0, y, 0, y, 0, y, 0, y }
> >> > >> >>
> >> > >> >> to be loaded more easily, even though the even elements aren't 
> >> > >> >> wholly
> >> > >> >> constant.
> >> > >> > Hi Richard,
> >> > >> > I have attached a prototype patch based on the above approach.
> >> > >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by 
> >> > >> > generating
> >> > >> > same sequence, thus I removed that hunk, and improves the following 
> >> > >> > cases:
> >> > >> >
> >> > >> > (a)
> >> > >> > int8x16_t f_s16(int8_t x)
> >> > >> > {
> >> > >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> >> > >> >  x, 5, x, 6, x, 7, x, 8 };
> >> > >> > }
> >> > >> >
> >> > >> > code-gen trunk:
> >> > >> > f_s16:
> >> > >> > adrpx1, .LC0
> >> > >> > ldr q0, [x1, #:lo12:.LC0]
> >> > >> > ins v0.b[0], w0
> >> > >> > ins v0.b[2], w0
> >> > >> > ins v0.b[4], w0
> >> > >> > ins v0.b[6], w0
> >> > >> > ins v0.b[8], w0
> >> > >> > ins v0.b[10], w0
> >> > >> > ins v0.b[12], w0
> >> > >> > ins v0.b[14], w0
> >> > >> > ret
> >> > >> >
> >> > >> > code-gen with patch:
> >> > >> > f_s16:
> >> > >> > dup v0.16b, w0
> >> > >> > adrpx0, .LC0
> >> > >> > ldr q1, [x0, #:lo12:.LC0]
> >> > >> > zip1v0.16b, v0.16b, v1.16b
> >> > >> > ret
> >> > >> >
> >> > >> > (b)
> >> > >> > int8x16_t f_s16(int8_t x, int8_t y)
> >> > >> > {
> >> > >> >   return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
> >> > >> > 4, y, 5, y, 6, y, 7, y };
> >> > >> > }
> >> > >> >
> >> > >> > code-gen trunk:
> >> > >> > f_s16:
> >> > >> > adrpx2, .LC0
> >> > >> > ldr q0, [x2, #:lo12:.LC0]
> >> > >> > ins v0.b[0], w0
> >> > >> > ins v0.b[1], w1
> >> > >> > ins v0.b[3], w1
> >> > >> > ins v0.b[5], w1
> >> > >> > ins v0.b[7], w1
> >> > >> > ins v0.b[9], w1
> >> > >> > ins v0.b[11], w1
> >> > >> > ins v0.b[13], w1
> >> > >> > ins v0.b[15], w1
> >> > >> > ret
> >> > >> >
> >> > >> > code-gen patch:
> >> > >> > f_s16:
> >> > >> > adrpx2, .LC0
> >> > >> > dup v1.16b, w1
> >> > >> > ldr q0, [x2, #:lo12:.LC0]
> >> > >> > ins v0.b[0], w0
> >> > >> > zip1v0.16b, v0.16b, v1.16b
> >> > >> > ret
> >> > >>
> >> > >> Nice.
> >> > >>
> >> > >> > There are a couple of issues I have come across:
> >> > >> > (1) Choosing element to pad vector.
> >> > >> > For eg, if we are initializing a vector say { x, y, 0, y, 1, y, 2, 
> >> > >> > y }
> >> > >> > with mode V8HI.
> >> > >> 

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-02-03 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> On Fri, 3 Feb 2023 at 07:10, Prathamesh Kulkarni
>  wrote:
>>
>> On Thu, 2 Feb 2023 at 20:50, Richard Sandiford
>>  wrote:
>> >
>> > Prathamesh Kulkarni  writes:
>> > >> >> > I have attached a patch that extends the transform if one half is 
>> > >> >> > dup
>> > >> >> > and other is set of constants.
>> > >> >> > For eg:
>> > >> >> > int8x16_t f(int8_t x)
>> > >> >> > {
>> > >> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, 
>> > >> >> > x, 8 };
>> > >> >> > }
>> > >> >> >
>> > >> >> > code-gen trunk:
>> > >> >> > f:
>> > >> >> > adrpx1, .LC0
>> > >> >> > ldr q0, [x1, #:lo12:.LC0]
>> > >> >> > ins v0.b[0], w0
>> > >> >> > ins v0.b[2], w0
>> > >> >> > ins v0.b[4], w0
>> > >> >> > ins v0.b[6], w0
>> > >> >> > ins v0.b[8], w0
>> > >> >> > ins v0.b[10], w0
>> > >> >> > ins v0.b[12], w0
>> > >> >> > ins v0.b[14], w0
>> > >> >> > ret
>> > >> >> >
>> > >> >> > code-gen with patch:
>> > >> >> > f:
>> > >> >> > dup v0.16b, w0
>> > >> >> > adrpx0, .LC0
>> > >> >> > ldr q1, [x0, #:lo12:.LC0]
>> > >> >> > zip1v0.16b, v0.16b, v1.16b
>> > >> >> > ret
>> > >> >> >
>> > >> >> > Bootstrapped+tested on aarch64-linux-gnu.
>> > >> >> > Does it look OK ?
>> > >> >>
>> > >> >> Looks like a nice improvement.  It'll need to wait for GCC 14 now 
>> > >> >> though.
>> > >> >>
>> > >> >> However, rather than handle this case specially, I think we should 
>> > >> >> instead
>> > >> >> take a divide-and-conquer approach: split the initialiser into even 
>> > >> >> and
>> > >> >> odd elements, find the best way of loading each part, then compare 
>> > >> >> the
>> > >> >> cost of these sequences + ZIP with the cost of the fallback code 
>> > >> >> (the code
>> > >> >> later in aarch64_expand_vector_init).
>> > >> >>
>> > >> >> For example, doing that would allow:
>> > >> >>
>> > >> >>   { x, y, 0, y, 0, y, 0, y, 0, y }
>> > >> >>
>> > >> >> to be loaded more easily, even though the even elements aren't wholly
>> > >> >> constant.
>> > >> > Hi Richard,
>> > >> > I have attached a prototype patch based on the above approach.
>> > >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by 
>> > >> > generating
>> > >> > same sequence, thus I removed that hunk, and improves the following 
>> > >> > cases:
>> > >> >
>> > >> > (a)
>> > >> > int8x16_t f_s16(int8_t x)
>> > >> > {
>> > >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
>> > >> >  x, 5, x, 6, x, 7, x, 8 };
>> > >> > }
>> > >> >
>> > >> > code-gen trunk:
>> > >> > f_s16:
>> > >> > adrpx1, .LC0
>> > >> > ldr q0, [x1, #:lo12:.LC0]
>> > >> > ins v0.b[0], w0
>> > >> > ins v0.b[2], w0
>> > >> > ins v0.b[4], w0
>> > >> > ins v0.b[6], w0
>> > >> > ins v0.b[8], w0
>> > >> > ins v0.b[10], w0
>> > >> > ins v0.b[12], w0
>> > >> > ins v0.b[14], w0
>> > >> > ret
>> > >> >
>> > >> > code-gen with patch:
>> > >> > f_s16:
>> > >> > dup v0.16b, w0
>> > >> > adrpx0, .LC0
>> > >> > ldr q1, [x0, #:lo12:.LC0]
>> > >> > zip1v0.16b, v0.16b, v1.16b
>> > >> > ret
>> > >> >
>> > >> > (b)
>> > >> > int8x16_t f_s16(int8_t x, int8_t y)
>> > >> > {
>> > >> >   return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
>> > >> > 4, y, 5, y, 6, y, 7, y };
>> > >> > }
>> > >> >
>> > >> > code-gen trunk:
>> > >> > f_s16:
>> > >> > adrpx2, .LC0
>> > >> > ldr q0, [x2, #:lo12:.LC0]
>> > >> > ins v0.b[0], w0
>> > >> > ins v0.b[1], w1
>> > >> > ins v0.b[3], w1
>> > >> > ins v0.b[5], w1
>> > >> > ins v0.b[7], w1
>> > >> > ins v0.b[9], w1
>> > >> > ins v0.b[11], w1
>> > >> > ins v0.b[13], w1
>> > >> > ins v0.b[15], w1
>> > >> > ret
>> > >> >
>> > >> > code-gen patch:
>> > >> > f_s16:
>> > >> > adrpx2, .LC0
>> > >> > dup v1.16b, w1
>> > >> > ldr q0, [x2, #:lo12:.LC0]
>> > >> > ins v0.b[0], w0
>> > >> > zip1v0.16b, v0.16b, v1.16b
>> > >> > ret
>> > >>
>> > >> Nice.
>> > >>
>> > >> > There are a couple of issues I have come across:
>> > >> > (1) Choosing element to pad vector.
>> > >> > For eg, if we are initiailizing a vector say { x, y, 0, y, 1, y, 2, y 
>> > >> > }
>> > >> > with mode V8HI.
>> > >> > We split it into { x, 0, 1, 2 } and { y, y, y, y}
>> > >> > However since the mode is V8HI, we would need to pad the above split 
>> > >> > vectors
>> > >> > with 4 more elements to match up to vector length.
>> > >> > For {x, 0, 1, 2} using any constant is the obvious choice while for 
>> > >> > {y, y, y, y}
>> > >> > using 'y' is the obvious

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-02-02 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 3 Feb 2023 at 07:10, Prathamesh Kulkarni
 wrote:
>
> On Thu, 2 Feb 2023 at 20:50, Richard Sandiford
>  wrote:
> >
> > Prathamesh Kulkarni  writes:
> > >> >> > I have attached a patch that extends the transform if one half is 
> > >> >> > dup
> > >> >> > and other is set of constants.
> > >> >> > For eg:
> > >> >> > int8x16_t f(int8_t x)
> > >> >> > {
> > >> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 
> > >> >> > 8 };
> > >> >> > }
> > >> >> >
> > >> >> > code-gen trunk:
> > >> >> > f:
> > >> >> > adrpx1, .LC0
> > >> >> > ldr q0, [x1, #:lo12:.LC0]
> > >> >> > ins v0.b[0], w0
> > >> >> > ins v0.b[2], w0
> > >> >> > ins v0.b[4], w0
> > >> >> > ins v0.b[6], w0
> > >> >> > ins v0.b[8], w0
> > >> >> > ins v0.b[10], w0
> > >> >> > ins v0.b[12], w0
> > >> >> > ins v0.b[14], w0
> > >> >> > ret
> > >> >> >
> > >> >> > code-gen with patch:
> > >> >> > f:
> > >> >> > dup v0.16b, w0
> > >> >> > adrpx0, .LC0
> > >> >> > ldr q1, [x0, #:lo12:.LC0]
> > >> >> > zip1v0.16b, v0.16b, v1.16b
> > >> >> > ret
> > >> >> >
> > >> >> > Bootstrapped+tested on aarch64-linux-gnu.
> > >> >> > Does it look OK ?
> > >> >>
> > >> >> Looks like a nice improvement.  It'll need to wait for GCC 14 now 
> > >> >> though.
> > >> >>
> > >> >> However, rather than handle this case specially, I think we should 
> > >> >> instead
> > >> >> take a divide-and-conquer approach: split the initialiser into even 
> > >> >> and
> > >> >> odd elements, find the best way of loading each part, then compare the
> > >> >> cost of these sequences + ZIP with the cost of the fallback code (the 
> > >> >> code
> > >> >> later in aarch64_expand_vector_init).
> > >> >>
> > >> >> For example, doing that would allow:
> > >> >>
> > >> >>   { x, y, 0, y, 0, y, 0, y, 0, y }
> > >> >>
> > >> >> to be loaded more easily, even though the even elements aren't wholly
> > >> >> constant.
> > >> > Hi Richard,
> > >> > I have attached a prototype patch based on the above approach.
> > >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by 
> > >> > generating
> > >> > same sequence, thus I removed that hunk, and improves the following 
> > >> > cases:
> > >> >
> > >> > (a)
> > >> > int8x16_t f_s16(int8_t x)
> > >> > {
> > >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> > >> >  x, 5, x, 6, x, 7, x, 8 };
> > >> > }
> > >> >
> > >> > code-gen trunk:
> > >> > f_s16:
> > >> > adrpx1, .LC0
> > >> > ldr q0, [x1, #:lo12:.LC0]
> > >> > ins v0.b[0], w0
> > >> > ins v0.b[2], w0
> > >> > ins v0.b[4], w0
> > >> > ins v0.b[6], w0
> > >> > ins v0.b[8], w0
> > >> > ins v0.b[10], w0
> > >> > ins v0.b[12], w0
> > >> > ins v0.b[14], w0
> > >> > ret
> > >> >
> > >> > code-gen with patch:
> > >> > f_s16:
> > >> > dup v0.16b, w0
> > >> > adrpx0, .LC0
> > >> > ldr q1, [x0, #:lo12:.LC0]
> > >> > zip1v0.16b, v0.16b, v1.16b
> > >> > ret
> > >> >
> > >> > (b)
> > >> > int8x16_t f_s16(int8_t x, int8_t y)
> > >> > {
> > >> >   return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
> > >> > 4, y, 5, y, 6, y, 7, y };
> > >> > }
> > >> >
> > >> > code-gen trunk:
> > >> > f_s16:
> > >> > adrpx2, .LC0
> > >> > ldr q0, [x2, #:lo12:.LC0]
> > >> > ins v0.b[0], w0
> > >> > ins v0.b[1], w1
> > >> > ins v0.b[3], w1
> > >> > ins v0.b[5], w1
> > >> > ins v0.b[7], w1
> > >> > ins v0.b[9], w1
> > >> > ins v0.b[11], w1
> > >> > ins v0.b[13], w1
> > >> > ins v0.b[15], w1
> > >> > ret
> > >> >
> > >> > code-gen patch:
> > >> > f_s16:
> > >> > adrpx2, .LC0
> > >> > dup v1.16b, w1
> > >> > ldr q0, [x2, #:lo12:.LC0]
> > >> > ins v0.b[0], w0
> > >> > zip1v0.16b, v0.16b, v1.16b
> > >> > ret
> > >>
> > >> Nice.
> > >>
> > >> > There are a couple of issues I have come across:
> > >> > (1) Choosing element to pad vector.
> > >> > For eg, if we are initializing a vector say { x, y, 0, y, 1, y, 2, y }
> > >> > with mode V8HI.
> > >> > We split it into { x, 0, 1, 2 } and { y, y, y, y}
> > >> > However since the mode is V8HI, we would need to pad the above split 
> > >> > vectors
> > >> > with 4 more elements to match up to vector length.
> > >> > For {x, 0, 1, 2} using any constant is the obvious choice while for 
> > >> > {y, y, y, y}
> > >> > using 'y' is the obvious choice thus making them:
> > >> > {x, 0, 1, 2, 0, 0, 0, 0} and {y, y, y, y, y, y, y, y}
> > >> > These would be then merged using zip1 which would discard the lower 
> > >> > half
> > >> > of

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-02-02 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 2 Feb 2023 at 20:50, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> >> >> > I have attached a patch that extends the transform if one half is dup
> >> >> > and other is set of constants.
> >> >> > For eg:
> >> >> > int8x16_t f(int8_t x)
> >> >> > {
> >> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 
> >> >> > };
> >> >> > }
> >> >> >
> >> >> > code-gen trunk:
> >> >> > f:
> >> >> > adrpx1, .LC0
> >> >> > ldr q0, [x1, #:lo12:.LC0]
> >> >> > ins v0.b[0], w0
> >> >> > ins v0.b[2], w0
> >> >> > ins v0.b[4], w0
> >> >> > ins v0.b[6], w0
> >> >> > ins v0.b[8], w0
> >> >> > ins v0.b[10], w0
> >> >> > ins v0.b[12], w0
> >> >> > ins v0.b[14], w0
> >> >> > ret
> >> >> >
> >> >> > code-gen with patch:
> >> >> > f:
> >> >> > dup v0.16b, w0
> >> >> > adrpx0, .LC0
> >> >> > ldr q1, [x0, #:lo12:.LC0]
> >> >> > zip1v0.16b, v0.16b, v1.16b
> >> >> > ret
> >> >> >
> >> >> > Bootstrapped+tested on aarch64-linux-gnu.
> >> >> > Does it look OK ?
> >> >>
> >> >> Looks like a nice improvement.  It'll need to wait for GCC 14 now 
> >> >> though.
> >> >>
> >> >> However, rather than handle this case specially, I think we should 
> >> >> instead
> >> >> take a divide-and-conquer approach: split the initialiser into even and
> >> >> odd elements, find the best way of loading each part, then compare the
> >> >> cost of these sequences + ZIP with the cost of the fallback code (the 
> >> >> code
> >> >> later in aarch64_expand_vector_init).
> >> >>
> >> >> For example, doing that would allow:
> >> >>
> >> >>   { x, y, 0, y, 0, y, 0, y, 0, y }
> >> >>
> >> >> to be loaded more easily, even though the even elements aren't wholly
> >> >> constant.
> >> > Hi Richard,
> >> > I have attached a prototype patch based on the above approach.
> >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by 
> >> > generating
> >> > same sequence, thus I removed that hunk, and improves the following 
> >> > cases:
> >> >
> >> > (a)
> >> > int8x16_t f_s16(int8_t x)
> >> > {
> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> >> >  x, 5, x, 6, x, 7, x, 8 };
> >> > }
> >> >
> >> > code-gen trunk:
> >> > f_s16:
> >> > adrpx1, .LC0
> >> > ldr q0, [x1, #:lo12:.LC0]
> >> > ins v0.b[0], w0
> >> > ins v0.b[2], w0
> >> > ins v0.b[4], w0
> >> > ins v0.b[6], w0
> >> > ins v0.b[8], w0
> >> > ins v0.b[10], w0
> >> > ins v0.b[12], w0
> >> > ins v0.b[14], w0
> >> > ret
> >> >
> >> > code-gen with patch:
> >> > f_s16:
> >> > dup v0.16b, w0
> >> > adrpx0, .LC0
> >> > ldr q1, [x0, #:lo12:.LC0]
> >> > zip1v0.16b, v0.16b, v1.16b
> >> > ret
> >> >
> >> > (b)
> >> > int8x16_t f_s16(int8_t x, int8_t y)
> >> > {
> >> >   return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
> >> > 4, y, 5, y, 6, y, 7, y };
> >> > }
> >> >
> >> > code-gen trunk:
> >> > f_s16:
> >> > adrpx2, .LC0
> >> > ldr q0, [x2, #:lo12:.LC0]
> >> > ins v0.b[0], w0
> >> > ins v0.b[1], w1
> >> > ins v0.b[3], w1
> >> > ins v0.b[5], w1
> >> > ins v0.b[7], w1
> >> > ins v0.b[9], w1
> >> > ins v0.b[11], w1
> >> > ins v0.b[13], w1
> >> > ins v0.b[15], w1
> >> > ret
> >> >
> >> > code-gen patch:
> >> > f_s16:
> >> > adrpx2, .LC0
> >> > dup v1.16b, w1
> >> > ldr q0, [x2, #:lo12:.LC0]
> >> > ins v0.b[0], w0
> >> > zip1v0.16b, v0.16b, v1.16b
> >> > ret
> >>
> >> Nice.
> >>
> >> > There are a couple of issues I have come across:
> >> > (1) Choosing element to pad vector.
> >> > For eg, if we are initializing a vector say { x, y, 0, y, 1, y, 2, y }
> >> > with mode V8HI.
> >> > We split it into { x, 0, 1, 2 } and { y, y, y, y}
> >> > However since the mode is V8HI, we would need to pad the above split 
> >> > vectors
> >> > with 4 more elements to match up to vector length.
> >> > For {x, 0, 1, 2} using any constant is the obvious choice while for {y, 
> >> > y, y, y}
> >> > using 'y' is the obvious choice thus making them:
> >> > {x, 0, 1, 2, 0, 0, 0, 0} and {y, y, y, y, y, y, y, y}
> >> > These would be then merged using zip1 which would discard the lower half
> >> > of both vectors.
> >> > Currently I encoded the above two heuristics in
> >> > aarch64_expand_vector_init_get_padded_elem:
> >> > (a) If split portion contains a constant, use the constant to pad the 
> >> > vector.
> >> > (b) If split portion only contains variables, then use the most
> >> > frequently repeating variable
> >> > to pad the vector.
> >> > I suppose tho 

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-02-02 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
>> >> > I have attached a patch that extends the transform if one half is dup
>> >> > and other is set of constants.
>> >> > For eg:
>> >> > int8x16_t f(int8_t x)
>> >> > {
>> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
>> >> > }
>> >> >
>> >> > code-gen trunk:
>> >> > f:
>> >> > adrpx1, .LC0
>> >> > ldr q0, [x1, #:lo12:.LC0]
>> >> > ins v0.b[0], w0
>> >> > ins v0.b[2], w0
>> >> > ins v0.b[4], w0
>> >> > ins v0.b[6], w0
>> >> > ins v0.b[8], w0
>> >> > ins v0.b[10], w0
>> >> > ins v0.b[12], w0
>> >> > ins v0.b[14], w0
>> >> > ret
>> >> >
>> >> > code-gen with patch:
>> >> > f:
>> >> > dup v0.16b, w0
>> >> > adrpx0, .LC0
>> >> > ldr q1, [x0, #:lo12:.LC0]
>> >> > zip1v0.16b, v0.16b, v1.16b
>> >> > ret
>> >> >
>> >> > Bootstrapped+tested on aarch64-linux-gnu.
>> >> > Does it look OK ?
>> >>
>> >> Looks like a nice improvement.  It'll need to wait for GCC 14 now though.
>> >>
>> >> However, rather than handle this case specially, I think we should instead
>> >> take a divide-and-conquer approach: split the initialiser into even and
>> >> odd elements, find the best way of loading each part, then compare the
>> >> cost of these sequences + ZIP with the cost of the fallback code (the code
>> >> later in aarch64_expand_vector_init).
>> >>
>> >> For example, doing that would allow:
>> >>
>> >>   { x, y, 0, y, 0, y, 0, y, 0, y }
>> >>
>> >> to be loaded more easily, even though the even elements aren't wholly
>> >> constant.
>> > Hi Richard,
>> > I have attached a prototype patch based on the above approach.
>> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by 
>> > generating
>> > same sequence, thus I removed that hunk, and improves the following cases:
>> >
>> > (a)
>> > int8x16_t f_s16(int8_t x)
>> > {
>> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
>> >  x, 5, x, 6, x, 7, x, 8 };
>> > }
>> >
>> > code-gen trunk:
>> > f_s16:
>> > adrpx1, .LC0
>> > ldr q0, [x1, #:lo12:.LC0]
>> > ins v0.b[0], w0
>> > ins v0.b[2], w0
>> > ins v0.b[4], w0
>> > ins v0.b[6], w0
>> > ins v0.b[8], w0
>> > ins v0.b[10], w0
>> > ins v0.b[12], w0
>> > ins v0.b[14], w0
>> > ret
>> >
>> > code-gen with patch:
>> > f_s16:
>> > dup v0.16b, w0
>> > adrpx0, .LC0
>> > ldr q1, [x0, #:lo12:.LC0]
>> > zip1v0.16b, v0.16b, v1.16b
>> > ret
>> >
>> > (b)
>> > int8x16_t f_s16(int8_t x, int8_t y)
>> > {
>> >   return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
>> > 4, y, 5, y, 6, y, 7, y };
>> > }
>> >
>> > code-gen trunk:
>> > f_s16:
>> > adrpx2, .LC0
>> > ldr q0, [x2, #:lo12:.LC0]
>> > ins v0.b[0], w0
>> > ins v0.b[1], w1
>> > ins v0.b[3], w1
>> > ins v0.b[5], w1
>> > ins v0.b[7], w1
>> > ins v0.b[9], w1
>> > ins v0.b[11], w1
>> > ins v0.b[13], w1
>> > ins v0.b[15], w1
>> > ret
>> >
>> > code-gen patch:
>> > f_s16:
>> > adrpx2, .LC0
>> > dup v1.16b, w1
>> > ldr q0, [x2, #:lo12:.LC0]
>> > ins v0.b[0], w0
>> > zip1v0.16b, v0.16b, v1.16b
>> > ret
>>
>> Nice.
>>
>> > There are a couple of issues I have come across:
>> > (1) Choosing element to pad vector.
>> > For eg, if we are initializing a vector say { x, y, 0, y, 1, y, 2, y }
>> > with mode V8HI.
>> > We split it into { x, 0, 1, 2 } and { y, y, y, y}
>> > However since the mode is V8HI, we would need to pad the above split 
>> > vectors
>> > with 4 more elements to match up to vector length.
>> > For {x, 0, 1, 2} using any constant is the obvious choice while for {y, y, 
>> > y, y}
>> > using 'y' is the obvious choice thus making them:
>> > {x, 0, 1, 2, 0, 0, 0, 0} and {y, y, y, y, y, y, y, y}
>> > These would be then merged using zip1 which would discard the lower half
>> > of both vectors.
>> > Currently I encoded the above two heuristics in
>> > aarch64_expand_vector_init_get_padded_elem:
>> > (a) If split portion contains a constant, use the constant to pad the 
>> > vector.
>> > (b) If split portion only contains variables, then use the most
>> > frequently repeating variable
>> > to pad the vector.
>> > I suppose tho this could be improved ?
>>
>> I think we should just build two 64-bit vectors (V4HIs) and use a subreg
>> to fill the upper elements with undefined values.
>>
>> I suppose in principle we would have the same problem when splitting
>> a 64-bit vector into 2 32-bit vectors, but it's probably better to punt
>> on that for now.  Eventually it would be worth adding full support fo

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-02-02 Thread Prathamesh Kulkarni via Gcc-patches
On Wed, 1 Feb 2023 at 21:56, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Thu, 12 Jan 2023 at 21:21, Richard Sandiford
> >  wrote:
> >>
> >> Prathamesh Kulkarni  writes:
> >> > On Tue, 6 Dec 2022 at 07:01, Prathamesh Kulkarni
> >> >  wrote:
> >> >>
> >> >> On Mon, 5 Dec 2022 at 16:50, Richard Sandiford
> >> >>  wrote:
> >> >> >
> >> >> > Richard Sandiford via Gcc-patches  writes:
> >> >> > > Prathamesh Kulkarni  writes:
> >> >> > >> Hi,
> >> >> > >> For the following test-case:
> >> >> > >>
> >> >> > >> int16x8_t foo(int16_t x, int16_t y)
> >> >> > >> {
> >> >> > >>   return (int16x8_t) { x, y, x, y, x, y, x, y };
> >> >> > >> }
> >> >> > >>
> >> >> > >> Code gen at -O3:
> >> >> > >> foo:
> >> >> > >> dupv0.8h, w0
> >> >> > >> ins v0.h[1], w1
> >> >> > >> ins v0.h[3], w1
> >> >> > >> ins v0.h[5], w1
> >> >> > >> ins v0.h[7], w1
> >> >> > >> ret
> >> >> > >>
> >> >> > >> For 16 elements, it results in 8 ins instructions which might not 
> >> >> > >> be
> >> >> > >> optimal perhaps.
> >> >> > >> I guess, the above code-gen would be equivalent to the following ?
> >> >> > >> dup v0.8h, w0
> >> >> > >> dup v1.8h, w1
> >> >> > >> zip1 v0.8h, v0.8h, v1.8h
> >> >> > >>
> >> >> > >> I have attached patch to do the same, if number of elements >= 8,
> >> >> > >> which should be possibly better compared to current code-gen ?
> >> >> > >> Patch passes bootstrap+test on aarch64-linux-gnu.
> >> >> > >> Does the patch look OK ?
> >> >> > >>
> >> >> > >> Thanks,
> >> >> > >> Prathamesh
> >> >> > >>
> >> >> > >> diff --git a/gcc/config/aarch64/aarch64.cc 
> >> >> > >> b/gcc/config/aarch64/aarch64.cc
> >> >> > >> index c91df6f5006..e5dea70e363 100644
> >> >> > >> --- a/gcc/config/aarch64/aarch64.cc
> >> >> > >> +++ b/gcc/config/aarch64/aarch64.cc
> >> >> > >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, 
> >> >> > >> rtx vals)
> >> >> > >>return;
> >> >> > >>  }
> >> >> > >>
> >> >> > >> +  /* Check for interleaving case.
> >> >> > >> + For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, 
> >> >> > >> y}.
> >> >> > >> + Generate following code:
> >> >> > >> + dup v0.h, x
> >> >> > >> + dup v1.h, y
> >> >> > >> + zip1 v0.h, v0.h, v1.h
> >> >> > >> + for "large enough" initializer.  */
> >> >> > >> +
> >> >> > >> +  if (n_elts >= 8)
> >> >> > >> +{
> >> >> > >> +  int i;
> >> >> > >> +  for (i = 2; i < n_elts; i++)
> >> >> > >> +if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 
> >> >> > >> 2)))
> >> >> > >> +  break;
> >> >> > >> +
> >> >> > >> +  if (i == n_elts)
> >> >> > >> +{
> >> >> > >> +  machine_mode mode = GET_MODE (target);
> >> >> > >> +  rtx dest[2];
> >> >> > >> +
> >> >> > >> +  for (int i = 0; i < 2; i++)
> >> >> > >> +{
> >> >> > >> +  rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), 
> >> >> > >> XVECEXP (vals, 0, i));
> >> >> > >
> >> >> > > Formatting nit: long line.
> >> >> > >
> >> >> > >> +  dest[i] = gen_reg_rtx (mode);
> >> >> > >> +  aarch64_emit_move (dest[i], gen_vec_duplicate (mode, 
> >> >> > >> x));
> >> >> > >> +}
> >> >> > >
> >> >> > > This could probably be written:
> >> >> > >
> >> >> > > for (int i = 0; i < 2; i++)
> >> >> > >   {
> >> >> > > rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 
> >> >> > > 0, i));
> >> >> > > dest[i] = force_reg (GET_MODE_INNER (mode), x);
> >> >> >
> >> >> > Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.
> >> >> Thanks, I have pushed the change in
> >> >> 769370f3e2e04823c8a621d8ffa756dd83ebf21e after running
> >> >> bootstrap+test on aarch64-linux-gnu.
> >> > Hi Richard,
> >> > I have attached a patch that extends the transform if one half is dup
> >> > and other is set of constants.
> >> > For eg:
> >> > int8x16_t f(int8_t x)
> >> > {
> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
> >> > }
> >> >
> >> > code-gen trunk:
> >> > f:
> >> > adrpx1, .LC0
> >> > ldr q0, [x1, #:lo12:.LC0]
> >> > ins v0.b[0], w0
> >> > ins v0.b[2], w0
> >> > ins v0.b[4], w0
> >> > ins v0.b[6], w0
> >> > ins v0.b[8], w0
> >> > ins v0.b[10], w0
> >> > ins v0.b[12], w0
> >> > ins v0.b[14], w0
> >> > ret
> >> >
> >> > code-gen with patch:
> >> > f:
> >> > dup v0.16b, w0
> >> > adrpx0, .LC0
> >> > ldr q1, [x0, #:lo12:.LC0]
> >> > zip1v0.16b, v0.16b, v1.16b
> >> > ret
> >> >
> >> > Bootstrapped+tested on aarch64-linux-gnu.
> >> > Does it look OK ?
> >>
> >> Looks like a nice improvement.  It'll need to wait for GCC 14 now though.
> >>
> >> However, rather than handle this case specially, I think we should instead
> >> take a divide-and-conquer approach: split the initialis

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-02-01 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> On Thu, 12 Jan 2023 at 21:21, Richard Sandiford
>  wrote:
>>
>> Prathamesh Kulkarni  writes:
>> > On Tue, 6 Dec 2022 at 07:01, Prathamesh Kulkarni
>> >  wrote:
>> >>
>> >> On Mon, 5 Dec 2022 at 16:50, Richard Sandiford
>> >>  wrote:
>> >> >
>> >> > Richard Sandiford via Gcc-patches  writes:
>> >> > > Prathamesh Kulkarni  writes:
>> >> > >> Hi,
>> >> > >> For the following test-case:
>> >> > >>
>> >> > >> int16x8_t foo(int16_t x, int16_t y)
>> >> > >> {
>> >> > >>   return (int16x8_t) { x, y, x, y, x, y, x, y };
>> >> > >> }
>> >> > >>
>> >> > >> Code gen at -O3:
>> >> > >> foo:
>> >> > >> dupv0.8h, w0
>> >> > >> ins v0.h[1], w1
>> >> > >> ins v0.h[3], w1
>> >> > >> ins v0.h[5], w1
>> >> > >> ins v0.h[7], w1
>> >> > >> ret
>> >> > >>
>> >> > >> For 16 elements, it results in 8 ins instructions which might not be
>> >> > >> optimal perhaps.
>> >> > >> I guess, the above code-gen would be equivalent to the following ?
>> >> > >> dup v0.8h, w0
>> >> > >> dup v1.8h, w1
>> >> > >> zip1 v0.8h, v0.8h, v1.8h
>> >> > >>
>> >> > >> I have attached patch to do the same, if number of elements >= 8,
>> >> > >> which should be possibly better compared to current code-gen ?
>> >> > >> Patch passes bootstrap+test on aarch64-linux-gnu.
>> >> > >> Does the patch look OK ?
>> >> > >>
>> >> > >> Thanks,
>> >> > >> Prathamesh
>> >> > >>
>> >> > >> diff --git a/gcc/config/aarch64/aarch64.cc 
>> >> > >> b/gcc/config/aarch64/aarch64.cc
>> >> > >> index c91df6f5006..e5dea70e363 100644
>> >> > >> --- a/gcc/config/aarch64/aarch64.cc
>> >> > >> +++ b/gcc/config/aarch64/aarch64.cc
>> >> > >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx 
>> >> > >> vals)
>> >> > >>return;
>> >> > >>  }
>> >> > >>
>> >> > >> +  /* Check for interleaving case.
>> >> > >> + For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
>> >> > >> + Generate following code:
>> >> > >> + dup v0.h, x
>> >> > >> + dup v1.h, y
>> >> > >> + zip1 v0.h, v0.h, v1.h
>> >> > >> + for "large enough" initializer.  */
>> >> > >> +
>> >> > >> +  if (n_elts >= 8)
>> >> > >> +{
>> >> > >> +  int i;
>> >> > >> +  for (i = 2; i < n_elts; i++)
>> >> > >> +if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 
>> >> > >> 2)))
>> >> > >> +  break;
>> >> > >> +
>> >> > >> +  if (i == n_elts)
>> >> > >> +{
>> >> > >> +  machine_mode mode = GET_MODE (target);
>> >> > >> +  rtx dest[2];
>> >> > >> +
>> >> > >> +  for (int i = 0; i < 2; i++)
>> >> > >> +{
>> >> > >> +  rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP 
>> >> > >> (vals, 0, i));
>> >> > >
>> >> > > Formatting nit: long line.
>> >> > >
>> >> > >> +  dest[i] = gen_reg_rtx (mode);
>> >> > >> +  aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
>> >> > >> +}
>> >> > >
>> >> > > This could probably be written:
>> >> > >
>> >> > > for (int i = 0; i < 2; i++)
>> >> > >   {
>> >> > > rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, 
>> >> > > i));
>> >> > > dest[i] = force_reg (GET_MODE_INNER (mode), x);
>> >> >
>> >> > Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.
>> >> Thanks, I have pushed the change in
>> >> 769370f3e2e04823c8a621d8ffa756dd83ebf21e after running
>> >> bootstrap+test on aarch64-linux-gnu.
>> > Hi Richard,
>> > I have attached a patch that extends the transform if one half is dup
>> > and other is set of constants.
>> > For eg:
>> > int8x16_t f(int8_t x)
>> > {
>> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
>> > }
>> >
>> > code-gen trunk:
>> > f:
>> > adrpx1, .LC0
>> > ldr q0, [x1, #:lo12:.LC0]
>> > ins v0.b[0], w0
>> > ins v0.b[2], w0
>> > ins v0.b[4], w0
>> > ins v0.b[6], w0
>> > ins v0.b[8], w0
>> > ins v0.b[10], w0
>> > ins v0.b[12], w0
>> > ins v0.b[14], w0
>> > ret
>> >
>> > code-gen with patch:
>> > f:
>> > dup v0.16b, w0
>> > adrpx0, .LC0
>> > ldr q1, [x0, #:lo12:.LC0]
>> > zip1v0.16b, v0.16b, v1.16b
>> > ret
>> >
>> > Bootstrapped+tested on aarch64-linux-gnu.
>> > Does it look OK ?
>>
>> Looks like a nice improvement.  It'll need to wait for GCC 14 now though.
>>
>> However, rather than handle this case specially, I think we should instead
>> take a divide-and-conquer approach: split the initialiser into even and
>> odd elements, find the best way of loading each part, then compare the
>> cost of these sequences + ZIP with the cost of the fallback code (the code
>> later in aarch64_expand_vector_init).
>>
>> For example, doing that would allow:
>>
>>   { x, y, 0, y, 0, y, 0, y, 0, y }
>>
>> to be loaded more easily, even though the even elements aren't wholly
>> 

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-02-01 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 12 Jan 2023 at 21:21, Richard Sandiford
 wrote:
>
> Prathamesh Kulkarni  writes:
> > On Tue, 6 Dec 2022 at 07:01, Prathamesh Kulkarni
> >  wrote:
> >>
> >> On Mon, 5 Dec 2022 at 16:50, Richard Sandiford
> >>  wrote:
> >> >
> >> > Richard Sandiford via Gcc-patches  writes:
> >> > > Prathamesh Kulkarni  writes:
> >> > >> Hi,
> >> > >> For the following test-case:
> >> > >>
> >> > >> int16x8_t foo(int16_t x, int16_t y)
> >> > >> {
> >> > >>   return (int16x8_t) { x, y, x, y, x, y, x, y };
> >> > >> }
> >> > >>
> >> > >> Code gen at -O3:
> >> > >> foo:
> >> > >> dupv0.8h, w0
> >> > >> ins v0.h[1], w1
> >> > >> ins v0.h[3], w1
> >> > >> ins v0.h[5], w1
> >> > >> ins v0.h[7], w1
> >> > >> ret
> >> > >>
> >> > >> For 16 elements, it results in 8 ins instructions which might not be
> >> > >> optimal perhaps.
> >> > >> I guess, the above code-gen would be equivalent to the following ?
> >> > >> dup v0.8h, w0
> >> > >> dup v1.8h, w1
> >> > >> zip1 v0.8h, v0.8h, v1.8h
> >> > >>
> >> > >> I have attached patch to do the same, if number of elements >= 8,
> >> > >> which should be possibly better compared to current code-gen ?
> >> > >> Patch passes bootstrap+test on aarch64-linux-gnu.
> >> > >> Does the patch look OK ?
> >> > >>
> >> > >> Thanks,
> >> > >> Prathamesh
> >> > >>
> >> > >> diff --git a/gcc/config/aarch64/aarch64.cc 
> >> > >> b/gcc/config/aarch64/aarch64.cc
> >> > >> index c91df6f5006..e5dea70e363 100644
> >> > >> --- a/gcc/config/aarch64/aarch64.cc
> >> > >> +++ b/gcc/config/aarch64/aarch64.cc
> >> > >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx 
> >> > >> vals)
> >> > >>return;
> >> > >>  }
> >> > >>
> >> > >> +  /* Check for interleaving case.
> >> > >> + For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> >> > >> + Generate following code:
> >> > >> + dup v0.h, x
> >> > >> + dup v1.h, y
> >> > >> + zip1 v0.h, v0.h, v1.h
> >> > >> + for "large enough" initializer.  */
> >> > >> +
> >> > >> +  if (n_elts >= 8)
> >> > >> +{
> >> > >> +  int i;
> >> > >> +  for (i = 2; i < n_elts; i++)
> >> > >> +if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 
> >> > >> 2)))
> >> > >> +  break;
> >> > >> +
> >> > >> +  if (i == n_elts)
> >> > >> +{
> >> > >> +  machine_mode mode = GET_MODE (target);
> >> > >> +  rtx dest[2];
> >> > >> +
> >> > >> +  for (int i = 0; i < 2; i++)
> >> > >> +{
> >> > >> +  rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP 
> >> > >> (vals, 0, i));
> >> > >
> >> > > Formatting nit: long line.
> >> > >
> >> > >> +  dest[i] = gen_reg_rtx (mode);
> >> > >> +  aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
> >> > >> +}
> >> > >
> >> > > This could probably be written:
> >> > >
> >> > > for (int i = 0; i < 2; i++)
> >> > >   {
> >> > > rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, 
> >> > > i));
> >> > > dest[i] = force_reg (GET_MODE_INNER (mode), x);
> >> >
> >> > Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.
> >> Thanks, I have pushed the change in
> >> 769370f3e2e04823c8a621d8ffa756dd83ebf21e after running
> >> bootstrap+test on aarch64-linux-gnu.
> > Hi Richard,
> > I have attached a patch that extends the transform if one half is dup
> > and other is set of constants.
> > For eg:
> > int8x16_t f(int8_t x)
> > {
> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
> > }
> >
> > code-gen trunk:
> > f:
> > adrpx1, .LC0
> > ldr q0, [x1, #:lo12:.LC0]
> > ins v0.b[0], w0
> > ins v0.b[2], w0
> > ins v0.b[4], w0
> > ins v0.b[6], w0
> > ins v0.b[8], w0
> > ins v0.b[10], w0
> > ins v0.b[12], w0
> > ins v0.b[14], w0
> > ret
> >
> > code-gen with patch:
> > f:
> > dup v0.16b, w0
> > adrpx0, .LC0
> > ldr q1, [x0, #:lo12:.LC0]
> > zip1v0.16b, v0.16b, v1.16b
> > ret
> >
> > Bootstrapped+tested on aarch64-linux-gnu.
> > Does it look OK ?
>
> Looks like a nice improvement.  It'll need to wait for GCC 14 now though.
>
> However, rather than handle this case specially, I think we should instead
> take a divide-and-conquer approach: split the initialiser into even and
> odd elements, find the best way of loading each part, then compare the
> cost of these sequences + ZIP with the cost of the fallback code (the code
> later in aarch64_expand_vector_init).
>
> For example, doing that would allow:
>
>   { x, y, 0, y, 0, y, 0, y, 0, y }
>
> to be loaded more easily, even though the even elements aren't wholly
> constant.
Hi Richard,
I have attached a prototype patch based on the above approach.
It subsumes specializing for above {x, y, x, y, x, y, x, y} case by generating
same sequence

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2023-01-12 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> On Tue, 6 Dec 2022 at 07:01, Prathamesh Kulkarni
>  wrote:
>>
>> On Mon, 5 Dec 2022 at 16:50, Richard Sandiford
>>  wrote:
>> >
>> > Richard Sandiford via Gcc-patches  writes:
>> > > Prathamesh Kulkarni  writes:
>> > >> Hi,
>> > >> For the following test-case:
>> > >>
>> > >> int16x8_t foo(int16_t x, int16_t y)
>> > >> {
>> > >>   return (int16x8_t) { x, y, x, y, x, y, x, y };
>> > >> }
>> > >>
>> > >> Code gen at -O3:
>> > >> foo:
>> > >> dupv0.8h, w0
>> > >> ins v0.h[1], w1
>> > >> ins v0.h[3], w1
>> > >> ins v0.h[5], w1
>> > >> ins v0.h[7], w1
>> > >> ret
>> > >>
>> > >> For 16 elements, it results in 8 ins instructions which might not be
>> > >> optimal perhaps.
>> > >> I guess, the above code-gen would be equivalent to the following ?
>> > >> dup v0.8h, w0
>> > >> dup v1.8h, w1
>> > >> zip1 v0.8h, v0.8h, v1.8h
>> > >>
>> > >> I have attached patch to do the same, if number of elements >= 8,
>> > >> which should be possibly better compared to current code-gen ?
>> > >> Patch passes bootstrap+test on aarch64-linux-gnu.
>> > >> Does the patch look OK ?
>> > >>
>> > >> Thanks,
>> > >> Prathamesh
>> > >>
>> > >> diff --git a/gcc/config/aarch64/aarch64.cc 
>> > >> b/gcc/config/aarch64/aarch64.cc
>> > >> index c91df6f5006..e5dea70e363 100644
>> > >> --- a/gcc/config/aarch64/aarch64.cc
>> > >> +++ b/gcc/config/aarch64/aarch64.cc
>> > >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx 
>> > >> vals)
>> > >>return;
>> > >>  }
>> > >>
>> > >> +  /* Check for interleaving case.
>> > >> + For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
>> > >> + Generate following code:
>> > >> + dup v0.h, x
>> > >> + dup v1.h, y
>> > >> + zip1 v0.h, v0.h, v1.h
>> > >> + for "large enough" initializer.  */
>> > >> +
>> > >> +  if (n_elts >= 8)
>> > >> +{
>> > >> +  int i;
>> > >> +  for (i = 2; i < n_elts; i++)
>> > >> +if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
>> > >> +  break;
>> > >> +
>> > >> +  if (i == n_elts)
>> > >> +{
>> > >> +  machine_mode mode = GET_MODE (target);
>> > >> +  rtx dest[2];
>> > >> +
>> > >> +  for (int i = 0; i < 2; i++)
>> > >> +{
>> > >> +  rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP 
>> > >> (vals, 0, i));
>> > >
>> > > Formatting nit: long line.
>> > >
>> > >> +  dest[i] = gen_reg_rtx (mode);
>> > >> +  aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
>> > >> +}
>> > >
>> > > This could probably be written:
>> > >
>> > > for (int i = 0; i < 2; i++)
>> > >   {
>> > > rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
>> > > dest[i] = force_reg (GET_MODE_INNER (mode), x);
>> >
>> > Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.
>> Thanks, I have pushed the change in
>> 769370f3e2e04823c8a621d8ffa756dd83ebf21e after running
>> bootstrap+test on aarch64-linux-gnu.
> Hi Richard,
> I have attached a patch that extends the transform if one half is dup
> and other is set of constants.
> For eg:
> int8x16_t f(int8_t x)
> {
>   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
> }
>
> code-gen trunk:
> f:
> adrpx1, .LC0
> ldr q0, [x1, #:lo12:.LC0]
> ins v0.b[0], w0
> ins v0.b[2], w0
> ins v0.b[4], w0
> ins v0.b[6], w0
> ins v0.b[8], w0
> ins v0.b[10], w0
> ins v0.b[12], w0
> ins v0.b[14], w0
> ret
>
> code-gen with patch:
> f:
> dup v0.16b, w0
> adrpx0, .LC0
> ldr q1, [x0, #:lo12:.LC0]
> zip1v0.16b, v0.16b, v1.16b
> ret
>
> Bootstrapped+tested on aarch64-linux-gnu.
> Does it look OK ?

Looks like a nice improvement.  It'll need to wait for GCC 14 now though.

However, rather than handle this case specially, I think we should instead
take a divide-and-conquer approach: split the initialiser into even and
odd elements, find the best way of loading each part, then compare the
cost of these sequences + ZIP with the cost of the fallback code (the code
later in aarch64_expand_vector_init).

For example, doing that would allow:

  { x, y, 0, y, 0, y, 0, y, 0, y }

to be loaded more easily, even though the even elements aren't wholly
constant.

Thanks,
Richard

>
> Thanks,
> Prathamesh
>>
>
>> Thanks,
>> Prathamesh
>> >
>> > >   }
>> > >
>> > > which avoids forcing constant elements into a register before the 
>> > > duplication.
>> > > OK with that change if it works.
>> > >
>> > > Thanks,
>> > > Richard
>> > >
>> > >> +
>> > >> +  rtvec v = gen_rtvec (2, dest[0], dest[1]);
>> > >> +  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
>> > >> +  return;
>> > >> +}
>> > >> +}
>> > >> +
>> > >>enum insn_code icode 

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2022-12-25 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 6 Dec 2022 at 07:01, Prathamesh Kulkarni
 wrote:
>
> On Mon, 5 Dec 2022 at 16:50, Richard Sandiford
>  wrote:
> >
> > Richard Sandiford via Gcc-patches  writes:
> > > Prathamesh Kulkarni  writes:
> > >> Hi,
> > >> For the following test-case:
> > >>
> > >> int16x8_t foo(int16_t x, int16_t y)
> > >> {
> > >>   return (int16x8_t) { x, y, x, y, x, y, x, y };
> > >> }
> > >>
> > >> Code gen at -O3:
> > >> foo:
> > >> dupv0.8h, w0
> > >> ins v0.h[1], w1
> > >> ins v0.h[3], w1
> > >> ins v0.h[5], w1
> > >> ins v0.h[7], w1
> > >> ret
> > >>
> > >> For 16 elements, it results in 8 ins instructions which might not be
> > >> optimal perhaps.
> > >> I guess, the above code-gen would be equivalent to the following ?
> > >> dup v0.8h, w0
> > >> dup v1.8h, w1
> > >> zip1 v0.8h, v0.8h, v1.8h
> > >>
> > >> I have attached patch to do the same, if number of elements >= 8,
> > >> which should be possibly better compared to current code-gen ?
> > >> Patch passes bootstrap+test on aarch64-linux-gnu.
> > >> Does the patch look OK ?
> > >>
> > >> Thanks,
> > >> Prathamesh
> > >>
> > >> diff --git a/gcc/config/aarch64/aarch64.cc 
> > >> b/gcc/config/aarch64/aarch64.cc
> > >> index c91df6f5006..e5dea70e363 100644
> > >> --- a/gcc/config/aarch64/aarch64.cc
> > >> +++ b/gcc/config/aarch64/aarch64.cc
> > >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx 
> > >> vals)
> > >>return;
> > >>  }
> > >>
> > >> +  /* Check for interleaving case.
> > >> + For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> > >> + Generate following code:
> > >> + dup v0.h, x
> > >> + dup v1.h, y
> > >> + zip1 v0.h, v0.h, v1.h
> > >> + for "large enough" initializer.  */
> > >> +
> > >> +  if (n_elts >= 8)
> > >> +{
> > >> +  int i;
> > >> +  for (i = 2; i < n_elts; i++)
> > >> +if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> > >> +  break;
> > >> +
> > >> +  if (i == n_elts)
> > >> +{
> > >> +  machine_mode mode = GET_MODE (target);
> > >> +  rtx dest[2];
> > >> +
> > >> +  for (int i = 0; i < 2; i++)
> > >> +{
> > >> +  rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP 
> > >> (vals, 0, i));
> > >
> > > Formatting nit: long line.
> > >
> > >> +  dest[i] = gen_reg_rtx (mode);
> > >> +  aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
> > >> +}
> > >
> > > This could probably be written:
> > >
> > > for (int i = 0; i < 2; i++)
> > >   {
> > > rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> > > dest[i] = force_reg (GET_MODE_INNER (mode), x);
> >
> > Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.
> Thanks, I have pushed the change in
> 769370f3e2e04823c8a621d8ffa756dd83ebf21e after running
> bootstrap+test on aarch64-linux-gnu.
Hi Richard,
I have attached a patch that extends the transform if one half is dup
and other is set of constants.
For eg:
int8x16_t f(int8_t x)
{
  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
}

code-gen trunk:
f:
adrpx1, .LC0
ldr q0, [x1, #:lo12:.LC0]
ins v0.b[0], w0
ins v0.b[2], w0
ins v0.b[4], w0
ins v0.b[6], w0
ins v0.b[8], w0
ins v0.b[10], w0
ins v0.b[12], w0
ins v0.b[14], w0
ret

code-gen with patch:
f:
dup v0.16b, w0
adrpx0, .LC0
ldr q1, [x0, #:lo12:.LC0]
zip1v0.16b, v0.16b, v1.16b
ret

Bootstrapped+tested on aarch64-linux-gnu.
Does it look OK ?

Thanks,
Prathamesh
>

> Thanks,
> Prathamesh
> >
> > >   }
> > >
> > > which avoids forcing constant elements into a register before the 
> > > duplication.
> > > OK with that change if it works.
> > >
> > > Thanks,
> > > Richard
> > >
> > >> +
> > >> +  rtvec v = gen_rtvec (2, dest[0], dest[1]);
> > >> +  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> > >> +  return;
> > >> +}
> > >> +}
> > >> +
> > >>enum insn_code icode = optab_handler (vec_set_optab, mode);
> > >>gcc_assert (icode != CODE_FOR_nothing);
> > >>
> > >> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c 
> > >> b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> > >> new file mode 100644
> > >> index 000..ee775048589
> > >> --- /dev/null
> > >> +++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> > >> @@ -0,0 +1,37 @@
> > >> +/* { dg-do compile } */
> > >> +/* { dg-options "-O3" } */
> > >> +/* { dg-final { check-function-bodies "**" "" "" } } */
> > >> +
> > >> +#include 
> > >> +
> > >> +/*
> > >> +** foo:
> > >> +**  ...
> > >> +**  dup v[0-9]+\.8h, w[0-9]+
> > >> +**  dup v[0-9]+\.8h, w[0-9]+
> > >> +**  zip1v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> > >> +**  ...
> > >> +**  ret
> 

Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2022-12-05 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 5 Dec 2022 at 16:50, Richard Sandiford
 wrote:
>
> Richard Sandiford via Gcc-patches  writes:
> > Prathamesh Kulkarni  writes:
> >> Hi,
> >> For the following test-case:
> >>
> >> int16x8_t foo(int16_t x, int16_t y)
> >> {
> >>   return (int16x8_t) { x, y, x, y, x, y, x, y };
> >> }
> >>
> >> Code gen at -O3:
> >> foo:
> >> dupv0.8h, w0
> >> ins v0.h[1], w1
> >> ins v0.h[3], w1
> >> ins v0.h[5], w1
> >> ins v0.h[7], w1
> >> ret
> >>
> >> For 16 elements, it results in 8 ins instructions which might not be
> >> optimal perhaps.
> >> I guess, the above code-gen would be equivalent to the following ?
> >> dup v0.8h, w0
> >> dup v1.8h, w1
> >> zip1 v0.8h, v0.8h, v1.8h
> >>
> >> I have attached patch to do the same, if number of elements >= 8,
> >> which should be possibly better compared to current code-gen ?
> >> Patch passes bootstrap+test on aarch64-linux-gnu.
> >> Does the patch look OK ?
> >>
> >> Thanks,
> >> Prathamesh
> >>
> >> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> >> index c91df6f5006..e5dea70e363 100644
> >> --- a/gcc/config/aarch64/aarch64.cc
> >> +++ b/gcc/config/aarch64/aarch64.cc
> >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >>return;
> >>  }
> >>
> >> +  /* Check for interleaving case.
> >> + For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> >> + Generate following code:
> >> + dup v0.h, x
> >> + dup v1.h, y
> >> + zip1 v0.h, v0.h, v1.h
> >> + for "large enough" initializer.  */
> >> +
> >> +  if (n_elts >= 8)
> >> +{
> >> +  int i;
> >> +  for (i = 2; i < n_elts; i++)
> >> +if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> >> +  break;
> >> +
> >> +  if (i == n_elts)
> >> +{
> >> +  machine_mode mode = GET_MODE (target);
> >> +  rtx dest[2];
> >> +
> >> +  for (int i = 0; i < 2; i++)
> >> +{
> >> +  rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 
> >> 0, i));
> >
> > Formatting nit: long line.
> >
> >> +  dest[i] = gen_reg_rtx (mode);
> >> +  aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
> >> +}
> >
> > This could probably be written:
> >
> > for (int i = 0; i < 2; i++)
> >   {
> > rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> > dest[i] = force_reg (GET_MODE_INNER (mode), x);
>
> Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.
Thanks, I have pushed the change in
769370f3e2e04823c8a621d8ffa756dd83ebf21e after running
bootstrap+test on aarch64-linux-gnu.

Thanks,
Prathamesh
>
> >   }
> >
> > which avoids forcing constant elements into a register before the 
> > duplication.
> > OK with that change if it works.
> >
> > Thanks,
> > Richard
> >
> >> +
> >> +  rtvec v = gen_rtvec (2, dest[0], dest[1]);
> >> +  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> >> +  return;
> >> +}
> >> +}
> >> +
> >>enum insn_code icode = optab_handler (vec_set_optab, mode);
> >>gcc_assert (icode != CODE_FOR_nothing);
> >>
> >> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c 
> >> b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> >> new file mode 100644
> >> index 000..ee775048589
> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> >> @@ -0,0 +1,37 @@
> >> +/* { dg-do compile } */
> >> +/* { dg-options "-O3" } */
> >> +/* { dg-final { check-function-bodies "**" "" "" } } */
> >> +
> >> +#include <arm_neon.h>
> >> +
> >> +/*
> >> +** foo:
> >> +**  ...
> >> +**  dup v[0-9]+\.8h, w[0-9]+
> >> +**  dup v[0-9]+\.8h, w[0-9]+
> >> +**  zip1v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> >> +**  ...
> >> +**  ret
> >> +*/
> >> +
> >> +int16x8_t foo(int16_t x, int y)
> >> +{
> >> +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y};
> >> +  return v;
> >> +}
> >> +
> >> +/*
> >> +** foo2:
> >> +**  ...
> >> +**  dup v[0-9]+\.8h, w[0-9]+
> >> +**  moviv[0-9]+\.8h, 0x1
> >> +**  zip1v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> >> +**  ...
> >> +**  ret
> >> +*/
> >> +
> >> +int16x8_t foo2(int16_t x)
> >> +{
> >> +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
> >> +  return v;
> >> +}


Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2022-12-05 Thread Richard Sandiford via Gcc-patches
Richard Sandiford via Gcc-patches  writes:
> Prathamesh Kulkarni  writes:
>> Hi,
>> For the following test-case:
>>
>> int16x8_t foo(int16_t x, int16_t y)
>> {
>>   return (int16x8_t) { x, y, x, y, x, y, x, y };
>> }
>>
>> Code gen at -O3:
>> foo:
>> dup v0.8h, w0
>> ins v0.h[1], w1
>> ins v0.h[3], w1
>> ins v0.h[5], w1
>> ins v0.h[7], w1
>> ret
>>
>> For 16 elements, it results in 8 ins instructions which might not be
>> optimal perhaps.
>> I guess, the above code-gen would be equivalent to the following ?
>> dup v0.8h, w0
>> dup v1.8h, w1
>> zip1 v0.8h, v0.8h, v1.8h
>>
>> I have attached patch to do the same, if number of elements >= 8,
>> which should be possibly better compared to current code-gen ?
>> Patch passes bootstrap+test on aarch64-linux-gnu.
>> Does the patch look OK ?
>>
>> Thanks,
>> Prathamesh
>>
>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> index c91df6f5006..e5dea70e363 100644
>> --- a/gcc/config/aarch64/aarch64.cc
>> +++ b/gcc/config/aarch64/aarch64.cc
>> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>>return;
>>  }
>>  
>> +  /* Check for interleaving case.
>> + For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
>> + Generate following code:
>> + dup v0.h, x
>> + dup v1.h, y
>> + zip1 v0.h, v0.h, v1.h
>> + for "large enough" initializer.  */
>> +
>> +  if (n_elts >= 8)
>> +{
>> +  int i;
>> +  for (i = 2; i < n_elts; i++)
>> +if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
>> +  break;
>> +
>> +  if (i == n_elts)
>> +{
>> +  machine_mode mode = GET_MODE (target);
>> +  rtx dest[2];
>> +
>> +  for (int i = 0; i < 2; i++)
>> +{
>> +  rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 
>> 0, i));
>
> Formatting nit: long line.
>
>> +  dest[i] = gen_reg_rtx (mode);
>> +  aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
>> +}
>
> This could probably be written:
>
> for (int i = 0; i < 2; i++)
>   {
> rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> dest[i] = force_reg (GET_MODE_INNER (mode), x);

Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.

>   }
>
> which avoids forcing constant elements into a register before the duplication.
> OK with that change if it works.
>
> Thanks,
> Richard
>
>> +
>> +  rtvec v = gen_rtvec (2, dest[0], dest[1]);
>> +  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
>> +  return;
>> +}
>> +}
>> +
>>enum insn_code icode = optab_handler (vec_set_optab, mode);
>>gcc_assert (icode != CODE_FOR_nothing);
>>  
>> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c 
>> b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
>> new file mode 100644
>> index 000..ee775048589
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
>> @@ -0,0 +1,37 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O3" } */
>> +/* { dg-final { check-function-bodies "**" "" "" } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +** foo:
>> +**  ...
>> +**  dup v[0-9]+\.8h, w[0-9]+
>> +**  dup v[0-9]+\.8h, w[0-9]+
>> +**  zip1v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
>> +**  ...
>> +**  ret
>> +*/
>> +
>> +int16x8_t foo(int16_t x, int y)
>> +{
>> +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y}; 
>> +  return v;
>> +}
>> +
>> +/*
>> +** foo2:
>> +**  ...
>> +**  dup v[0-9]+\.8h, w[0-9]+
>> +**  moviv[0-9]+\.8h, 0x1
>> +**  zip1v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
>> +**  ...
>> +**  ret
>> +*/
>> +
>> +int16x8_t foo2(int16_t x) 
>> +{
>> +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1}; 
>> +  return v;
>> +}


Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2022-12-05 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> Hi,
> For the following test-case:
>
> int16x8_t foo(int16_t x, int16_t y)
> {
>   return (int16x8_t) { x, y, x, y, x, y, x, y };
> }
>
> Code gen at -O3:
> foo:
> dupv0.8h, w0
> ins v0.h[1], w1
> ins v0.h[3], w1
> ins v0.h[5], w1
> ins v0.h[7], w1
> ret
>
> For 16 elements, it results in 8 ins instructions which might not be
> optimal perhaps.
> I guess, the above code-gen would be equivalent to the following ?
> dup v0.8h, w0
> dup v1.8h, w1
> zip1 v0.8h, v0.8h, v1.8h
>
> I have attached patch to do the same, if number of elements >= 8,
> which should be possibly better compared to current code-gen ?
> Patch passes bootstrap+test on aarch64-linux-gnu.
> Does the patch look OK ?
>
> Thanks,
> Prathamesh
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index c91df6f5006..e5dea70e363 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>return;
>  }
>  
> +  /* Check for interleaving case.
> + For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> + Generate following code:
> + dup v0.h, x
> + dup v1.h, y
> + zip1 v0.h, v0.h, v1.h
> + for "large enough" initializer.  */
> +
> +  if (n_elts >= 8)
> +{
> +  int i;
> +  for (i = 2; i < n_elts; i++)
> + if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> +   break;
> +
> +  if (i == n_elts)
> + {
> +   machine_mode mode = GET_MODE (target);
> +   rtx dest[2];
> +
> +   for (int i = 0; i < 2; i++)
> + {
> +   rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 
> 0, i));

Formatting nit: long line.

> +   dest[i] = gen_reg_rtx (mode);
> +   aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
> + }

This could probably be written:

  for (int i = 0; i < 2; i++)
{
  rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
  dest[i] = force_reg (GET_MODE_INNER (mode), x);
}

which avoids forcing constant elements into a register before the duplication.

OK with that change if it works.

Thanks,
Richard

> +
> +   rtvec v = gen_rtvec (2, dest[0], dest[1]);
> +   emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> +   return;
> + }
> +}
> +
>enum insn_code icode = optab_handler (vec_set_optab, mode);
>gcc_assert (icode != CODE_FOR_nothing);
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c 
> b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> new file mode 100644
> index 000..ee775048589
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> @@ -0,0 +1,37 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foo:
> +**   ...
> +**   dup v[0-9]+\.8h, w[0-9]+
> +**   dup v[0-9]+\.8h, w[0-9]+
> +**   zip1v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> +**   ...
> +**   ret
> +*/
> +
> +int16x8_t foo(int16_t x, int y)
> +{
> +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y}; 
> +  return v;
> +}
> +
> +/*
> +** foo2:
> +**   ...
> +**   dup v[0-9]+\.8h, w[0-9]+
> +**   moviv[0-9]+\.8h, 0x1
> +**   zip1v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> +**   ...
> +**   ret
> +*/
> +
> +int16x8_t foo2(int16_t x) 
> +{
> +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1}; 
> +  return v;
> +}


Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2022-11-29 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 29 Nov 2022 at 20:43, Andrew Pinski  wrote:
>
> On Tue, Nov 29, 2022 at 6:40 AM Prathamesh Kulkarni via Gcc-patches
>  wrote:
> >
> > Hi,
> > For the following test-case:
> >
> > int16x8_t foo(int16_t x, int16_t y)
> > {
> >   return (int16x8_t) { x, y, x, y, x, y, x, y };
> > }
>
> (Not to block this patch)
> Seems like this trick can be done even with less than perfect initializer too:
> e.g.
> int16x8_t foo(int16_t x, int16_t y)
> {
>   return (int16x8_t) { x, y, x, y, x, y, x, 0 };
> }
>
> Which should generate something like:
> dup v0.8h, w0
> dup v1.8h, w1
> zip1 v0.8h, v0.8h, v1.8h
> ins v0.h[7], wzr
Hi Andrew,
Nice catch, thanks for the suggestions!
More generally, code-gen with constants involved seems to be sub-optimal.
For example:
int16x8_t foo(int16_t x)
{
  return (int16x8_t) { x, x, x, x, x, x, x, 1 };
}

results in:
foo:
movi v0.8h, 0x1
ins v0.h[0], w0
ins v0.h[1], w0
ins v0.h[2], w0
ins v0.h[3], w0
ins v0.h[4], w0
ins v0.h[5], w0
ins v0.h[6], w0
ret

which I suppose could instead be the following ?
foo:
dup v0.8h, w0
mov w1, 0x1
ins   v0.h[7], w1
ret

I will try to address this in follow up patch.

Thanks,
Prathamesh

>
> Thanks,
> Andrew Pinski
>
>
> >
> > Code gen at -O3:
> > foo:
> > dup v0.8h, w0
> > ins v0.h[1], w1
> > ins v0.h[3], w1
> > ins v0.h[5], w1
> > ins v0.h[7], w1
> > ret
> >
> > For 16 elements, it results in 8 ins instructions which might not be
> > optimal perhaps.
> > I guess, the above code-gen would be equivalent to the following ?
> > dup v0.8h, w0
> > dup v1.8h, w1
> > zip1 v0.8h, v0.8h, v1.8h
> >
> > I have attached patch to do the same, if number of elements >= 8,
> > which should be possibly better compared to current code-gen ?
> > Patch passes bootstrap+test on aarch64-linux-gnu.
> > Does the patch look OK ?
> >
> > Thanks,
> > Prathamesh


Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector

2022-11-29 Thread Andrew Pinski via Gcc-patches
On Tue, Nov 29, 2022 at 6:40 AM Prathamesh Kulkarni via Gcc-patches
 wrote:
>
> Hi,
> For the following test-case:
>
> int16x8_t foo(int16_t x, int16_t y)
> {
>   return (int16x8_t) { x, y, x, y, x, y, x, y };
> }

(Not to block this patch)
Seems like this trick can be done even with less than perfect initializer too:
e.g.
int16x8_t foo(int16_t x, int16_t y)
{
  return (int16x8_t) { x, y, x, y, x, y, x, 0 };
}

Which should generate something like:
dup v0.8h, w0
dup v1.8h, w1
zip1 v0.8h, v0.8h, v1.8h
ins v0.h[7], wzr

Thanks,
Andrew Pinski


>
> Code gen at -O3:
> foo:
> dup v0.8h, w0
> ins v0.h[1], w1
> ins v0.h[3], w1
> ins v0.h[5], w1
> ins v0.h[7], w1
> ret
>
> For 16 elements, it results in 8 ins instructions which might not be
> optimal perhaps.
> I guess, the above code-gen would be equivalent to the following ?
> dup v0.8h, w0
> dup v1.8h, w1
> zip1 v0.8h, v0.8h, v1.8h
>
> I have attached patch to do the same, if number of elements >= 8,
> which should be possibly better compared to current code-gen ?
> Patch passes bootstrap+test on aarch64-linux-gnu.
> Does the patch look OK ?
>
> Thanks,
> Prathamesh


[aarch64] Use dup and zip1 for interleaving elements in initializing vector

2022-11-29 Thread Prathamesh Kulkarni via Gcc-patches
Hi,
For the following test-case:

int16x8_t foo(int16_t x, int16_t y)
{
  return (int16x8_t) { x, y, x, y, x, y, x, y };
}

Code gen at -O3:
foo:
dup v0.8h, w0
ins v0.h[1], w1
ins v0.h[3], w1
ins v0.h[5], w1
ins v0.h[7], w1
ret

For 16 elements, it results in 8 ins instructions which might not be
optimal perhaps.
I guess, the above code-gen would be equivalent to the following ?
dup v0.8h, w0
dup v1.8h, w1
zip1 v0.8h, v0.8h, v1.8h

I have attached patch to do the same, if number of elements >= 8,
which should be possibly better compared to current code-gen ?
Patch passes bootstrap+test on aarch64-linux-gnu.
Does the patch look OK ?

Thanks,
Prathamesh
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index c91df6f5006..e5dea70e363 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   return;
 }
 
+  /* Check for interleaving case.
+ For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
+ Generate following code:
+ dup v0.h, x
+ dup v1.h, y
+ zip1 v0.h, v0.h, v1.h
+ for "large enough" initializer.  */
+
+  if (n_elts >= 8)
+{
+  int i;
+  for (i = 2; i < n_elts; i++)
+   if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
+ break;
+
+  if (i == n_elts)
+   {
+ machine_mode mode = GET_MODE (target);
+ rtx dest[2];
+
+ for (int i = 0; i < 2; i++)
+   {
+ rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 
0, i));
+ dest[i] = gen_reg_rtx (mode);
+ aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
+   }
+
+ rtvec v = gen_rtvec (2, dest[0], dest[1]);
+ emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+ return;
+   }
+}
+
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c 
b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
new file mode 100644
index 000..ee775048589
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** foo:
+** ...
+** dup v[0-9]+\.8h, w[0-9]+
+** dup v[0-9]+\.8h, w[0-9]+
+** zip1v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+** ...
+** ret
+*/
+
+int16x8_t foo(int16_t x, int y)
+{
+  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y}; 
+  return v;
+}
+
+/*
+** foo2:
+** ...
+** dup v[0-9]+\.8h, w[0-9]+
+** moviv[0-9]+\.8h, 0x1
+** zip1v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+** ...
+** ret
+*/
+
+int16x8_t foo2(int16_t x) 
+{
+  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1}; 
+  return v;
+}