Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Fri, 12 May 2023 at 00:37, Richard Sandiford wrote:
>
> Prathamesh Kulkarni writes:
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> > new file mode 100644
> > index 000..598a51f17c6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> > @@ -0,0 +1,20 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +
> > +#include <arm_neon.h>
> > +
> > +int16x8_t foo(int16_t x, int y)
> > +{
> > +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y};
> > +  return v;
> > +}
> > +
> > +int16x8_t foo2(int16_t x)
> > +{
> > +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
> > +  return v;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4h, w[0-9]+} 3 } } */
> > +/* { dg-final { scan-assembler {\tmovi\tv[0-9]+\.4h, 0x1} } } */
> > +/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h} } } */
>
> Would be good to make this a scan-assembler-times ... 2.
>
> OK with that change.  Thanks for doing this.

Thanks, committed the patch in:
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=8b18714fbb1ca9812b33b3de75fe6ba4a57d4946
after bootstrap+test on aarch64-linux-gnu, and verifying bootstrap passes on
aarch64-linux-gnu with --enable-checking=all.

Thanks,
Prathamesh
>
> Richard
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
Prathamesh Kulkarni writes:
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> new file mode 100644
> index 000..598a51f17c6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +
> +#include <arm_neon.h>
> +
> +int16x8_t foo(int16_t x, int y)
> +{
> +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y};
> +  return v;
> +}
> +
> +int16x8_t foo2(int16_t x)
> +{
> +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
> +  return v;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4h, w[0-9]+} 3 } } */
> +/* { dg-final { scan-assembler {\tmovi\tv[0-9]+\.4h, 0x1} } } */
> +/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h} } } */

Would be good to make this a scan-assembler-times ... 2.

OK with that change.  Thanks for doing this.

Richard
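For reference, the code these scans anticipate is, schematically (a sketch assembled from the dup/dup/zip1 scheme discussed in this thread, not output captured from a build):

  foo:
          dup     v0.4h, w0          // broadcast x into the even half
          dup     v1.4h, w1          // broadcast y into the odd half
          zip1    v0.8h, v0.8h, v1.8h
          ret

  foo2:
          dup     v0.4h, w0
          movi    v1.4h, 0x1         // constant odd half
          zip1    v0.8h, v0.8h, v1.8h
          ret

That accounts for three dups and one movi in total, and one zip1 per function, which is why counting the zip1s with scan-assembler-times ... 2 is the natural tightening.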
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Mon, 24 Apr 2023 at 15:00, Richard Sandiford wrote: > > Prathamesh Kulkarni writes: > > [aarch64] Recursively intialize even and odd sub-parts and merge with zip1. > > > > gcc/ChangeLog: > > * config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback): > > Rename > > aarch64_expand_vector_init to this, and remove interleaving case. > > Recursively call aarch64_expand_vector_init_fallback, instead of > > aarch64_expand_vector_init. > > (aarch64_unzip_vector_init): New function. > > (aarch64_expand_vector_init): Likewise. > > > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/ldp_stp_16.c (cons2_8_float): Adjust for new > > code-gen. > > * gcc.target/aarch64/sve/acle/general/dupq_5.c: Likewise. > > * gcc.target/aarch64/sve/acle/general/dupq_6.c: Likewise. > > * gcc.target/aarch64/vec-init-18.c: Rename interleave-init-1.c to > > this. > > * gcc.target/aarch64/vec-init-19.c: New test. > > * gcc.target/aarch64/vec-init-20.c: Likewise. > > * gcc.target/aarch64/vec-init-21.c: Likewise. > > * gcc.target/aarch64/vec-init-22-size.c: Likewise. > > * gcc.target/aarch64/vec-init-22-speed.c: Likewise. > > * gcc.target/aarch64/vec-init-22.h: New header. > > > > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > > index d7e895f8d34..416e062829c 100644 > > --- a/gcc/config/aarch64/aarch64.cc > > +++ b/gcc/config/aarch64/aarch64.cc > > @@ -22026,11 +22026,12 @@ aarch64_simd_make_constant (rtx vals) > > return NULL_RTX; > > } > > > > -/* Expand a vector initialisation sequence, such that TARGET is > > - initialised to contain VALS. */ > > +/* A subroutine of aarch64_expand_vector_init, with the same interface. > > + The caller has already tried a divide-and-conquer approach, so do > > + not consider that case here. */ > > > > void > > -aarch64_expand_vector_init (rtx target, rtx vals) > > +aarch64_expand_vector_init_fallback (rtx target, rtx vals) > > { > >machine_mode mode = GET_MODE (target); > >scalar_mode inner_mode = GET_MODE_INNER (mode); > > @@ -22090,38 +22091,6 @@ aarch64_expand_vector_init (rtx target, rtx vals) > >return; > > } > > > > - /* Check for interleaving case. > > - For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}. > > - Generate following code: > > - dup v0.h, x > > - dup v1.h, y > > - zip1 v0.h, v0.h, v1.h > > - for "large enough" initializer. */ > > - > > - if (n_elts >= 8) > > -{ > > - int i; > > - for (i = 2; i < n_elts; i++) > > - if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2))) > > - break; > > - > > - if (i == n_elts) > > - { > > - machine_mode mode = GET_MODE (target); > > - rtx dest[2]; > > - > > - for (int i = 0; i < 2; i++) > > - { > > - rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i)); > > - dest[i] = force_reg (mode, x); > > - } > > - > > - rtvec v = gen_rtvec (2, dest[0], dest[1]); > > - emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1)); > > - return; > > - } > > -} > > - > >enum insn_code icode = optab_handler (vec_set_optab, mode); > >gcc_assert (icode != CODE_FOR_nothing); > > > > @@ -22243,7 +22212,7 @@ aarch64_expand_vector_init (rtx target, rtx vals) > > } > > XVECEXP (copy, 0, i) = subst; > > } > > - aarch64_expand_vector_init (target, copy); > > + aarch64_expand_vector_init_fallback (target, copy); > > } > > > >/* Insert the variable lanes directly. */ > > @@ -22257,6 +6,81 @@ aarch64_expand_vector_init (rtx target, rtx vals) > > } > > } > > > > +/* Return even or odd half of VALS depending on EVEN_P. 
*/ > > + > > +static rtx > > +aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p) > > +{ > > + int n = XVECLEN (vals, 0); > > + machine_mode new_mode > > += aarch64_simd_container_mode (GET_MODE_INNER (mode), > > +GET_MODE_BITSIZE (mode).to_constant () / > > 2); > > + rtvec vec = rtvec_alloc (n / 2); > > + for (int i = 0; i < n/2; i++) > > Formatting nit: n / 2 > > > +RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i) > > + : XVECEXP (vals, 0, 2 * i + 1); > > + return gen_rtx_PARALLEL (new_mode, vec); > > +} > > + > > +/* Expand a vector initialisation sequence, such that TARGET is > > initialization > > > + initialized to contain VALS. */ > > + > > +void > > +aarch64_expand_vector_init (rtx target, rtx vals) > > +{ > > + /* Try decomposing the initializer into even and odd halves and > > + then ZIP them together. Use the resulting sequence if it is > > + strictly cheaper than loading VALS directly. > > + > > + Prefer the fallback sequence in the event of a tie, since it > > + will tend to use fewer registers. */ > > + > > + machi
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
Prathamesh Kulkarni writes:
> [aarch64] Recursively intialize even and odd sub-parts and merge with zip1.
>
> gcc/ChangeLog:
>         * config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback):
>         Rename
>         aarch64_expand_vector_init to this, and remove interleaving case.
>         Recursively call aarch64_expand_vector_init_fallback, instead of
>         aarch64_expand_vector_init.
>         (aarch64_unzip_vector_init): New function.
>         (aarch64_expand_vector_init): Likewise.
>
> gcc/testsuite/ChangeLog:
>         * gcc.target/aarch64/ldp_stp_16.c (cons2_8_float): Adjust for new
>         code-gen.
>         * gcc.target/aarch64/sve/acle/general/dupq_5.c: Likewise.
>         * gcc.target/aarch64/sve/acle/general/dupq_6.c: Likewise.
>         * gcc.target/aarch64/vec-init-18.c: Rename interleave-init-1.c to
>         this.
>         * gcc.target/aarch64/vec-init-19.c: New test.
>         * gcc.target/aarch64/vec-init-20.c: Likewise.
>         * gcc.target/aarch64/vec-init-21.c: Likewise.
>         * gcc.target/aarch64/vec-init-22-size.c: Likewise.
>         * gcc.target/aarch64/vec-init-22-speed.c: Likewise.
>         * gcc.target/aarch64/vec-init-22.h: New header.
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index d7e895f8d34..416e062829c 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -22026,11 +22026,12 @@ aarch64_simd_make_constant (rtx vals)
>    return NULL_RTX;
>  }
>
> -/* Expand a vector initialisation sequence, such that TARGET is
> -   initialised to contain VALS.  */
> +/* A subroutine of aarch64_expand_vector_init, with the same interface.
> +   The caller has already tried a divide-and-conquer approach, so do
> +   not consider that case here.  */
>
>  void
> -aarch64_expand_vector_init (rtx target, rtx vals)
> +aarch64_expand_vector_init_fallback (rtx target, rtx vals)
>  {
>    machine_mode mode = GET_MODE (target);
>    scalar_mode inner_mode = GET_MODE_INNER (mode);
> @@ -22090,38 +22091,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>        return;
>      }
>
> -  /* Check for interleaving case.
> -     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> -     Generate following code:
> -     dup v0.h, x
> -     dup v1.h, y
> -     zip1 v0.h, v0.h, v1.h
> -     for "large enough" initializer.  */
> -
> -  if (n_elts >= 8)
> -    {
> -      int i;
> -      for (i = 2; i < n_elts; i++)
> -        if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> -          break;
> -
> -      if (i == n_elts)
> -        {
> -          machine_mode mode = GET_MODE (target);
> -          rtx dest[2];
> -
> -          for (int i = 0; i < 2; i++)
> -            {
> -              rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> -              dest[i] = force_reg (mode, x);
> -            }
> -
> -          rtvec v = gen_rtvec (2, dest[0], dest[1]);
> -          emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> -          return;
> -        }
> -    }
> -
>    enum insn_code icode = optab_handler (vec_set_optab, mode);
>    gcc_assert (icode != CODE_FOR_nothing);
>
> @@ -22243,7 +22212,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>            }
>          XVECEXP (copy, 0, i) = subst;
>        }
> -      aarch64_expand_vector_init (target, copy);
> +      aarch64_expand_vector_init_fallback (target, copy);
>      }
>
>    /* Insert the variable lanes directly.  */
> @@ -22257,6 +22226,81 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>      }
>  }
>
> +/* Return even or odd half of VALS depending on EVEN_P.  */
> +
> +static rtx
> +aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
> +{
> +  int n = XVECLEN (vals, 0);
> +  machine_mode new_mode
> +    = aarch64_simd_container_mode (GET_MODE_INNER (mode),
> +                                   GET_MODE_BITSIZE (mode).to_constant () / 2);
> +  rtvec vec = rtvec_alloc (n / 2);
> +  for (int i = 0; i < n/2; i++)

Formatting nit: n / 2

> +    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
> +                                  : XVECEXP (vals, 0, 2 * i + 1);
> +  return gen_rtx_PARALLEL (new_mode, vec);
> +}
> +
> +/* Expand a vector initialisation sequence, such that TARGET is

initialization

> +   initialized to contain VALS.  */
> +
> +void
> +aarch64_expand_vector_init (rtx target, rtx vals)
> +{
> +  /* Try decomposing the initializer into even and odd halves and
> +     then ZIP them together.  Use the resulting sequence if it is
> +     strictly cheaper than loading VALS directly.
> +
> +     Prefer the fallback sequence in the event of a tie, since it
> +     will tend to use fewer registers.  */
> +
> +  machine_mode mode = GET_MODE (target);
> +  int n_elts = XVECLEN (vals, 0);
> +
> +  if (n_elts < 4
> +      || maybe_ne (GET_MODE_BITSIZE (mode), 128))
> +    {
> +      aarch64_expand_vector_init_fallback (target, vals);
> +      return;
> +    }
> +
> +  start_sequence ();
> +  rtx halves[2];
> +  unsigned costs[2];
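As a rough scalar model of the strategy being reviewed here (hypothetical C for illustration only; the real functions build RTL and compare sequence costs), aarch64_unzip_vector_init extracts every second element, each half is then expanded on its own, and a final zip1 re-interleaves them:

  #include <stdbool.h>

  /* Illustrative model, not GCC source: pick the even- or odd-indexed
     elements of VALS, mirroring aarch64_unzip_vector_init.  */
  static void
  unzip_model (const int *vals, int n, int *half, bool even_p)
  {
    for (int i = 0; i < n / 2; i++)
      half[i] = even_p ? vals[2 * i] : vals[2 * i + 1];
  }

  /* Illustrative model of the zip1 step that interleaves the two
     independently expanded halves back together.  */
  static void
  zip1_model (const int *even, const int *odd, int n, int *target)
  {
    for (int i = 0; i < n / 2; i++)
      {
        target[2 * i] = even[i];
        target[2 * i + 1] = odd[i];
      }
  }

For {x, y, x, y, x, y, x, y} the even half is {x, x, x, x} and the odd half is {y, y, y, y}; each collapses to a single dup, and the zip1 reproduces the interleaved vector, recovering the removed special case's dup/dup/zip1 sequence while also handling less regular initializers recursively.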
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Fri, 21 Apr 2023 at 20:45, Prathamesh Kulkarni wrote: > > On Fri, 21 Apr 2023 at 14:47, Richard Sandiford > wrote: > > > > Prathamesh Kulkarni writes: > > > Hi, > > > I tested the interleave+zip1 for vector init patch and it segfaulted > > > during bootstrap while trying to build > > > libgfortran/generated/matmul_i2.c. > > > Rebuilding with --enable-checking=rtl showed out of bounds access in > > > aarch64_unzip_vector_init in following hunk: > > > > > > + rtvec vec = rtvec_alloc (n / 2); > > > + for (int i = 0; i < n; i++) > > > +RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i) > > > + : XVECEXP (vals, 0, 2 * i + 1); > > > > > > which is incorrect since it allocates n/2 but iterates and stores upto n. > > > The attached patch fixes the issue, which passed bootstrap, however > > > resulted in following fallout during testsuite run: > > > > > > 1] sve/acle/general/dupq_[1-4].c tests fail. > > > For the following test: > > > int32x4_t f(int32_t x) > > > { > > > return (int32x4_t) { x, 1, 2, 3 }; > > > } > > > > > > Code-gen without patch: > > > f: > > > adrpx1, .LC0 > > > ldr q0, [x1, #:lo12:.LC0] > > > ins v0.s[0], w0 > > > ret > > > > > > Code-gen with patch: > > > f: > > > moviv0.2s, 0x2 > > > adrpx1, .LC0 > > > ldr d1, [x1, #:lo12:.LC0] > > > ins v0.s[0], w0 > > > zip1v0.4s, v0.4s, v1.4s > > > ret > > > > > > It shows, fallback_seq_cost = 20, seq_total_cost = 16 > > > where seq_total_cost determines the cost for interleave+zip1 sequence > > > and fallback_seq_cost is the cost for fallback sequence. > > > Altho it shows lesser cost, I am not sure if the interleave+zip1 > > > sequence is better in this case ? > > > > Debugging the patch, it looks like this is because the fallback sequence > > contains a redundant pseudo-to-pseudo move, which is costed as 1 > > instruction (4 units). The RTL equivalent of the: > > > > moviv0.2s, 0x2 > > ins v0.s[0], w0 > > > > has a similar redundant move, but the cost of that move is subsumed by > > the cost of the other arm (the load from LC0), which is costed as 3 > > instructions (12 units). So we have 12 + 4 for the parallel version > > (correct) but 12 + 4 + 4 for the serial version (one instruction too > > many). > > > > The reason we have redundant moves is that the expansion code uses > > copy_to_mode_reg to force a value into a register. This creates a > > new pseudo even if the original value was already a register. > > Using force_reg removes the moves and makes the test pass. > > > > So I think the first step is to use force_reg instead of > > copy_to_mode_reg in aarch64_simd_dup_constant and > > aarch64_expand_vector_init (as a preparatory patch). > Thanks for the clarification! > > > > > 2] sve/acle/general/dupq_[5-6].c tests fail: > > > int32x4_t f(int32_t x0, int32_t x1, int32_t x2, int32_t x3) > > > { > > > return (int32x4_t) { x0, x1, x2, x3 }; > > > } > > > > > > code-gen without patch: > > > f: > > > fmovs0, w0 > > > ins v0.s[1], w1 > > > ins v0.s[2], w2 > > > ins v0.s[3], w3 > > > ret > > > > > > code-gen with patch: > > > f: > > > fmovs0, w0 > > > fmovs1, w1 > > > ins v0.s[1], w2 > > > ins v1.s[1], w3 > > > zip1v0.4s, v0.4s, v1.4s > > > ret > > > > > > It shows fallback_seq_cost = 28, seq_total_cost = 16 > > > > The zip verson still wins after the fix above, but by a lesser amount. > > It seems like a borderline case. > > > > > > > > 3] aarch64/ldp_stp_16.c's cons2_8_float test fails. 
> > > Test case: > > > void cons2_8_float(float *x, float val0, float val1) > > > { > > > #pragma GCC unroll(8) > > > for (int i = 0; i < 8 * 2; i += 2) { > > > x[i + 0] = val0; > > > x[i + 1] = val1; > > > } > > > } > > > > > > which is lowered to: > > > void cons2_8_float (float * x, float val0, float val1) > > > { > > > vector(4) float _86; > > > > > >[local count: 119292720]: > > > _86 = {val0_11(D), val1_13(D), val0_11(D), val1_13(D)}; > > > MEM [(float *)x_10(D)] = _86; > > > MEM [(float *)x_10(D) + 16B] = _86; > > > MEM [(float *)x_10(D) + 32B] = _86; > > > MEM [(float *)x_10(D) + 48B] = _86; > > > return; > > > } > > > > > > code-gen without patch: > > > cons2_8_float: > > > dup v0.4s, v0.s[0] > > > ins v0.s[1], v1.s[0] > > > ins v0.s[3], v1.s[0] > > > stp q0, q0, [x0] > > > stp q0, q0, [x0, 32] > > > ret > > > > > > code-gen with patch: > > > cons2_8_float: > > > dup v1.2s, v1.s[0] > > > dup v0.2s, v0.s[0] > > > zip1v0.4s, v0.4s, v1.4s > > > stp q0, q0, [x0] > > > stp q0, q0, [x0, 32] > > > ret > > > > > > It shows fallback_seq_cost = 28, seq_total_cost = 16 > > > > > > I think the test
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Fri, 21 Apr 2023 at 14:47, Richard Sandiford wrote: > > Prathamesh Kulkarni writes: > > Hi, > > I tested the interleave+zip1 for vector init patch and it segfaulted > > during bootstrap while trying to build > > libgfortran/generated/matmul_i2.c. > > Rebuilding with --enable-checking=rtl showed out of bounds access in > > aarch64_unzip_vector_init in following hunk: > > > > + rtvec vec = rtvec_alloc (n / 2); > > + for (int i = 0; i < n; i++) > > +RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i) > > + : XVECEXP (vals, 0, 2 * i + 1); > > > > which is incorrect since it allocates n/2 but iterates and stores upto n. > > The attached patch fixes the issue, which passed bootstrap, however > > resulted in following fallout during testsuite run: > > > > 1] sve/acle/general/dupq_[1-4].c tests fail. > > For the following test: > > int32x4_t f(int32_t x) > > { > > return (int32x4_t) { x, 1, 2, 3 }; > > } > > > > Code-gen without patch: > > f: > > adrpx1, .LC0 > > ldr q0, [x1, #:lo12:.LC0] > > ins v0.s[0], w0 > > ret > > > > Code-gen with patch: > > f: > > moviv0.2s, 0x2 > > adrpx1, .LC0 > > ldr d1, [x1, #:lo12:.LC0] > > ins v0.s[0], w0 > > zip1v0.4s, v0.4s, v1.4s > > ret > > > > It shows, fallback_seq_cost = 20, seq_total_cost = 16 > > where seq_total_cost determines the cost for interleave+zip1 sequence > > and fallback_seq_cost is the cost for fallback sequence. > > Altho it shows lesser cost, I am not sure if the interleave+zip1 > > sequence is better in this case ? > > Debugging the patch, it looks like this is because the fallback sequence > contains a redundant pseudo-to-pseudo move, which is costed as 1 > instruction (4 units). The RTL equivalent of the: > > moviv0.2s, 0x2 > ins v0.s[0], w0 > > has a similar redundant move, but the cost of that move is subsumed by > the cost of the other arm (the load from LC0), which is costed as 3 > instructions (12 units). So we have 12 + 4 for the parallel version > (correct) but 12 + 4 + 4 for the serial version (one instruction too > many). > > The reason we have redundant moves is that the expansion code uses > copy_to_mode_reg to force a value into a register. This creates a > new pseudo even if the original value was already a register. > Using force_reg removes the moves and makes the test pass. > > So I think the first step is to use force_reg instead of > copy_to_mode_reg in aarch64_simd_dup_constant and > aarch64_expand_vector_init (as a preparatory patch). Thanks for the clarification! > > > 2] sve/acle/general/dupq_[5-6].c tests fail: > > int32x4_t f(int32_t x0, int32_t x1, int32_t x2, int32_t x3) > > { > > return (int32x4_t) { x0, x1, x2, x3 }; > > } > > > > code-gen without patch: > > f: > > fmovs0, w0 > > ins v0.s[1], w1 > > ins v0.s[2], w2 > > ins v0.s[3], w3 > > ret > > > > code-gen with patch: > > f: > > fmovs0, w0 > > fmovs1, w1 > > ins v0.s[1], w2 > > ins v1.s[1], w3 > > zip1v0.4s, v0.4s, v1.4s > > ret > > > > It shows fallback_seq_cost = 28, seq_total_cost = 16 > > The zip verson still wins after the fix above, but by a lesser amount. > It seems like a borderline case. > > > > > 3] aarch64/ldp_stp_16.c's cons2_8_float test fails. 
> > Test case: > > void cons2_8_float(float *x, float val0, float val1) > > { > > #pragma GCC unroll(8) > > for (int i = 0; i < 8 * 2; i += 2) { > > x[i + 0] = val0; > > x[i + 1] = val1; > > } > > } > > > > which is lowered to: > > void cons2_8_float (float * x, float val0, float val1) > > { > > vector(4) float _86; > > > >[local count: 119292720]: > > _86 = {val0_11(D), val1_13(D), val0_11(D), val1_13(D)}; > > MEM [(float *)x_10(D)] = _86; > > MEM [(float *)x_10(D) + 16B] = _86; > > MEM [(float *)x_10(D) + 32B] = _86; > > MEM [(float *)x_10(D) + 48B] = _86; > > return; > > } > > > > code-gen without patch: > > cons2_8_float: > > dup v0.4s, v0.s[0] > > ins v0.s[1], v1.s[0] > > ins v0.s[3], v1.s[0] > > stp q0, q0, [x0] > > stp q0, q0, [x0, 32] > > ret > > > > code-gen with patch: > > cons2_8_float: > > dup v1.2s, v1.s[0] > > dup v0.2s, v0.s[0] > > zip1v0.4s, v0.4s, v1.4s > > stp q0, q0, [x0] > > stp q0, q0, [x0, 32] > > ret > > > > It shows fallback_seq_cost = 28, seq_total_cost = 16 > > > > I think the test fails because it doesn't match: > > ** dup v([0-9]+)\.4s, .* > > > > Shall it be OK to amend the test assuming code-gen with patch is better ? > > Yeah, the new code seems like an improvement. > > > 4] aarch64/pr109072_1.c s32x4_3 test fails: > > For the following test: > > int32x4_t s32x4_3 (int32_t x, int32_t y) > > { > > int32_t arr[] = {
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
Prathamesh Kulkarni writes: > Hi, > I tested the interleave+zip1 for vector init patch and it segfaulted > during bootstrap while trying to build > libgfortran/generated/matmul_i2.c. > Rebuilding with --enable-checking=rtl showed out of bounds access in > aarch64_unzip_vector_init in following hunk: > > + rtvec vec = rtvec_alloc (n / 2); > + for (int i = 0; i < n; i++) > +RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i) > + : XVECEXP (vals, 0, 2 * i + 1); > > which is incorrect since it allocates n/2 but iterates and stores upto n. > The attached patch fixes the issue, which passed bootstrap, however > resulted in following fallout during testsuite run: > > 1] sve/acle/general/dupq_[1-4].c tests fail. > For the following test: > int32x4_t f(int32_t x) > { > return (int32x4_t) { x, 1, 2, 3 }; > } > > Code-gen without patch: > f: > adrpx1, .LC0 > ldr q0, [x1, #:lo12:.LC0] > ins v0.s[0], w0 > ret > > Code-gen with patch: > f: > moviv0.2s, 0x2 > adrpx1, .LC0 > ldr d1, [x1, #:lo12:.LC0] > ins v0.s[0], w0 > zip1v0.4s, v0.4s, v1.4s > ret > > It shows, fallback_seq_cost = 20, seq_total_cost = 16 > where seq_total_cost determines the cost for interleave+zip1 sequence > and fallback_seq_cost is the cost for fallback sequence. > Altho it shows lesser cost, I am not sure if the interleave+zip1 > sequence is better in this case ? Debugging the patch, it looks like this is because the fallback sequence contains a redundant pseudo-to-pseudo move, which is costed as 1 instruction (4 units). The RTL equivalent of the: moviv0.2s, 0x2 ins v0.s[0], w0 has a similar redundant move, but the cost of that move is subsumed by the cost of the other arm (the load from LC0), which is costed as 3 instructions (12 units). So we have 12 + 4 for the parallel version (correct) but 12 + 4 + 4 for the serial version (one instruction too many). The reason we have redundant moves is that the expansion code uses copy_to_mode_reg to force a value into a register. This creates a new pseudo even if the original value was already a register. Using force_reg removes the moves and makes the test pass. So I think the first step is to use force_reg instead of copy_to_mode_reg in aarch64_simd_dup_constant and aarch64_expand_vector_init (as a preparatory patch). > 2] sve/acle/general/dupq_[5-6].c tests fail: > int32x4_t f(int32_t x0, int32_t x1, int32_t x2, int32_t x3) > { > return (int32x4_t) { x0, x1, x2, x3 }; > } > > code-gen without patch: > f: > fmovs0, w0 > ins v0.s[1], w1 > ins v0.s[2], w2 > ins v0.s[3], w3 > ret > > code-gen with patch: > f: > fmovs0, w0 > fmovs1, w1 > ins v0.s[1], w2 > ins v1.s[1], w3 > zip1v0.4s, v0.4s, v1.4s > ret > > It shows fallback_seq_cost = 28, seq_total_cost = 16 The zip verson still wins after the fix above, but by a lesser amount. It seems like a borderline case. > > 3] aarch64/ldp_stp_16.c's cons2_8_float test fails. 
> Test case: > void cons2_8_float(float *x, float val0, float val1) > { > #pragma GCC unroll(8) > for (int i = 0; i < 8 * 2; i += 2) { > x[i + 0] = val0; > x[i + 1] = val1; > } > } > > which is lowered to: > void cons2_8_float (float * x, float val0, float val1) > { > vector(4) float _86; > >[local count: 119292720]: > _86 = {val0_11(D), val1_13(D), val0_11(D), val1_13(D)}; > MEM [(float *)x_10(D)] = _86; > MEM [(float *)x_10(D) + 16B] = _86; > MEM [(float *)x_10(D) + 32B] = _86; > MEM [(float *)x_10(D) + 48B] = _86; > return; > } > > code-gen without patch: > cons2_8_float: > dup v0.4s, v0.s[0] > ins v0.s[1], v1.s[0] > ins v0.s[3], v1.s[0] > stp q0, q0, [x0] > stp q0, q0, [x0, 32] > ret > > code-gen with patch: > cons2_8_float: > dup v1.2s, v1.s[0] > dup v0.2s, v0.s[0] > zip1v0.4s, v0.4s, v1.4s > stp q0, q0, [x0] > stp q0, q0, [x0, 32] > ret > > It shows fallback_seq_cost = 28, seq_total_cost = 16 > > I think the test fails because it doesn't match: > ** dup v([0-9]+)\.4s, .* > > Shall it be OK to amend the test assuming code-gen with patch is better ? Yeah, the new code seems like an improvement. > 4] aarch64/pr109072_1.c s32x4_3 test fails: > For the following test: > int32x4_t s32x4_3 (int32_t x, int32_t y) > { > int32_t arr[] = { x, y, y, y }; > return vld1q_s32 (arr); > } > > code-gen without patch: > s32x4_3: > dup v0.4s, w1 > ins v0.s[0], w0 > ret > > code-gen with patch: > s32x4_3: > fmovs1, w1 > fmovs0, w0 > ins v0.s[1], v1.s[0] > dup v1.2s, v1.s[0] > zip1v0.4s, v0.4s, v1.4s > ret > > It shows fallback_
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Wed, 12 Apr 2023 at 14:29, Richard Sandiford wrote: > > Prathamesh Kulkarni writes: > > On Thu, 6 Apr 2023 at 16:05, Richard Sandiford > > wrote: > >> > >> Prathamesh Kulkarni writes: > >> > On Tue, 4 Apr 2023 at 23:35, Richard Sandiford > >> > wrote: > >> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> >> > index cd9cace3c9b..3de79060619 100644 > >> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> >> > @@ -817,6 +817,62 @@ public: > >> >> > > >> >> > class svdupq_impl : public quiet > >> >> > { > >> >> > +private: > >> >> > + gimple * > >> >> > + fold_nonconst_dupq (gimple_folder &f, unsigned factor) const > >> >> > + { > >> >> > +/* Lower lhs = svdupq (arg0, arg1, ..., argN} into: > >> >> > + tmp = {arg0, arg1, ..., arg} > >> >> > + lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...}) */ > >> >> > + > >> >> > +/* TODO: Revisit to handle factor by padding zeros. */ > >> >> > +if (factor > 1) > >> >> > + return NULL; > >> >> > >> >> Isn't the key thing here predicate vs. vector rather than factor == 1 > >> >> vs. > >> >> factor != 1? Do we generate good code for b8, where factor should be 1? > >> > Hi, > >> > It generates the following code for svdup_n_b8: > >> > https://pastebin.com/ypYt590c > >> > >> Hmm, yeah, not pretty :-) But it's not pretty without either. > >> > >> > I suppose lowering to ctor+vec_perm_expr is not really useful > >> > for this case because it won't simplify ctor, unlike the above case of > >> > svdupq_s32 (x[0], x[1], x[2], x[3]); > >> > However I wonder if it's still a good idea to lower svdupq for > >> > predicates, for > >> > representing svdupq (or other intrinsics) using GIMPLE constructs as > >> > far as possible ? > >> > >> It's possible, but I think we'd need an example in which its a clear > >> benefit. > > Sorry I posted for wrong test case above. > > For the following test: > > svbool_t f(uint8x16_t x) > > { > > return svdupq_n_b8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], > > x[8], x[9], x[10], x[11], x[12], > > x[13], x[14], x[15]); > > } > > > > Code-gen: > > https://pastebin.com/maexgeJn > > > > I suppose it's equivalent to following ? > > > > svbool_t f2(uint8x16_t x) > > { > > svuint8_t tmp = svdupq_n_u8 ((bool) x[0], (bool) x[1], (bool) x[2], > > (bool) x[3], > >(bool) x[4], (bool) x[5], (bool) x[6], > > (bool) x[7], > >(bool) x[8], (bool) x[9], (bool) x[10], > > (bool) x[11], > >(bool) x[12], (bool) x[13], (bool) > > x[14], (bool) x[15]); > > return svcmpne_n_u8 (svptrue_b8 (), tmp, 0); > > } > > Yeah, this is essentially the transformation that the svdupq rtl > expander uses. It would probably be a good idea to do that in > gimple too. Hi, I tested the interleave+zip1 for vector init patch and it segfaulted during bootstrap while trying to build libgfortran/generated/matmul_i2.c. Rebuilding with --enable-checking=rtl showed out of bounds access in aarch64_unzip_vector_init in following hunk: + rtvec vec = rtvec_alloc (n / 2); + for (int i = 0; i < n; i++) +RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i) + : XVECEXP (vals, 0, 2 * i + 1); which is incorrect since it allocates n/2 but iterates and stores upto n. The attached patch fixes the issue, which passed bootstrap, however resulted in following fallout during testsuite run: 1] sve/acle/general/dupq_[1-4].c tests fail. 
For the following test: int32x4_t f(int32_t x) { return (int32x4_t) { x, 1, 2, 3 }; } Code-gen without patch: f: adrpx1, .LC0 ldr q0, [x1, #:lo12:.LC0] ins v0.s[0], w0 ret Code-gen with patch: f: moviv0.2s, 0x2 adrpx1, .LC0 ldr d1, [x1, #:lo12:.LC0] ins v0.s[0], w0 zip1v0.4s, v0.4s, v1.4s ret It shows, fallback_seq_cost = 20, seq_total_cost = 16 where seq_total_cost determines the cost for interleave+zip1 sequence and fallback_seq_cost is the cost for fallback sequence. Altho it shows lesser cost, I am not sure if the interleave+zip1 sequence is better in this case ? 2] sve/acle/general/dupq_[5-6].c tests fail: int32x4_t f(int32_t x0, int32_t x1, int32_t x2, int32_t x3) { return (int32x4_t) { x0, x1, x2, x3 }; } code-gen without patch: f: fmovs0, w0 ins v0.s[1], w1 ins v0.s[2], w2 ins v0.s[3], w3 ret code-gen with patch: f: fmovs0, w0 fmovs1, w1 ins v0.s[1], w2 ins v1.s[1], w3 zip1v0.4s, v0.4s, v1.4s ret It shows fallback_seq_cost = 28, seq_total_cost = 16 3] aarch64/ldp_stp_16.c's cons2_8_float test fails. Test case: void cons2_8_float(float *x, float va
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
Prathamesh Kulkarni writes: > On Thu, 6 Apr 2023 at 16:05, Richard Sandiford > wrote: >> >> Prathamesh Kulkarni writes: >> > On Tue, 4 Apr 2023 at 23:35, Richard Sandiford >> > wrote: >> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> >> > index cd9cace3c9b..3de79060619 100644 >> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> >> > @@ -817,6 +817,62 @@ public: >> >> > >> >> > class svdupq_impl : public quiet >> >> > { >> >> > +private: >> >> > + gimple * >> >> > + fold_nonconst_dupq (gimple_folder &f, unsigned factor) const >> >> > + { >> >> > +/* Lower lhs = svdupq (arg0, arg1, ..., argN} into: >> >> > + tmp = {arg0, arg1, ..., arg} >> >> > + lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...}) */ >> >> > + >> >> > +/* TODO: Revisit to handle factor by padding zeros. */ >> >> > +if (factor > 1) >> >> > + return NULL; >> >> >> >> Isn't the key thing here predicate vs. vector rather than factor == 1 vs. >> >> factor != 1? Do we generate good code for b8, where factor should be 1? >> > Hi, >> > It generates the following code for svdup_n_b8: >> > https://pastebin.com/ypYt590c >> >> Hmm, yeah, not pretty :-) But it's not pretty without either. >> >> > I suppose lowering to ctor+vec_perm_expr is not really useful >> > for this case because it won't simplify ctor, unlike the above case of >> > svdupq_s32 (x[0], x[1], x[2], x[3]); >> > However I wonder if it's still a good idea to lower svdupq for predicates, >> > for >> > representing svdupq (or other intrinsics) using GIMPLE constructs as >> > far as possible ? >> >> It's possible, but I think we'd need an example in which its a clear >> benefit. > Sorry I posted for wrong test case above. > For the following test: > svbool_t f(uint8x16_t x) > { > return svdupq_n_b8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], > x[8], x[9], x[10], x[11], x[12], > x[13], x[14], x[15]); > } > > Code-gen: > https://pastebin.com/maexgeJn > > I suppose it's equivalent to following ? > > svbool_t f2(uint8x16_t x) > { > svuint8_t tmp = svdupq_n_u8 ((bool) x[0], (bool) x[1], (bool) x[2], > (bool) x[3], >(bool) x[4], (bool) x[5], (bool) x[6], > (bool) x[7], >(bool) x[8], (bool) x[9], (bool) x[10], > (bool) x[11], >(bool) x[12], (bool) x[13], (bool) > x[14], (bool) x[15]); > return svcmpne_n_u8 (svptrue_b8 (), tmp, 0); > } Yeah, this is essentially the transformation that the svdupq rtl expander uses. It would probably be a good idea to do that in gimple too. Thanks, Richard > > which generates: > f2: > .LFB3901: > .cfi_startproc > moviv1.16b, 0x1 > ptrue p0.b, all > cmeqv0.16b, v0.16b, #0 > bic v0.16b, v1.16b, v0.16b > dup z0.q, z0.q[0] > cmpne p0.b, p0/z, z0.b, #0 > ret > > Thanks, > Prathamesh
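A minimal sketch of what that gimple-level lowering could look like for the b8 case (hypothetical, written with the C-level intrinsic names from f2 above rather than the internal function decls a folder would really emit):

  svuint8_t tmp_1;
  svbool_t lhs_2;

  tmp_1 = svdupq_n_u8 ((bool) x[0], (bool) x[1], ..., (bool) x[15]);
  lhs_2 = svcmpne_n_u8 (svptrue_b8 (), tmp_1, 0);

i.e. materialize the sixteen arguments as 0/1 bytes, let the u8 dupq replicate the 128-bit chunk, and compare against zero to recover the predicate.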
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Thu, 6 Apr 2023 at 16:05, Richard Sandiford wrote: > > Prathamesh Kulkarni writes: > > On Tue, 4 Apr 2023 at 23:35, Richard Sandiford > > wrote: > >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> > index cd9cace3c9b..3de79060619 100644 > >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> > @@ -817,6 +817,62 @@ public: > >> > > >> > class svdupq_impl : public quiet > >> > { > >> > +private: > >> > + gimple * > >> > + fold_nonconst_dupq (gimple_folder &f, unsigned factor) const > >> > + { > >> > +/* Lower lhs = svdupq (arg0, arg1, ..., argN} into: > >> > + tmp = {arg0, arg1, ..., arg} > >> > + lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...}) */ > >> > + > >> > +/* TODO: Revisit to handle factor by padding zeros. */ > >> > +if (factor > 1) > >> > + return NULL; > >> > >> Isn't the key thing here predicate vs. vector rather than factor == 1 vs. > >> factor != 1? Do we generate good code for b8, where factor should be 1? > > Hi, > > It generates the following code for svdup_n_b8: > > https://pastebin.com/ypYt590c > > Hmm, yeah, not pretty :-) But it's not pretty without either. > > > I suppose lowering to ctor+vec_perm_expr is not really useful > > for this case because it won't simplify ctor, unlike the above case of > > svdupq_s32 (x[0], x[1], x[2], x[3]); > > However I wonder if it's still a good idea to lower svdupq for predicates, > > for > > representing svdupq (or other intrinsics) using GIMPLE constructs as > > far as possible ? > > It's possible, but I think we'd need an example in which its a clear > benefit. Sorry I posted for wrong test case above. For the following test: svbool_t f(uint8x16_t x) { return svdupq_n_b8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]); } Code-gen: https://pastebin.com/maexgeJn I suppose it's equivalent to following ? svbool_t f2(uint8x16_t x) { svuint8_t tmp = svdupq_n_u8 ((bool) x[0], (bool) x[1], (bool) x[2], (bool) x[3], (bool) x[4], (bool) x[5], (bool) x[6], (bool) x[7], (bool) x[8], (bool) x[9], (bool) x[10], (bool) x[11], (bool) x[12], (bool) x[13], (bool) x[14], (bool) x[15]); return svcmpne_n_u8 (svptrue_b8 (), tmp, 0); } which generates: f2: .LFB3901: .cfi_startproc moviv1.16b, 0x1 ptrue p0.b, all cmeqv0.16b, v0.16b, #0 bic v0.16b, v1.16b, v0.16b dup z0.q, z0.q[0] cmpne p0.b, p0/z, z0.b, #0 ret Thanks, Prathamesh > > > In the attached patch, it simply punts if the type > > suffix is b, > > and doesn't try to fold the call. > > Yeah, think that's best for now. > > >> > + > >> > +if (BYTES_BIG_ENDIAN) > >> > + return NULL; > >> > + > >> > +tree lhs = gimple_call_lhs (f.call); > >> > +if (TREE_CODE (lhs) != SSA_NAME) > >> > + return NULL; > >> > >> Why is this check needed? > > This was a left-over from something else I was doing wrongly. Sorry I > > forgot to remove it. > >> > >> > +tree lhs_type = TREE_TYPE (lhs); > >> > +tree elt_type = TREE_TYPE (lhs_type); > >> > +scalar_mode elt_mode = GET_MODE_INNER (TYPE_MODE (elt_type)); > >> > >> Aren't we already dealing with a scalar type here? I'd have expected > >> SCALAR_TYPE_MODE rather than GET_MODE_INNER (TYPE_MODE ...). 
> > Ugh, sorry, I had most of the code copied over from svld1rq_impl for > > building VEC_PERM_EXPR with VLA mask and adjusted it, > > but overlooked this :/ > >> > >> > +machine_mode vq_mode = aarch64_vq_mode (elt_mode).require (); > >> > +tree vq_type = build_vector_type_for_mode (elt_type, vq_mode); > >> > + > >> > +unsigned nargs = gimple_call_num_args (f.call); > >> > +vec *v; > >> > +vec_alloc (v, nargs); > >> > +for (unsigned i = 0; i < nargs; i++) > >> > + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_call_arg (f.call, > >> > i)); > >> > +tree vec = build_constructor (vq_type, v); > >> > + > >> > +tree access_type > >> > + = build_aligned_type (vq_type, TYPE_ALIGN (elt_type)); > >> > >> Nit: seems to fit on one line. But do we need this? We're not accessing > >> memory, so I'd have expected vq_type to be OK as-is. > >> > >> > +tree tmp = make_ssa_name_fn (cfun, access_type, 0); > >> > +gimple *g = gimple_build_assign (tmp, vec); > >> > + > >> > +gimple_seq stmts = NULL; > >> > +gimple_seq_add_stmt_without_update (&stmts, g); > >> > + > >> > +int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant > >> > (); > >> > >> Looks like we should be able to use nargs instead of source_nelts. > > Does the attached patch look OK ? > > > > Thanks, > > Prathamesh > >> > > > >> Thanks, > >> Richard > >> > >> > +poly_ui
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
Prathamesh Kulkarni writes: > On Tue, 4 Apr 2023 at 23:35, Richard Sandiford > wrote: >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> > index cd9cace3c9b..3de79060619 100644 >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> > @@ -817,6 +817,62 @@ public: >> > >> > class svdupq_impl : public quiet >> > { >> > +private: >> > + gimple * >> > + fold_nonconst_dupq (gimple_folder &f, unsigned factor) const >> > + { >> > +/* Lower lhs = svdupq (arg0, arg1, ..., argN} into: >> > + tmp = {arg0, arg1, ..., arg} >> > + lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...}) */ >> > + >> > +/* TODO: Revisit to handle factor by padding zeros. */ >> > +if (factor > 1) >> > + return NULL; >> >> Isn't the key thing here predicate vs. vector rather than factor == 1 vs. >> factor != 1? Do we generate good code for b8, where factor should be 1? > Hi, > It generates the following code for svdup_n_b8: > https://pastebin.com/ypYt590c Hmm, yeah, not pretty :-) But it's not pretty without either. > I suppose lowering to ctor+vec_perm_expr is not really useful > for this case because it won't simplify ctor, unlike the above case of > svdupq_s32 (x[0], x[1], x[2], x[3]); > However I wonder if it's still a good idea to lower svdupq for predicates, for > representing svdupq (or other intrinsics) using GIMPLE constructs as > far as possible ? It's possible, but I think we'd need an example in which its a clear benefit. > In the attached patch, it simply punts if the type > suffix is b, > and doesn't try to fold the call. Yeah, think that's best for now. >> > + >> > +if (BYTES_BIG_ENDIAN) >> > + return NULL; >> > + >> > +tree lhs = gimple_call_lhs (f.call); >> > +if (TREE_CODE (lhs) != SSA_NAME) >> > + return NULL; >> >> Why is this check needed? > This was a left-over from something else I was doing wrongly. Sorry I > forgot to remove it. >> >> > +tree lhs_type = TREE_TYPE (lhs); >> > +tree elt_type = TREE_TYPE (lhs_type); >> > +scalar_mode elt_mode = GET_MODE_INNER (TYPE_MODE (elt_type)); >> >> Aren't we already dealing with a scalar type here? I'd have expected >> SCALAR_TYPE_MODE rather than GET_MODE_INNER (TYPE_MODE ...). > Ugh, sorry, I had most of the code copied over from svld1rq_impl for > building VEC_PERM_EXPR with VLA mask and adjusted it, > but overlooked this :/ >> >> > +machine_mode vq_mode = aarch64_vq_mode (elt_mode).require (); >> > +tree vq_type = build_vector_type_for_mode (elt_type, vq_mode); >> > + >> > +unsigned nargs = gimple_call_num_args (f.call); >> > +vec *v; >> > +vec_alloc (v, nargs); >> > +for (unsigned i = 0; i < nargs; i++) >> > + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_call_arg (f.call, i)); >> > +tree vec = build_constructor (vq_type, v); >> > + >> > +tree access_type >> > + = build_aligned_type (vq_type, TYPE_ALIGN (elt_type)); >> >> Nit: seems to fit on one line. But do we need this? We're not accessing >> memory, so I'd have expected vq_type to be OK as-is. >> >> > +tree tmp = make_ssa_name_fn (cfun, access_type, 0); >> > +gimple *g = gimple_build_assign (tmp, vec); >> > + >> > +gimple_seq stmts = NULL; >> > +gimple_seq_add_stmt_without_update (&stmts, g); >> > + >> > +int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant (); >> >> Looks like we should be able to use nargs instead of source_nelts. > Does the attached patch look OK ? 
> > Thanks, > Prathamesh >> > >> Thanks, >> Richard >> >> > +poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type); >> > +vec_perm_builder sel (lhs_len, source_nelts, 1); >> > +for (int i = 0; i < source_nelts; i++) >> > + sel.quick_push (i); >> > + >> > +vec_perm_indices indices (sel, 1, source_nelts); >> > +tree mask_type = build_vector_type (ssizetype, lhs_len); >> > +tree mask = vec_perm_indices_to_tree (mask_type, indices); >> > + >> > +gimple *g2 = gimple_build_assign (lhs, VEC_PERM_EXPR, tmp, tmp, mask); >> > +gimple_seq_add_stmt_without_update (&stmts, g2); >> > +gsi_replace_with_seq (f.gsi, stmts, false); >> > +return g2; >> > + } >> > + >> > public: >> >gimple * >> >fold (gimple_folder &f) const override >> > @@ -832,7 +888,7 @@ public: >> >{ >> > tree elt = gimple_call_arg (f.call, i); >> > if (!CONSTANT_CLASS_P (elt)) >> > - return NULL; >> > + return fold_nonconst_dupq (f, factor); >> > builder.quick_push (elt); >> > for (unsigned int j = 1; j < factor; ++j) >> > builder.quick_push (build_zero_cst (TREE_TYPE (vec_type))); >> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c >> > b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c >> > new file mode 100644 >> > index 000..f19f8deb1e5 >> > --- /dev/null >> > ++
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Tue, 4 Apr 2023 at 23:35, Richard Sandiford wrote: > > Prathamesh Kulkarni writes: > > On Mon, 13 Mar 2023 at 13:03, Richard Biener wrote: > >> On GIMPLE it would be > >> > >> _1 = { a, ... }; // (a) > >> _2 = { _1, ... }; // (b) > >> > >> but I'm not sure if (b), a VL CTOR of fixed len(?) sub-vectors is > >> possible? But at least a CTOR of vectors is what we use to > >> concat vectors. > >> > >> With the recent relaxing of VEC_PERM inputs it's also possible to > >> express (b) with a VEC_PERM: > >> > >> _2 = VEC_PERM <_1, _1, { 0, 1, 2, 3, 0, 1, 2, 3, ... }> > >> > >> but again I'm not sure if that repeating 0, 1, 2, 3 is expressible > >> for VL vectors (maybe we'd allow "wrapping" here, I'm not sure). > >> > > Hi, > > Thanks for the suggestions and sorry for late response in turn. > > The attached patch tries to fix the issue by explicitly constructing a CTOR > > from svdupq's arguments and then using VEC_PERM_EXPR with VL mask > > having encoded elements {0, 1, ... nargs-1}, > > npatterns == nargs, and nelts_per_pattern == 1, to replicate the base > > vector. > > > > So for example, for the above case, > > svint32_t f_32(int32x4_t x) > > { > > return svdupq_s32 (x[0], x[1], x[2], x[3]); > > } > > > > forwprop1 lowers it to: > > svint32_t _6; > > vector(4) int _8; > > : > > _1 = BIT_FIELD_REF ; > > _2 = BIT_FIELD_REF ; > > _3 = BIT_FIELD_REF ; > > _4 = BIT_FIELD_REF ; > > _8 = {_1, _2, _3, _4}; > > _6 = VEC_PERM_EXPR <_8, _8, { 0, 1, 2, 3, ... }>; > > return _6; > > > > which is then eventually optimized to: > > svint32_t _6; > >[local count: 1073741824]: > > _6 = VEC_PERM_EXPR ; > > return _6; > > > > code-gen: > > f_32: > > dup z0.q, z0.q[0] > > ret > > Nice! > > > Does it look OK ? > > > > Thanks, > > Prathamesh > >> Richard. > >> > >> > We're planning to implement the ACLE's Neon-SVE bridge: > >> > https://github.com/ARM-software/acle/blob/main/main/acle.md#neon-sve-bridge > >> > and so we'll need (b) to implement the svdup_neonq functions. > >> > > >> > Thanks, > >> > Richard > >> > > >> > >> -- > >> Richard Biener > >> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, > >> Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman; > >> HRB 36809 (AG Nuernberg) > > > > [SVE] Fold svld1rq to VEC_PERM_EXPR if elements are not constant. > > > > gcc/ChangeLog: > > * config/aarch64/aarch64-sve-builtins-base.cc > > (svdupq_impl::fold_nonconst_dupq): New method. > > (svdupq_impl::fold): Call fold_nonconst_dupq. > > > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/sve/acle/general/dupq_11.c: New test. > > > > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > index cd9cace3c9b..3de79060619 100644 > > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > @@ -817,6 +817,62 @@ public: > > > > class svdupq_impl : public quiet > > { > > +private: > > + gimple * > > + fold_nonconst_dupq (gimple_folder &f, unsigned factor) const > > + { > > +/* Lower lhs = svdupq (arg0, arg1, ..., argN} into: > > + tmp = {arg0, arg1, ..., arg} > > + lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...}) */ > > + > > +/* TODO: Revisit to handle factor by padding zeros. */ > > +if (factor > 1) > > + return NULL; > > Isn't the key thing here predicate vs. vector rather than factor == 1 vs. > factor != 1? Do we generate good code for b8, where factor should be 1? 
Hi, It generates the following code for svdup_n_b8: https://pastebin.com/ypYt590c I suppose lowering to ctor+vec_perm_expr is not really useful for this case because it won't simplify ctor, unlike the above case of svdupq_s32 (x[0], x[1], x[2], x[3]); However I wonder if it's still a good idea to lower svdupq for predicates, for representing svdupq (or other intrinsics) using GIMPLE constructs as far as possible ? In the attached patch, it simply punts if the type suffix is b, and doesn't try to fold the call. > > > + > > +if (BYTES_BIG_ENDIAN) > > + return NULL; > > + > > +tree lhs = gimple_call_lhs (f.call); > > +if (TREE_CODE (lhs) != SSA_NAME) > > + return NULL; > > Why is this check needed? This was a left-over from something else I was doing wrongly. Sorry I forgot to remove it. > > > +tree lhs_type = TREE_TYPE (lhs); > > +tree elt_type = TREE_TYPE (lhs_type); > > +scalar_mode elt_mode = GET_MODE_INNER (TYPE_MODE (elt_type)); > > Aren't we already dealing with a scalar type here? I'd have expected > SCALAR_TYPE_MODE rather than GET_MODE_INNER (TYPE_MODE ...). Ugh, sorry, I had most of the code copied over from svld1rq_impl for building VEC_PERM_EXPR with VLA mask and adjusted it, but overlooked this :/ > > > +machine_mode vq_mode = aarch64_vq_mode (elt_mode).require (); > > +tree vq_type = build_vector_type_for_mode (elt_type, vq_mode)
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
Prathamesh Kulkarni writes: > On Mon, 13 Mar 2023 at 13:03, Richard Biener wrote: >> On GIMPLE it would be >> >> _1 = { a, ... }; // (a) >> _2 = { _1, ... }; // (b) >> >> but I'm not sure if (b), a VL CTOR of fixed len(?) sub-vectors is >> possible? But at least a CTOR of vectors is what we use to >> concat vectors. >> >> With the recent relaxing of VEC_PERM inputs it's also possible to >> express (b) with a VEC_PERM: >> >> _2 = VEC_PERM <_1, _1, { 0, 1, 2, 3, 0, 1, 2, 3, ... }> >> >> but again I'm not sure if that repeating 0, 1, 2, 3 is expressible >> for VL vectors (maybe we'd allow "wrapping" here, I'm not sure). >> > Hi, > Thanks for the suggestions and sorry for late response in turn. > The attached patch tries to fix the issue by explicitly constructing a CTOR > from svdupq's arguments and then using VEC_PERM_EXPR with VL mask > having encoded elements {0, 1, ... nargs-1}, > npatterns == nargs, and nelts_per_pattern == 1, to replicate the base vector. > > So for example, for the above case, > svint32_t f_32(int32x4_t x) > { > return svdupq_s32 (x[0], x[1], x[2], x[3]); > } > > forwprop1 lowers it to: > svint32_t _6; > vector(4) int _8; > : > _1 = BIT_FIELD_REF ; > _2 = BIT_FIELD_REF ; > _3 = BIT_FIELD_REF ; > _4 = BIT_FIELD_REF ; > _8 = {_1, _2, _3, _4}; > _6 = VEC_PERM_EXPR <_8, _8, { 0, 1, 2, 3, ... }>; > return _6; > > which is then eventually optimized to: > svint32_t _6; >[local count: 1073741824]: > _6 = VEC_PERM_EXPR ; > return _6; > > code-gen: > f_32: > dup z0.q, z0.q[0] > ret Nice! > Does it look OK ? > > Thanks, > Prathamesh >> Richard. >> >> > We're planning to implement the ACLE's Neon-SVE bridge: >> > https://github.com/ARM-software/acle/blob/main/main/acle.md#neon-sve-bridge >> > and so we'll need (b) to implement the svdup_neonq functions. >> > >> > Thanks, >> > Richard >> > >> >> -- >> Richard Biener >> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, >> Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman; >> HRB 36809 (AG Nuernberg) > > [SVE] Fold svld1rq to VEC_PERM_EXPR if elements are not constant. > > gcc/ChangeLog: > * config/aarch64/aarch64-sve-builtins-base.cc > (svdupq_impl::fold_nonconst_dupq): New method. > (svdupq_impl::fold): Call fold_nonconst_dupq. > > gcc/testsuite/ChangeLog: > * gcc.target/aarch64/sve/acle/general/dupq_11.c: New test. > > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > index cd9cace3c9b..3de79060619 100644 > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > @@ -817,6 +817,62 @@ public: > > class svdupq_impl : public quiet > { > +private: > + gimple * > + fold_nonconst_dupq (gimple_folder &f, unsigned factor) const > + { > +/* Lower lhs = svdupq (arg0, arg1, ..., argN} into: > + tmp = {arg0, arg1, ..., arg} > + lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...}) */ > + > +/* TODO: Revisit to handle factor by padding zeros. */ > +if (factor > 1) > + return NULL; Isn't the key thing here predicate vs. vector rather than factor == 1 vs. factor != 1? Do we generate good code for b8, where factor should be 1? > + > +if (BYTES_BIG_ENDIAN) > + return NULL; > + > +tree lhs = gimple_call_lhs (f.call); > +if (TREE_CODE (lhs) != SSA_NAME) > + return NULL; Why is this check needed? > +tree lhs_type = TREE_TYPE (lhs); > +tree elt_type = TREE_TYPE (lhs_type); > +scalar_mode elt_mode = GET_MODE_INNER (TYPE_MODE (elt_type)); Aren't we already dealing with a scalar type here? 
I'd have expected SCALAR_TYPE_MODE rather than GET_MODE_INNER (TYPE_MODE ...). > +machine_mode vq_mode = aarch64_vq_mode (elt_mode).require (); > +tree vq_type = build_vector_type_for_mode (elt_type, vq_mode); > + > +unsigned nargs = gimple_call_num_args (f.call); > +vec *v; > +vec_alloc (v, nargs); > +for (unsigned i = 0; i < nargs; i++) > + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_call_arg (f.call, i)); > +tree vec = build_constructor (vq_type, v); > + > +tree access_type > + = build_aligned_type (vq_type, TYPE_ALIGN (elt_type)); Nit: seems to fit on one line. But do we need this? We're not accessing memory, so I'd have expected vq_type to be OK as-is. > +tree tmp = make_ssa_name_fn (cfun, access_type, 0); > +gimple *g = gimple_build_assign (tmp, vec); > + > +gimple_seq stmts = NULL; > +gimple_seq_add_stmt_without_update (&stmts, g); > + > +int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant (); Looks like we should be able to use nargs instead of source_nelts. Thanks, Richard > +poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type); > +vec_perm_builder sel (lhs_len, source_nelts, 1); > +for (int i = 0; i < source_nelts; i++) > + sel.quick_push (i); > + > +
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Mon, 13 Mar 2023 at 13:03, Richard Biener wrote: > > On Fri, 10 Mar 2023, Richard Sandiford wrote: > > > Sorry for the slow reply. > > > > Prathamesh Kulkarni writes: > > > Unfortunately it regresses code-gen for the following case: > > > > > > svint32_t f(int32x4_t x) > > > { > > > return svdupq_s32 (x[0], x[1], x[2], x[3]); > > > } > > > > > > -O2 code-gen with trunk: > > > f: > > > dup z0.q, z0.q[0] > > > ret > > > > > > -O2 code-gen with patch: > > > f: > > > dup s1, v0.s[1] > > > movv2.8b, v0.8b > > > ins v1.s[1], v0.s[3] > > > ins v2.s[1], v0.s[2] > > > zip1v0.4s, v2.4s, v1.4s > > > dup z0.q, z0.q[0] > > > ret > > > > > > IIUC, svdupq_impl::expand uses aarch64_expand_vector_init > > > to initialize the "base 128-bit vector" and then use dupq to replicate it. > > > > > > Without patch, aarch64_expand_vector_init generates fallback code, and > > > then > > > combine optimizes a sequence of vec_merge/vec_select pairs into an > > > assignment: > > > > > > (insn 7 3 8 2 (set (reg:SI 99) > > > (vec_select:SI (reg/v:V4SI 97 [ x ]) > > > (parallel [ > > > (const_int 1 [0x1]) > > > ]))) "bar.c":6:10 2592 {aarch64_get_lanev4si} > > > (nil)) > > > > > > (insn 13 9 15 2 (set (reg:V4SI 102) > > > (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI 99)) > > > (reg/v:V4SI 97 [ x ]) > > > (const_int 2 [0x2]))) "bar.c":6:10 1794 > > > {aarch64_simd_vec_setv4si} > > > (expr_list:REG_DEAD (reg:SI 99) > > > (expr_list:REG_DEAD (reg/v:V4SI 97 [ x ]) > > > (nil > > > > > > into: > > > Trying 7 -> 13: > > > 7: r99:SI=vec_select(r97:V4SI,parallel) > > >13: r102:V4SI=vec_merge(vec_duplicate(r99:SI),r97:V4SI,0x2) > > > REG_DEAD r99:SI > > > REG_DEAD r97:V4SI > > > Successfully matched this instruction: > > > (set (reg:V4SI 102) > > > (reg/v:V4SI 97 [ x ])) > > > > > > which eventually results into: > > > (note 2 25 3 2 NOTE_INSN_DELETED) > > > (note 3 2 7 2 NOTE_INSN_FUNCTION_BEG) > > > (note 7 3 8 2 NOTE_INSN_DELETED) > > > (note 8 7 9 2 NOTE_INSN_DELETED) > > > (note 9 8 13 2 NOTE_INSN_DELETED) > > > (note 13 9 15 2 NOTE_INSN_DELETED) > > > (note 15 13 17 2 NOTE_INSN_DELETED) > > > (note 17 15 18 2 NOTE_INSN_DELETED) > > > (note 18 17 22 2 NOTE_INSN_DELETED) > > > (insn 22 18 23 2 (parallel [ > > > (set (reg/i:VNx4SI 32 v0) > > > (vec_duplicate:VNx4SI (reg:V4SI 108))) > > > (clobber (scratch:VNx16BI)) > > > ]) "bar.c":7:1 5202 {aarch64_vec_duplicate_vqvnx4si_le} > > > (expr_list:REG_DEAD (reg:V4SI 108) > > > (nil))) > > > (insn 23 22 0 2 (use (reg/i:VNx4SI 32 v0)) "bar.c":7:1 -1 > > > (nil)) > > > > > > I was wondering if we should add the above special case, of assigning > > > target = vec in aarch64_expand_vector_init, if initializer is { > > > vec[0], vec[1], ... } ? > > > > I'm not sure it will be easy to detect that. Won't the inputs to > > aarch64_expand_vector_init just be plain registers? It's not a > > good idea in general to search for definitions of registers > > during expansion. > > > > It would be nice to fix this by lowering svdupq into: > > > > (a) a constructor for a 128-bit vector > > (b) a duplication of the 128-bit vector to fill an SVE vector > > > > But I'm not sure what the best way of doing (b) would be. > > In RTL we can use vec_duplicate, but I don't think gimple > > has an equivalent construct. Maybe Richi has some ideas. > > On GIMPLE it would be > > _1 = { a, ... }; // (a) > _2 = { _1, ... }; // (b) > > but I'm not sure if (b), a VL CTOR of fixed len(?) sub-vectors is > possible? But at least a CTOR of vectors is what we use to > concat vectors. 
> > With the recent relaxing of VEC_PERM inputs it's also possible to > express (b) with a VEC_PERM: > > _2 = VEC_PERM <_1, _1, { 0, 1, 2, 3, 0, 1, 2, 3, ... }> > > but again I'm not sure if that repeating 0, 1, 2, 3 is expressible > for VL vectors (maybe we'd allow "wrapping" here, I'm not sure). > Hi, Thanks for the suggestions and sorry for late response in turn. The attached patch tries to fix the issue by explicitly constructing a CTOR from svdupq's arguments and then using VEC_PERM_EXPR with VL mask having encoded elements {0, 1, ... nargs-1}, npatterns == nargs, and nelts_per_pattern == 1, to replicate the base vector. So for example, for the above case, svint32_t f_32(int32x4_t x) { return svdupq_s32 (x[0], x[1], x[2], x[3]); } forwprop1 lowers it to: svint32_t _6; vector(4) int _8; : _1 = BIT_FIELD_REF ; _2 = BIT_FIELD_REF ; _3 = BIT_FIELD_REF ; _4 = BIT_FIELD_REF ; _8 = {_1, _2, _3, _4}; _6 = VEC_PERM_EXPR <_8, _8, { 0, 1, 2, 3, ... }>; return _6; which is then eventually optimized to: svint32_t _6; [local count: 1073741824]: _6 = VEC_PERM_EXPR ; return _6; code-gen: f_32: dup z0.q,
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Fri, 10 Mar 2023, Richard Sandiford wrote: > Sorry for the slow reply. > > Prathamesh Kulkarni writes: > > Unfortunately it regresses code-gen for the following case: > > > > svint32_t f(int32x4_t x) > > { > > return svdupq_s32 (x[0], x[1], x[2], x[3]); > > } > > > > -O2 code-gen with trunk: > > f: > > dup z0.q, z0.q[0] > > ret > > > > -O2 code-gen with patch: > > f: > > dup s1, v0.s[1] > > movv2.8b, v0.8b > > ins v1.s[1], v0.s[3] > > ins v2.s[1], v0.s[2] > > zip1v0.4s, v2.4s, v1.4s > > dup z0.q, z0.q[0] > > ret > > > > IIUC, svdupq_impl::expand uses aarch64_expand_vector_init > > to initialize the "base 128-bit vector" and then use dupq to replicate it. > > > > Without patch, aarch64_expand_vector_init generates fallback code, and then > > combine optimizes a sequence of vec_merge/vec_select pairs into an > > assignment: > > > > (insn 7 3 8 2 (set (reg:SI 99) > > (vec_select:SI (reg/v:V4SI 97 [ x ]) > > (parallel [ > > (const_int 1 [0x1]) > > ]))) "bar.c":6:10 2592 {aarch64_get_lanev4si} > > (nil)) > > > > (insn 13 9 15 2 (set (reg:V4SI 102) > > (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI 99)) > > (reg/v:V4SI 97 [ x ]) > > (const_int 2 [0x2]))) "bar.c":6:10 1794 > > {aarch64_simd_vec_setv4si} > > (expr_list:REG_DEAD (reg:SI 99) > > (expr_list:REG_DEAD (reg/v:V4SI 97 [ x ]) > > (nil > > > > into: > > Trying 7 -> 13: > > 7: r99:SI=vec_select(r97:V4SI,parallel) > >13: r102:V4SI=vec_merge(vec_duplicate(r99:SI),r97:V4SI,0x2) > > REG_DEAD r99:SI > > REG_DEAD r97:V4SI > > Successfully matched this instruction: > > (set (reg:V4SI 102) > > (reg/v:V4SI 97 [ x ])) > > > > which eventually results into: > > (note 2 25 3 2 NOTE_INSN_DELETED) > > (note 3 2 7 2 NOTE_INSN_FUNCTION_BEG) > > (note 7 3 8 2 NOTE_INSN_DELETED) > > (note 8 7 9 2 NOTE_INSN_DELETED) > > (note 9 8 13 2 NOTE_INSN_DELETED) > > (note 13 9 15 2 NOTE_INSN_DELETED) > > (note 15 13 17 2 NOTE_INSN_DELETED) > > (note 17 15 18 2 NOTE_INSN_DELETED) > > (note 18 17 22 2 NOTE_INSN_DELETED) > > (insn 22 18 23 2 (parallel [ > > (set (reg/i:VNx4SI 32 v0) > > (vec_duplicate:VNx4SI (reg:V4SI 108))) > > (clobber (scratch:VNx16BI)) > > ]) "bar.c":7:1 5202 {aarch64_vec_duplicate_vqvnx4si_le} > > (expr_list:REG_DEAD (reg:V4SI 108) > > (nil))) > > (insn 23 22 0 2 (use (reg/i:VNx4SI 32 v0)) "bar.c":7:1 -1 > > (nil)) > > > > I was wondering if we should add the above special case, of assigning > > target = vec in aarch64_expand_vector_init, if initializer is { > > vec[0], vec[1], ... } ? > > I'm not sure it will be easy to detect that. Won't the inputs to > aarch64_expand_vector_init just be plain registers? It's not a > good idea in general to search for definitions of registers > during expansion. > > It would be nice to fix this by lowering svdupq into: > > (a) a constructor for a 128-bit vector > (b) a duplication of the 128-bit vector to fill an SVE vector > > But I'm not sure what the best way of doing (b) would be. > In RTL we can use vec_duplicate, but I don't think gimple > has an equivalent construct. Maybe Richi has some ideas. On GIMPLE it would be _1 = { a, ... }; // (a) _2 = { _1, ... }; // (b) but I'm not sure if (b), a VL CTOR of fixed len(?) sub-vectors is possible? But at least a CTOR of vectors is what we use to concat vectors. With the recent relaxing of VEC_PERM inputs it's also possible to express (b) with a VEC_PERM: _2 = VEC_PERM <_1, _1, { 0, 1, 2, 3, 0, 1, 2, 3, ... }> but again I'm not sure if that repeating 0, 1, 2, 3 is expressible for VL vectors (maybe we'd allow "wrapping" here, I'm not sure). Richard. 
> We're planning to implement the ACLE's Neon-SVE bridge: > https://github.com/ARM-software/acle/blob/main/main/acle.md#neon-sve-bridge > and so we'll need (b) to implement the svdup_neonq functions. > > Thanks, > Richard > -- Richard Biener SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman; HRB 36809 (AG Nuernberg)
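As a concrete illustration of the svdup_neonq functions mentioned above, a sketch written against the linked ACLE spec (the header name arm_neon_sve_bridge.h comes from that spec, not from this thread, and GCC support for it is exactly the planned work being referred to):

  #include <arm_neon.h>
  #include <arm_sve.h>
  #include <arm_neon_sve_bridge.h>

  /* Step (b) expressed as an intrinsic: replicate a 128-bit Neon
     vector across the whole of an SVE vector.  */
  svint32_t dup_q (int32x4_t x)
  {
    return svdup_neonq_s32 (x);
  }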
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
Sorry for the slow reply. Prathamesh Kulkarni writes: > Unfortunately it regresses code-gen for the following case: > > svint32_t f(int32x4_t x) > { > return svdupq_s32 (x[0], x[1], x[2], x[3]); > } > > -O2 code-gen with trunk: > f: > dup z0.q, z0.q[0] > ret > > -O2 code-gen with patch: > f: > dup s1, v0.s[1] > mov v2.8b, v0.8b > ins v1.s[1], v0.s[3] > ins v2.s[1], v0.s[2] > zip1 v0.4s, v2.4s, v1.4s > dup z0.q, z0.q[0] > ret > > IIUC, svdupq_impl::expand uses aarch64_expand_vector_init > to initialize the "base 128-bit vector" and then use dupq to replicate it. > > Without patch, aarch64_expand_vector_init generates fallback code, and then > combine optimizes a sequence of vec_merge/vec_select pairs into an assignment: > > (insn 7 3 8 2 (set (reg:SI 99) > (vec_select:SI (reg/v:V4SI 97 [ x ]) > (parallel [ > (const_int 1 [0x1]) > ]))) "bar.c":6:10 2592 {aarch64_get_lanev4si} > (nil)) > > (insn 13 9 15 2 (set (reg:V4SI 102) > (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI 99)) > (reg/v:V4SI 97 [ x ]) > (const_int 2 [0x2]))) "bar.c":6:10 1794 {aarch64_simd_vec_setv4si} > (expr_list:REG_DEAD (reg:SI 99) > (expr_list:REG_DEAD (reg/v:V4SI 97 [ x ]) > (nil)))) > > into: > Trying 7 -> 13: > 7: r99:SI=vec_select(r97:V4SI,parallel) > 13: r102:V4SI=vec_merge(vec_duplicate(r99:SI),r97:V4SI,0x2) > REG_DEAD r99:SI > REG_DEAD r97:V4SI > Successfully matched this instruction: > (set (reg:V4SI 102) > (reg/v:V4SI 97 [ x ])) > > which eventually results into: > (note 2 25 3 2 NOTE_INSN_DELETED) > (note 3 2 7 2 NOTE_INSN_FUNCTION_BEG) > (note 7 3 8 2 NOTE_INSN_DELETED) > (note 8 7 9 2 NOTE_INSN_DELETED) > (note 9 8 13 2 NOTE_INSN_DELETED) > (note 13 9 15 2 NOTE_INSN_DELETED) > (note 15 13 17 2 NOTE_INSN_DELETED) > (note 17 15 18 2 NOTE_INSN_DELETED) > (note 18 17 22 2 NOTE_INSN_DELETED) > (insn 22 18 23 2 (parallel [ > (set (reg/i:VNx4SI 32 v0) > (vec_duplicate:VNx4SI (reg:V4SI 108))) > (clobber (scratch:VNx16BI)) > ]) "bar.c":7:1 5202 {aarch64_vec_duplicate_vqvnx4si_le} > (expr_list:REG_DEAD (reg:V4SI 108) > (nil))) > (insn 23 22 0 2 (use (reg/i:VNx4SI 32 v0)) "bar.c":7:1 -1 > (nil)) > > I was wondering if we should add the above special case, of assigning > target = vec in aarch64_expand_vector_init, if initializer is { > vec[0], vec[1], ... } ? I'm not sure it will be easy to detect that. Won't the inputs to aarch64_expand_vector_init just be plain registers? It's not a good idea in general to search for definitions of registers during expansion. It would be nice to fix this by lowering svdupq into: (a) a constructor for a 128-bit vector (b) a duplication of the 128-bit vector to fill an SVE vector But I'm not sure what the best way of doing (b) would be. In RTL we can use vec_duplicate, but I don't think gimple has an equivalent construct. Maybe Richi has some ideas. We're planning to implement the ACLE's Neon-SVE bridge: https://github.com/ARM-software/acle/blob/main/main/acle.md#neon-sve-bridge and so we'll need (b) to implement the svdup_neonq functions. Thanks, Richard
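At the RTL level, step (b) could look roughly like the following (a hedged sketch only, reusing the V4SI/VNx4SI mode pair from the dump above; base and vals are assumed names, and a real expander would have to go through the proper named pattern, which as the dump shows also clobbers a predicate scratch):

  /* (a) build the 128-bit base vector ...  */
  rtx base = gen_reg_rtx (V4SImode);
  aarch64_expand_vector_init (base, vals);
  /* (b) ... then replicate it to fill the SVE register.  */
  emit_insn (gen_rtx_SET (target,
			  gen_rtx_VEC_DUPLICATE (VNx4SImode, base)));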
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Mon, 6 Feb 2023 at 17:43, Richard Sandiford wrote: > > Prathamesh Kulkarni writes: > > On Fri, 3 Feb 2023 at 20:47, Richard Sandiford > > wrote: > >> > >> Prathamesh Kulkarni writes: > >> > On Fri, 3 Feb 2023 at 07:10, Prathamesh Kulkarni > >> > wrote: > >> >> > >> >> On Thu, 2 Feb 2023 at 20:50, Richard Sandiford > >> >> wrote: > >> >> > > >> >> > Prathamesh Kulkarni writes: > >> >> > >> >> > I have attached a patch that extends the transform if one > >> >> > >> >> > half is dup > >> >> > >> >> > and other is set of constants. > >> >> > >> >> > For eg: > >> >> > >> >> > int8x16_t f(int8_t x) > >> >> > >> >> > { > >> >> > >> >> > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, > >> >> > >> >> > 7, x, 8 }; > >> >> > >> >> > } > >> >> > >> >> > > >> >> > >> >> > code-gen trunk: > >> >> > >> >> > f: > >> >> > >> >> > adrpx1, .LC0 > >> >> > >> >> > ldr q0, [x1, #:lo12:.LC0] > >> >> > >> >> > ins v0.b[0], w0 > >> >> > >> >> > ins v0.b[2], w0 > >> >> > >> >> > ins v0.b[4], w0 > >> >> > >> >> > ins v0.b[6], w0 > >> >> > >> >> > ins v0.b[8], w0 > >> >> > >> >> > ins v0.b[10], w0 > >> >> > >> >> > ins v0.b[12], w0 > >> >> > >> >> > ins v0.b[14], w0 > >> >> > >> >> > ret > >> >> > >> >> > > >> >> > >> >> > code-gen with patch: > >> >> > >> >> > f: > >> >> > >> >> > dup v0.16b, w0 > >> >> > >> >> > adrpx0, .LC0 > >> >> > >> >> > ldr q1, [x0, #:lo12:.LC0] > >> >> > >> >> > zip1v0.16b, v0.16b, v1.16b > >> >> > >> >> > ret > >> >> > >> >> > > >> >> > >> >> > Bootstrapped+tested on aarch64-linux-gnu. > >> >> > >> >> > Does it look OK ? > >> >> > >> >> > >> >> > >> >> Looks like a nice improvement. It'll need to wait for GCC 14 > >> >> > >> >> now though. > >> >> > >> >> > >> >> > >> >> However, rather than handle this case specially, I think we > >> >> > >> >> should instead > >> >> > >> >> take a divide-and-conquer approach: split the initialiser into > >> >> > >> >> even and > >> >> > >> >> odd elements, find the best way of loading each part, then > >> >> > >> >> compare the > >> >> > >> >> cost of these sequences + ZIP with the cost of the fallback > >> >> > >> >> code (the code > >> >> > >> >> later in aarch64_expand_vector_init). > >> >> > >> >> > >> >> > >> >> For example, doing that would allow: > >> >> > >> >> > >> >> > >> >> { x, y, 0, y, 0, y, 0, y, 0, y } > >> >> > >> >> > >> >> > >> >> to be loaded more easily, even though the even elements aren't > >> >> > >> >> wholly > >> >> > >> >> constant. > >> >> > >> > Hi Richard, > >> >> > >> > I have attached a prototype patch based on the above approach. 
> >> >> > >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case > >> >> > >> > by generating > >> >> > >> > same sequence, thus I removed that hunk, and improves the > >> >> > >> > following cases: > >> >> > >> > > >> >> > >> > (a) > >> >> > >> > int8x16_t f_s16(int8_t x) > >> >> > >> > { > >> >> > >> > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, > >> >> > >> > x, 5, x, 6, x, 7, x, 8 }; > >> >> > >> > } > >> >> > >> > > >> >> > >> > code-gen trunk: > >> >> > >> > f_s16: > >> >> > >> > adrpx1, .LC0 > >> >> > >> > ldr q0, [x1, #:lo12:.LC0] > >> >> > >> > ins v0.b[0], w0 > >> >> > >> > ins v0.b[2], w0 > >> >> > >> > ins v0.b[4], w0 > >> >> > >> > ins v0.b[6], w0 > >> >> > >> > ins v0.b[8], w0 > >> >> > >> > ins v0.b[10], w0 > >> >> > >> > ins v0.b[12], w0 > >> >> > >> > ins v0.b[14], w0 > >> >> > >> > ret > >> >> > >> > > >> >> > >> > code-gen with patch: > >> >> > >> > f_s16: > >> >> > >> > dup v0.16b, w0 > >> >> > >> > adrpx0, .LC0 > >> >> > >> > ldr q1, [x0, #:lo12:.LC0] > >> >> > >> > zip1v0.16b, v0.16b, v1.16b > >> >> > >> > ret > >> >> > >> > > >> >> > >> > (b) > >> >> > >> > int8x16_t f_s16(int8_t x, int8_t y) > >> >> > >> > { > >> >> > >> > return (int8x16_t) { x, y, 1, y, 2, y, 3, y, > >> >> > >> > 4, y, 5, y, 6, y, 7, y }; > >> >> > >> > } > >> >> > >> > > >> >> > >> > code-gen trunk: > >> >> > >> > f_s16: > >> >> > >> > adrpx2, .LC0 > >> >> > >> > ldr q0, [x2, #:lo12:.LC0] > >> >> > >> > ins v0.b[0], w0 > >> >> > >> > ins v0.b[1], w1 > >> >> > >> > ins v0.b[3], w1 > >> >> > >> > ins v0.b[5], w1 > >> >> > >> > ins v0.b[7], w1 > >> >> > >> > ins v0.b[9], w1 > >> >> > >> > ins v0.b[11], w1 > >> >> > >> > ins v0.b[13], w1 > >> >> > >> > ins v0.b[15], w1 > >> >> > >> > ret > >> >> > >> > > >> >> > >> > code-gen patch: > >> >> > >> > f_s16: > >> >> > >> > adrpx2, .LC0 > >> >> > >>
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
Prathamesh Kulkarni writes: > On Fri, 3 Feb 2023 at 20:47, Richard Sandiford > wrote: >> >> Prathamesh Kulkarni writes: >> > On Fri, 3 Feb 2023 at 07:10, Prathamesh Kulkarni >> > wrote: >> >> >> >> On Thu, 2 Feb 2023 at 20:50, Richard Sandiford >> >> wrote: >> >> > >> >> > Prathamesh Kulkarni writes: >> >> > >> >> > I have attached a patch that extends the transform if one half >> >> > >> >> > is dup >> >> > >> >> > and other is set of constants. >> >> > >> >> > For eg: >> >> > >> >> > int8x16_t f(int8_t x) >> >> > >> >> > { >> >> > >> >> > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, >> >> > >> >> > 7, x, 8 }; >> >> > >> >> > } >> >> > >> >> > >> >> > >> >> > code-gen trunk: >> >> > >> >> > f: >> >> > >> >> > adrpx1, .LC0 >> >> > >> >> > ldr q0, [x1, #:lo12:.LC0] >> >> > >> >> > ins v0.b[0], w0 >> >> > >> >> > ins v0.b[2], w0 >> >> > >> >> > ins v0.b[4], w0 >> >> > >> >> > ins v0.b[6], w0 >> >> > >> >> > ins v0.b[8], w0 >> >> > >> >> > ins v0.b[10], w0 >> >> > >> >> > ins v0.b[12], w0 >> >> > >> >> > ins v0.b[14], w0 >> >> > >> >> > ret >> >> > >> >> > >> >> > >> >> > code-gen with patch: >> >> > >> >> > f: >> >> > >> >> > dup v0.16b, w0 >> >> > >> >> > adrpx0, .LC0 >> >> > >> >> > ldr q1, [x0, #:lo12:.LC0] >> >> > >> >> > zip1v0.16b, v0.16b, v1.16b >> >> > >> >> > ret >> >> > >> >> > >> >> > >> >> > Bootstrapped+tested on aarch64-linux-gnu. >> >> > >> >> > Does it look OK ? >> >> > >> >> >> >> > >> >> Looks like a nice improvement. It'll need to wait for GCC 14 now >> >> > >> >> though. >> >> > >> >> >> >> > >> >> However, rather than handle this case specially, I think we >> >> > >> >> should instead >> >> > >> >> take a divide-and-conquer approach: split the initialiser into >> >> > >> >> even and >> >> > >> >> odd elements, find the best way of loading each part, then >> >> > >> >> compare the >> >> > >> >> cost of these sequences + ZIP with the cost of the fallback code >> >> > >> >> (the code >> >> > >> >> later in aarch64_expand_vector_init). >> >> > >> >> >> >> > >> >> For example, doing that would allow: >> >> > >> >> >> >> > >> >> { x, y, 0, y, 0, y, 0, y, 0, y } >> >> > >> >> >> >> > >> >> to be loaded more easily, even though the even elements aren't >> >> > >> >> wholly >> >> > >> >> constant. >> >> > >> > Hi Richard, >> >> > >> > I have attached a prototype patch based on the above approach. 
>> >> > >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case >> >> > >> > by generating >> >> > >> > same sequence, thus I removed that hunk, and improves the >> >> > >> > following cases: >> >> > >> > >> >> > >> > (a) >> >> > >> > int8x16_t f_s16(int8_t x) >> >> > >> > { >> >> > >> > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, >> >> > >> > x, 5, x, 6, x, 7, x, 8 }; >> >> > >> > } >> >> > >> > >> >> > >> > code-gen trunk: >> >> > >> > f_s16: >> >> > >> > adrpx1, .LC0 >> >> > >> > ldr q0, [x1, #:lo12:.LC0] >> >> > >> > ins v0.b[0], w0 >> >> > >> > ins v0.b[2], w0 >> >> > >> > ins v0.b[4], w0 >> >> > >> > ins v0.b[6], w0 >> >> > >> > ins v0.b[8], w0 >> >> > >> > ins v0.b[10], w0 >> >> > >> > ins v0.b[12], w0 >> >> > >> > ins v0.b[14], w0 >> >> > >> > ret >> >> > >> > >> >> > >> > code-gen with patch: >> >> > >> > f_s16: >> >> > >> > dup v0.16b, w0 >> >> > >> > adrpx0, .LC0 >> >> > >> > ldr q1, [x0, #:lo12:.LC0] >> >> > >> > zip1v0.16b, v0.16b, v1.16b >> >> > >> > ret >> >> > >> > >> >> > >> > (b) >> >> > >> > int8x16_t f_s16(int8_t x, int8_t y) >> >> > >> > { >> >> > >> > return (int8x16_t) { x, y, 1, y, 2, y, 3, y, >> >> > >> > 4, y, 5, y, 6, y, 7, y }; >> >> > >> > } >> >> > >> > >> >> > >> > code-gen trunk: >> >> > >> > f_s16: >> >> > >> > adrpx2, .LC0 >> >> > >> > ldr q0, [x2, #:lo12:.LC0] >> >> > >> > ins v0.b[0], w0 >> >> > >> > ins v0.b[1], w1 >> >> > >> > ins v0.b[3], w1 >> >> > >> > ins v0.b[5], w1 >> >> > >> > ins v0.b[7], w1 >> >> > >> > ins v0.b[9], w1 >> >> > >> > ins v0.b[11], w1 >> >> > >> > ins v0.b[13], w1 >> >> > >> > ins v0.b[15], w1 >> >> > >> > ret >> >> > >> > >> >> > >> > code-gen patch: >> >> > >> > f_s16: >> >> > >> > adrpx2, .LC0 >> >> > >> > dup v1.16b, w1 >> >> > >> > ldr q0, [x2, #:lo12:.LC0] >> >> > >> > ins v0.b[0], w0 >> >> > >> > zip1v0.16b, v0.16b, v1.16b >> >> > >> > ret >> >> > >> >> >> > >> Nice. >> >> > >> >> >> > >> > There are a couple of issues I have come across: >> >> > >> > (1)
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Fri, 3 Feb 2023 at 20:47, Richard Sandiford wrote: > > Prathamesh Kulkarni writes: > > On Fri, 3 Feb 2023 at 07:10, Prathamesh Kulkarni > > wrote: > >> > >> On Thu, 2 Feb 2023 at 20:50, Richard Sandiford > >> wrote: > >> > > >> > Prathamesh Kulkarni writes: > >> > >> >> > I have attached a patch that extends the transform if one half > >> > >> >> > is dup > >> > >> >> > and other is set of constants. > >> > >> >> > For eg: > >> > >> >> > int8x16_t f(int8_t x) > >> > >> >> > { > >> > >> >> > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, > >> > >> >> > x, 8 }; > >> > >> >> > } > >> > >> >> > > >> > >> >> > code-gen trunk: > >> > >> >> > f: > >> > >> >> > adrpx1, .LC0 > >> > >> >> > ldr q0, [x1, #:lo12:.LC0] > >> > >> >> > ins v0.b[0], w0 > >> > >> >> > ins v0.b[2], w0 > >> > >> >> > ins v0.b[4], w0 > >> > >> >> > ins v0.b[6], w0 > >> > >> >> > ins v0.b[8], w0 > >> > >> >> > ins v0.b[10], w0 > >> > >> >> > ins v0.b[12], w0 > >> > >> >> > ins v0.b[14], w0 > >> > >> >> > ret > >> > >> >> > > >> > >> >> > code-gen with patch: > >> > >> >> > f: > >> > >> >> > dup v0.16b, w0 > >> > >> >> > adrpx0, .LC0 > >> > >> >> > ldr q1, [x0, #:lo12:.LC0] > >> > >> >> > zip1v0.16b, v0.16b, v1.16b > >> > >> >> > ret > >> > >> >> > > >> > >> >> > Bootstrapped+tested on aarch64-linux-gnu. > >> > >> >> > Does it look OK ? > >> > >> >> > >> > >> >> Looks like a nice improvement. It'll need to wait for GCC 14 now > >> > >> >> though. > >> > >> >> > >> > >> >> However, rather than handle this case specially, I think we should > >> > >> >> instead > >> > >> >> take a divide-and-conquer approach: split the initialiser into > >> > >> >> even and > >> > >> >> odd elements, find the best way of loading each part, then compare > >> > >> >> the > >> > >> >> cost of these sequences + ZIP with the cost of the fallback code > >> > >> >> (the code > >> > >> >> later in aarch64_expand_vector_init). > >> > >> >> > >> > >> >> For example, doing that would allow: > >> > >> >> > >> > >> >> { x, y, 0, y, 0, y, 0, y, 0, y } > >> > >> >> > >> > >> >> to be loaded more easily, even though the even elements aren't > >> > >> >> wholly > >> > >> >> constant. > >> > >> > Hi Richard, > >> > >> > I have attached a prototype patch based on the above approach. 
> >> > >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by > >> > >> > generating > >> > >> > same sequence, thus I removed that hunk, and improves the following > >> > >> > cases: > >> > >> > > >> > >> > (a) > >> > >> > int8x16_t f_s16(int8_t x) > >> > >> > { > >> > >> > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, > >> > >> > x, 5, x, 6, x, 7, x, 8 }; > >> > >> > } > >> > >> > > >> > >> > code-gen trunk: > >> > >> > f_s16: > >> > >> > adrpx1, .LC0 > >> > >> > ldr q0, [x1, #:lo12:.LC0] > >> > >> > ins v0.b[0], w0 > >> > >> > ins v0.b[2], w0 > >> > >> > ins v0.b[4], w0 > >> > >> > ins v0.b[6], w0 > >> > >> > ins v0.b[8], w0 > >> > >> > ins v0.b[10], w0 > >> > >> > ins v0.b[12], w0 > >> > >> > ins v0.b[14], w0 > >> > >> > ret > >> > >> > > >> > >> > code-gen with patch: > >> > >> > f_s16: > >> > >> > dup v0.16b, w0 > >> > >> > adrpx0, .LC0 > >> > >> > ldr q1, [x0, #:lo12:.LC0] > >> > >> > zip1v0.16b, v0.16b, v1.16b > >> > >> > ret > >> > >> > > >> > >> > (b) > >> > >> > int8x16_t f_s16(int8_t x, int8_t y) > >> > >> > { > >> > >> > return (int8x16_t) { x, y, 1, y, 2, y, 3, y, > >> > >> > 4, y, 5, y, 6, y, 7, y }; > >> > >> > } > >> > >> > > >> > >> > code-gen trunk: > >> > >> > f_s16: > >> > >> > adrpx2, .LC0 > >> > >> > ldr q0, [x2, #:lo12:.LC0] > >> > >> > ins v0.b[0], w0 > >> > >> > ins v0.b[1], w1 > >> > >> > ins v0.b[3], w1 > >> > >> > ins v0.b[5], w1 > >> > >> > ins v0.b[7], w1 > >> > >> > ins v0.b[9], w1 > >> > >> > ins v0.b[11], w1 > >> > >> > ins v0.b[13], w1 > >> > >> > ins v0.b[15], w1 > >> > >> > ret > >> > >> > > >> > >> > code-gen patch: > >> > >> > f_s16: > >> > >> > adrpx2, .LC0 > >> > >> > dup v1.16b, w1 > >> > >> > ldr q0, [x2, #:lo12:.LC0] > >> > >> > ins v0.b[0], w0 > >> > >> > zip1v0.16b, v0.16b, v1.16b > >> > >> > ret > >> > >> > >> > >> Nice. > >> > >> > >> > >> > There are a couple of issues I have come across: > >> > >> > (1) Choosing element to pad vector. > >> > >> > For eg, if we are initiailizing a vector say { x, y, 0, y, 1, y, 2, > >> > >> > y } > >> > >> > with mode V8HI. > >> > >>
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
Prathamesh Kulkarni writes: > On Fri, 3 Feb 2023 at 07:10, Prathamesh Kulkarni > wrote: >> >> On Thu, 2 Feb 2023 at 20:50, Richard Sandiford >> wrote: >> > >> > Prathamesh Kulkarni writes: >> > >> >> > I have attached a patch that extends the transform if one half is >> > >> >> > dup >> > >> >> > and other is set of constants. >> > >> >> > For eg: >> > >> >> > int8x16_t f(int8_t x) >> > >> >> > { >> > >> >> > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, >> > >> >> > x, 8 }; >> > >> >> > } >> > >> >> > >> > >> >> > code-gen trunk: >> > >> >> > f: >> > >> >> > adrpx1, .LC0 >> > >> >> > ldr q0, [x1, #:lo12:.LC0] >> > >> >> > ins v0.b[0], w0 >> > >> >> > ins v0.b[2], w0 >> > >> >> > ins v0.b[4], w0 >> > >> >> > ins v0.b[6], w0 >> > >> >> > ins v0.b[8], w0 >> > >> >> > ins v0.b[10], w0 >> > >> >> > ins v0.b[12], w0 >> > >> >> > ins v0.b[14], w0 >> > >> >> > ret >> > >> >> > >> > >> >> > code-gen with patch: >> > >> >> > f: >> > >> >> > dup v0.16b, w0 >> > >> >> > adrpx0, .LC0 >> > >> >> > ldr q1, [x0, #:lo12:.LC0] >> > >> >> > zip1v0.16b, v0.16b, v1.16b >> > >> >> > ret >> > >> >> > >> > >> >> > Bootstrapped+tested on aarch64-linux-gnu. >> > >> >> > Does it look OK ? >> > >> >> >> > >> >> Looks like a nice improvement. It'll need to wait for GCC 14 now >> > >> >> though. >> > >> >> >> > >> >> However, rather than handle this case specially, I think we should >> > >> >> instead >> > >> >> take a divide-and-conquer approach: split the initialiser into even >> > >> >> and >> > >> >> odd elements, find the best way of loading each part, then compare >> > >> >> the >> > >> >> cost of these sequences + ZIP with the cost of the fallback code >> > >> >> (the code >> > >> >> later in aarch64_expand_vector_init). >> > >> >> >> > >> >> For example, doing that would allow: >> > >> >> >> > >> >> { x, y, 0, y, 0, y, 0, y, 0, y } >> > >> >> >> > >> >> to be loaded more easily, even though the even elements aren't wholly >> > >> >> constant. >> > >> > Hi Richard, >> > >> > I have attached a prototype patch based on the above approach. 
>> > >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by >> > >> > generating >> > >> > same sequence, thus I removed that hunk, and improves the following >> > >> > cases: >> > >> > >> > >> > (a) >> > >> > int8x16_t f_s16(int8_t x) >> > >> > { >> > >> > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, >> > >> > x, 5, x, 6, x, 7, x, 8 }; >> > >> > } >> > >> > >> > >> > code-gen trunk: >> > >> > f_s16: >> > >> > adrpx1, .LC0 >> > >> > ldr q0, [x1, #:lo12:.LC0] >> > >> > ins v0.b[0], w0 >> > >> > ins v0.b[2], w0 >> > >> > ins v0.b[4], w0 >> > >> > ins v0.b[6], w0 >> > >> > ins v0.b[8], w0 >> > >> > ins v0.b[10], w0 >> > >> > ins v0.b[12], w0 >> > >> > ins v0.b[14], w0 >> > >> > ret >> > >> > >> > >> > code-gen with patch: >> > >> > f_s16: >> > >> > dup v0.16b, w0 >> > >> > adrpx0, .LC0 >> > >> > ldr q1, [x0, #:lo12:.LC0] >> > >> > zip1v0.16b, v0.16b, v1.16b >> > >> > ret >> > >> > >> > >> > (b) >> > >> > int8x16_t f_s16(int8_t x, int8_t y) >> > >> > { >> > >> > return (int8x16_t) { x, y, 1, y, 2, y, 3, y, >> > >> > 4, y, 5, y, 6, y, 7, y }; >> > >> > } >> > >> > >> > >> > code-gen trunk: >> > >> > f_s16: >> > >> > adrpx2, .LC0 >> > >> > ldr q0, [x2, #:lo12:.LC0] >> > >> > ins v0.b[0], w0 >> > >> > ins v0.b[1], w1 >> > >> > ins v0.b[3], w1 >> > >> > ins v0.b[5], w1 >> > >> > ins v0.b[7], w1 >> > >> > ins v0.b[9], w1 >> > >> > ins v0.b[11], w1 >> > >> > ins v0.b[13], w1 >> > >> > ins v0.b[15], w1 >> > >> > ret >> > >> > >> > >> > code-gen patch: >> > >> > f_s16: >> > >> > adrpx2, .LC0 >> > >> > dup v1.16b, w1 >> > >> > ldr q0, [x2, #:lo12:.LC0] >> > >> > ins v0.b[0], w0 >> > >> > zip1v0.16b, v0.16b, v1.16b >> > >> > ret >> > >> >> > >> Nice. >> > >> >> > >> > There are a couple of issues I have come across: >> > >> > (1) Choosing element to pad vector. >> > >> > For eg, if we are initiailizing a vector say { x, y, 0, y, 1, y, 2, y >> > >> > } >> > >> > with mode V8HI. >> > >> > We split it into { x, 0, 1, 2 } and { y, y, y, y} >> > >> > However since the mode is V8HI, we would need to pad the above split >> > >> > vectors >> > >> > with 4 more elements to match up to vector length. >> > >> > For {x, 0, 1, 2} using any constant is the obvious choice while for >> > >> > {y, y, y, y} >> > >> > using 'y' is the obvio
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Fri, 3 Feb 2023 at 07:10, Prathamesh Kulkarni wrote: > > On Thu, 2 Feb 2023 at 20:50, Richard Sandiford > wrote: > > > > Prathamesh Kulkarni writes: > > >> >> > I have attached a patch that extends the transform if one half is > > >> >> > dup > > >> >> > and other is set of constants. > > >> >> > For eg: > > >> >> > int8x16_t f(int8_t x) > > >> >> > { > > >> >> > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, > > >> >> > 8 }; > > >> >> > } > > >> >> > > > >> >> > code-gen trunk: > > >> >> > f: > > >> >> > adrpx1, .LC0 > > >> >> > ldr q0, [x1, #:lo12:.LC0] > > >> >> > ins v0.b[0], w0 > > >> >> > ins v0.b[2], w0 > > >> >> > ins v0.b[4], w0 > > >> >> > ins v0.b[6], w0 > > >> >> > ins v0.b[8], w0 > > >> >> > ins v0.b[10], w0 > > >> >> > ins v0.b[12], w0 > > >> >> > ins v0.b[14], w0 > > >> >> > ret > > >> >> > > > >> >> > code-gen with patch: > > >> >> > f: > > >> >> > dup v0.16b, w0 > > >> >> > adrpx0, .LC0 > > >> >> > ldr q1, [x0, #:lo12:.LC0] > > >> >> > zip1v0.16b, v0.16b, v1.16b > > >> >> > ret > > >> >> > > > >> >> > Bootstrapped+tested on aarch64-linux-gnu. > > >> >> > Does it look OK ? > > >> >> > > >> >> Looks like a nice improvement. It'll need to wait for GCC 14 now > > >> >> though. > > >> >> > > >> >> However, rather than handle this case specially, I think we should > > >> >> instead > > >> >> take a divide-and-conquer approach: split the initialiser into even > > >> >> and > > >> >> odd elements, find the best way of loading each part, then compare the > > >> >> cost of these sequences + ZIP with the cost of the fallback code (the > > >> >> code > > >> >> later in aarch64_expand_vector_init). > > >> >> > > >> >> For example, doing that would allow: > > >> >> > > >> >> { x, y, 0, y, 0, y, 0, y, 0, y } > > >> >> > > >> >> to be loaded more easily, even though the even elements aren't wholly > > >> >> constant. > > >> > Hi Richard, > > >> > I have attached a prototype patch based on the above approach. 
> > >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by > > >> > generating > > >> > same sequence, thus I removed that hunk, and improves the following > > >> > cases: > > >> > > > >> > (a) > > >> > int8x16_t f_s16(int8_t x) > > >> > { > > >> > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, > > >> > x, 5, x, 6, x, 7, x, 8 }; > > >> > } > > >> > > > >> > code-gen trunk: > > >> > f_s16: > > >> > adrpx1, .LC0 > > >> > ldr q0, [x1, #:lo12:.LC0] > > >> > ins v0.b[0], w0 > > >> > ins v0.b[2], w0 > > >> > ins v0.b[4], w0 > > >> > ins v0.b[6], w0 > > >> > ins v0.b[8], w0 > > >> > ins v0.b[10], w0 > > >> > ins v0.b[12], w0 > > >> > ins v0.b[14], w0 > > >> > ret > > >> > > > >> > code-gen with patch: > > >> > f_s16: > > >> > dup v0.16b, w0 > > >> > adrpx0, .LC0 > > >> > ldr q1, [x0, #:lo12:.LC0] > > >> > zip1v0.16b, v0.16b, v1.16b > > >> > ret > > >> > > > >> > (b) > > >> > int8x16_t f_s16(int8_t x, int8_t y) > > >> > { > > >> > return (int8x16_t) { x, y, 1, y, 2, y, 3, y, > > >> > 4, y, 5, y, 6, y, 7, y }; > > >> > } > > >> > > > >> > code-gen trunk: > > >> > f_s16: > > >> > adrpx2, .LC0 > > >> > ldr q0, [x2, #:lo12:.LC0] > > >> > ins v0.b[0], w0 > > >> > ins v0.b[1], w1 > > >> > ins v0.b[3], w1 > > >> > ins v0.b[5], w1 > > >> > ins v0.b[7], w1 > > >> > ins v0.b[9], w1 > > >> > ins v0.b[11], w1 > > >> > ins v0.b[13], w1 > > >> > ins v0.b[15], w1 > > >> > ret > > >> > > > >> > code-gen patch: > > >> > f_s16: > > >> > adrpx2, .LC0 > > >> > dup v1.16b, w1 > > >> > ldr q0, [x2, #:lo12:.LC0] > > >> > ins v0.b[0], w0 > > >> > zip1v0.16b, v0.16b, v1.16b > > >> > ret > > >> > > >> Nice. > > >> > > >> > There are a couple of issues I have come across: > > >> > (1) Choosing element to pad vector. > > >> > For eg, if we are initiailizing a vector say { x, y, 0, y, 1, y, 2, y } > > >> > with mode V8HI. > > >> > We split it into { x, 0, 1, 2 } and { y, y, y, y} > > >> > However since the mode is V8HI, we would need to pad the above split > > >> > vectors > > >> > with 4 more elements to match up to vector length. > > >> > For {x, 0, 1, 2} using any constant is the obvious choice while for > > >> > {y, y, y, y} > > >> > using 'y' is the obvious choice thus making them: > > >> > {x, 0, 1, 2, 0, 0, 0, 0} and {y, y, y, y, y, y, y, y} > > >> > These would be then merged using zip1 which would discard the lower > > >> > half > > >> > of
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Thu, 2 Feb 2023 at 20:50, Richard Sandiford wrote: > > Prathamesh Kulkarni writes: > >> >> > I have attached a patch that extends the transform if one half is dup > >> >> > and other is set of constants. > >> >> > For eg: > >> >> > int8x16_t f(int8_t x) > >> >> > { > >> >> > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 > >> >> > }; > >> >> > } > >> >> > > >> >> > code-gen trunk: > >> >> > f: > >> >> > adrpx1, .LC0 > >> >> > ldr q0, [x1, #:lo12:.LC0] > >> >> > ins v0.b[0], w0 > >> >> > ins v0.b[2], w0 > >> >> > ins v0.b[4], w0 > >> >> > ins v0.b[6], w0 > >> >> > ins v0.b[8], w0 > >> >> > ins v0.b[10], w0 > >> >> > ins v0.b[12], w0 > >> >> > ins v0.b[14], w0 > >> >> > ret > >> >> > > >> >> > code-gen with patch: > >> >> > f: > >> >> > dup v0.16b, w0 > >> >> > adrpx0, .LC0 > >> >> > ldr q1, [x0, #:lo12:.LC0] > >> >> > zip1v0.16b, v0.16b, v1.16b > >> >> > ret > >> >> > > >> >> > Bootstrapped+tested on aarch64-linux-gnu. > >> >> > Does it look OK ? > >> >> > >> >> Looks like a nice improvement. It'll need to wait for GCC 14 now > >> >> though. > >> >> > >> >> However, rather than handle this case specially, I think we should > >> >> instead > >> >> take a divide-and-conquer approach: split the initialiser into even and > >> >> odd elements, find the best way of loading each part, then compare the > >> >> cost of these sequences + ZIP with the cost of the fallback code (the > >> >> code > >> >> later in aarch64_expand_vector_init). > >> >> > >> >> For example, doing that would allow: > >> >> > >> >> { x, y, 0, y, 0, y, 0, y, 0, y } > >> >> > >> >> to be loaded more easily, even though the even elements aren't wholly > >> >> constant. > >> > Hi Richard, > >> > I have attached a prototype patch based on the above approach. > >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by > >> > generating > >> > same sequence, thus I removed that hunk, and improves the following > >> > cases: > >> > > >> > (a) > >> > int8x16_t f_s16(int8_t x) > >> > { > >> > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, > >> > x, 5, x, 6, x, 7, x, 8 }; > >> > } > >> > > >> > code-gen trunk: > >> > f_s16: > >> > adrpx1, .LC0 > >> > ldr q0, [x1, #:lo12:.LC0] > >> > ins v0.b[0], w0 > >> > ins v0.b[2], w0 > >> > ins v0.b[4], w0 > >> > ins v0.b[6], w0 > >> > ins v0.b[8], w0 > >> > ins v0.b[10], w0 > >> > ins v0.b[12], w0 > >> > ins v0.b[14], w0 > >> > ret > >> > > >> > code-gen with patch: > >> > f_s16: > >> > dup v0.16b, w0 > >> > adrpx0, .LC0 > >> > ldr q1, [x0, #:lo12:.LC0] > >> > zip1v0.16b, v0.16b, v1.16b > >> > ret > >> > > >> > (b) > >> > int8x16_t f_s16(int8_t x, int8_t y) > >> > { > >> > return (int8x16_t) { x, y, 1, y, 2, y, 3, y, > >> > 4, y, 5, y, 6, y, 7, y }; > >> > } > >> > > >> > code-gen trunk: > >> > f_s16: > >> > adrpx2, .LC0 > >> > ldr q0, [x2, #:lo12:.LC0] > >> > ins v0.b[0], w0 > >> > ins v0.b[1], w1 > >> > ins v0.b[3], w1 > >> > ins v0.b[5], w1 > >> > ins v0.b[7], w1 > >> > ins v0.b[9], w1 > >> > ins v0.b[11], w1 > >> > ins v0.b[13], w1 > >> > ins v0.b[15], w1 > >> > ret > >> > > >> > code-gen patch: > >> > f_s16: > >> > adrpx2, .LC0 > >> > dup v1.16b, w1 > >> > ldr q0, [x2, #:lo12:.LC0] > >> > ins v0.b[0], w0 > >> > zip1v0.16b, v0.16b, v1.16b > >> > ret > >> > >> Nice. > >> > >> > There are a couple of issues I have come across: > >> > (1) Choosing element to pad vector. > >> > For eg, if we are initiailizing a vector say { x, y, 0, y, 1, y, 2, y } > >> > with mode V8HI. 
> >> > We split it into { x, 0, 1, 2 } and { y, y, y, y} > >> > However since the mode is V8HI, we would need to pad the above split > >> > vectors > >> > with 4 more elements to match up to vector length. > >> > For {x, 0, 1, 2} using any constant is the obvious choice while for {y, > >> > y, y, y} > >> > using 'y' is the obvious choice thus making them: > >> > {x, 0, 1, 2, 0, 0, 0, 0} and {y, y, y, y, y, y, y, y} > >> > These would be then merged using zip1 which would discard the lower half > >> > of both vectors. > >> > Currently I encoded the above two heuristics in > >> > aarch64_expand_vector_init_get_padded_elem: > >> > (a) If split portion contains a constant, use the constant to pad the > >> > vector. > >> > (b) If split portion only contains variables, then use the most > >> > frequently repeating variable > >> > to pad the vector. > >> > I suppose tho
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
Prathamesh Kulkarni writes: >> >> > I have attached a patch that extends the transform if one half is dup >> >> > and other is set of constants. >> >> > For eg: >> >> > int8x16_t f(int8_t x) >> >> > { >> >> > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 }; >> >> > } >> >> > >> >> > code-gen trunk: >> >> > f: >> >> > adrpx1, .LC0 >> >> > ldr q0, [x1, #:lo12:.LC0] >> >> > ins v0.b[0], w0 >> >> > ins v0.b[2], w0 >> >> > ins v0.b[4], w0 >> >> > ins v0.b[6], w0 >> >> > ins v0.b[8], w0 >> >> > ins v0.b[10], w0 >> >> > ins v0.b[12], w0 >> >> > ins v0.b[14], w0 >> >> > ret >> >> > >> >> > code-gen with patch: >> >> > f: >> >> > dup v0.16b, w0 >> >> > adrpx0, .LC0 >> >> > ldr q1, [x0, #:lo12:.LC0] >> >> > zip1v0.16b, v0.16b, v1.16b >> >> > ret >> >> > >> >> > Bootstrapped+tested on aarch64-linux-gnu. >> >> > Does it look OK ? >> >> >> >> Looks like a nice improvement. It'll need to wait for GCC 14 now though. >> >> >> >> However, rather than handle this case specially, I think we should instead >> >> take a divide-and-conquer approach: split the initialiser into even and >> >> odd elements, find the best way of loading each part, then compare the >> >> cost of these sequences + ZIP with the cost of the fallback code (the code >> >> later in aarch64_expand_vector_init). >> >> >> >> For example, doing that would allow: >> >> >> >> { x, y, 0, y, 0, y, 0, y, 0, y } >> >> >> >> to be loaded more easily, even though the even elements aren't wholly >> >> constant. >> > Hi Richard, >> > I have attached a prototype patch based on the above approach. >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by >> > generating >> > same sequence, thus I removed that hunk, and improves the following cases: >> > >> > (a) >> > int8x16_t f_s16(int8_t x) >> > { >> > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, >> > x, 5, x, 6, x, 7, x, 8 }; >> > } >> > >> > code-gen trunk: >> > f_s16: >> > adrpx1, .LC0 >> > ldr q0, [x1, #:lo12:.LC0] >> > ins v0.b[0], w0 >> > ins v0.b[2], w0 >> > ins v0.b[4], w0 >> > ins v0.b[6], w0 >> > ins v0.b[8], w0 >> > ins v0.b[10], w0 >> > ins v0.b[12], w0 >> > ins v0.b[14], w0 >> > ret >> > >> > code-gen with patch: >> > f_s16: >> > dup v0.16b, w0 >> > adrpx0, .LC0 >> > ldr q1, [x0, #:lo12:.LC0] >> > zip1v0.16b, v0.16b, v1.16b >> > ret >> > >> > (b) >> > int8x16_t f_s16(int8_t x, int8_t y) >> > { >> > return (int8x16_t) { x, y, 1, y, 2, y, 3, y, >> > 4, y, 5, y, 6, y, 7, y }; >> > } >> > >> > code-gen trunk: >> > f_s16: >> > adrpx2, .LC0 >> > ldr q0, [x2, #:lo12:.LC0] >> > ins v0.b[0], w0 >> > ins v0.b[1], w1 >> > ins v0.b[3], w1 >> > ins v0.b[5], w1 >> > ins v0.b[7], w1 >> > ins v0.b[9], w1 >> > ins v0.b[11], w1 >> > ins v0.b[13], w1 >> > ins v0.b[15], w1 >> > ret >> > >> > code-gen patch: >> > f_s16: >> > adrpx2, .LC0 >> > dup v1.16b, w1 >> > ldr q0, [x2, #:lo12:.LC0] >> > ins v0.b[0], w0 >> > zip1v0.16b, v0.16b, v1.16b >> > ret >> >> Nice. >> >> > There are a couple of issues I have come across: >> > (1) Choosing element to pad vector. >> > For eg, if we are initiailizing a vector say { x, y, 0, y, 1, y, 2, y } >> > with mode V8HI. >> > We split it into { x, 0, 1, 2 } and { y, y, y, y} >> > However since the mode is V8HI, we would need to pad the above split >> > vectors >> > with 4 more elements to match up to vector length. 
>> > For {x, 0, 1, 2} using any constant is the obvious choice while for {y, y, >> > y, y} >> > using 'y' is the obvious choice thus making them: >> > {x, 0, 1, 2, 0, 0, 0, 0} and {y, y, y, y, y, y, y, y} >> > These would be then merged using zip1 which would discard the lower half >> > of both vectors. >> > Currently I encoded the above two heuristics in >> > aarch64_expand_vector_init_get_padded_elem: >> > (a) If split portion contains a constant, use the constant to pad the >> > vector. >> > (b) If split portion only contains variables, then use the most >> > frequently repeating variable >> > to pad the vector. >> > I suppose tho this could be improved ? >> >> I think we should just build two 64-bit vectors (V4HIs) and use a subreg >> to fill the upper elements with undefined values. >> >> I suppose in principle we would have the same problem when splitting >> a 64-bit vector into 2 32-bit vectors, but it's probably better to punt >> on that for now. Eventually it would be worth adding full support fo
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Wed, 1 Feb 2023 at 21:56, Richard Sandiford wrote: > > Prathamesh Kulkarni writes: > > On Thu, 12 Jan 2023 at 21:21, Richard Sandiford > > wrote: > >> > >> Prathamesh Kulkarni writes: > >> > On Tue, 6 Dec 2022 at 07:01, Prathamesh Kulkarni > >> > wrote: > >> >> > >> >> On Mon, 5 Dec 2022 at 16:50, Richard Sandiford > >> >> wrote: > >> >> > > >> >> > Richard Sandiford via Gcc-patches writes: > >> >> > > Prathamesh Kulkarni writes: > >> >> > >> Hi, > >> >> > >> For the following test-case: > >> >> > >> > >> >> > >> int16x8_t foo(int16_t x, int16_t y) > >> >> > >> { > >> >> > >> return (int16x8_t) { x, y, x, y, x, y, x, y }; > >> >> > >> } > >> >> > >> > >> >> > >> Code gen at -O3: > >> >> > >> foo: > >> >> > >> dupv0.8h, w0 > >> >> > >> ins v0.h[1], w1 > >> >> > >> ins v0.h[3], w1 > >> >> > >> ins v0.h[5], w1 > >> >> > >> ins v0.h[7], w1 > >> >> > >> ret > >> >> > >> > >> >> > >> For 16 elements, it results in 8 ins instructions which might not > >> >> > >> be > >> >> > >> optimal perhaps. > >> >> > >> I guess, the above code-gen would be equivalent to the following ? > >> >> > >> dup v0.8h, w0 > >> >> > >> dup v1.8h, w1 > >> >> > >> zip1 v0.8h, v0.8h, v1.8h > >> >> > >> > >> >> > >> I have attached patch to do the same, if number of elements >= 8, > >> >> > >> which should be possibly better compared to current code-gen ? > >> >> > >> Patch passes bootstrap+test on aarch64-linux-gnu. > >> >> > >> Does the patch look OK ? > >> >> > >> > >> >> > >> Thanks, > >> >> > >> Prathamesh > >> >> > >> > >> >> > >> diff --git a/gcc/config/aarch64/aarch64.cc > >> >> > >> b/gcc/config/aarch64/aarch64.cc > >> >> > >> index c91df6f5006..e5dea70e363 100644 > >> >> > >> --- a/gcc/config/aarch64/aarch64.cc > >> >> > >> +++ b/gcc/config/aarch64/aarch64.cc > >> >> > >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, > >> >> > >> rtx vals) > >> >> > >>return; > >> >> > >> } > >> >> > >> > >> >> > >> + /* Check for interleaving case. > >> >> > >> + For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, > >> >> > >> y}. > >> >> > >> + Generate following code: > >> >> > >> + dup v0.h, x > >> >> > >> + dup v1.h, y > >> >> > >> + zip1 v0.h, v0.h, v1.h > >> >> > >> + for "large enough" initializer. */ > >> >> > >> + > >> >> > >> + if (n_elts >= 8) > >> >> > >> +{ > >> >> > >> + int i; > >> >> > >> + for (i = 2; i < n_elts; i++) > >> >> > >> +if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % > >> >> > >> 2))) > >> >> > >> + break; > >> >> > >> + > >> >> > >> + if (i == n_elts) > >> >> > >> +{ > >> >> > >> + machine_mode mode = GET_MODE (target); > >> >> > >> + rtx dest[2]; > >> >> > >> + > >> >> > >> + for (int i = 0; i < 2; i++) > >> >> > >> +{ > >> >> > >> + rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), > >> >> > >> XVECEXP (vals, 0, i)); > >> >> > > > >> >> > > Formatting nit: long line. > >> >> > > > >> >> > >> + dest[i] = gen_reg_rtx (mode); > >> >> > >> + aarch64_emit_move (dest[i], gen_vec_duplicate (mode, > >> >> > >> x)); > >> >> > >> +} > >> >> > > > >> >> > > This could probably be written: > >> >> > > > >> >> > > for (int i = 0; i < 2; i++) > >> >> > > { > >> >> > > rtx x = expand_vector_broadcast (mode, XVECEXP (vals, > >> >> > > 0, i)); > >> >> > > dest[i] = force_reg (GET_MODE_INNER (mode), x); > >> >> > > >> >> > Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry. > >> >> Thanks, I have pushed the change in > >> >> 769370f3e2e04823c8a621d8ffa756dd83ebf21e after running > >> >> bootstrap+test on aarch64-linux-gnu. 
> >> > Hi Richard, > >> > I have attached a patch that extends the transform if one half is dup > >> > and other is set of constants. > >> > For eg: > >> > int8x16_t f(int8_t x) > >> > { > >> > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 }; > >> > } > >> > > >> > code-gen trunk: > >> > f: > >> > adrpx1, .LC0 > >> > ldr q0, [x1, #:lo12:.LC0] > >> > ins v0.b[0], w0 > >> > ins v0.b[2], w0 > >> > ins v0.b[4], w0 > >> > ins v0.b[6], w0 > >> > ins v0.b[8], w0 > >> > ins v0.b[10], w0 > >> > ins v0.b[12], w0 > >> > ins v0.b[14], w0 > >> > ret > >> > > >> > code-gen with patch: > >> > f: > >> > dup v0.16b, w0 > >> > adrpx0, .LC0 > >> > ldr q1, [x0, #:lo12:.LC0] > >> > zip1v0.16b, v0.16b, v1.16b > >> > ret > >> > > >> > Bootstrapped+tested on aarch64-linux-gnu. > >> > Does it look OK ? > >> > >> Looks like a nice improvement. It'll need to wait for GCC 14 now though. > >> > >> However, rather than handle this case specially, I think we should instead > >> take a divide-and-conquer approach: split the initialis
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
Prathamesh Kulkarni writes: > On Thu, 12 Jan 2023 at 21:21, Richard Sandiford > wrote: >> >> Prathamesh Kulkarni writes: >> > On Tue, 6 Dec 2022 at 07:01, Prathamesh Kulkarni >> > wrote: >> >> >> >> On Mon, 5 Dec 2022 at 16:50, Richard Sandiford >> >> wrote: >> >> > >> >> > Richard Sandiford via Gcc-patches writes: >> >> > > Prathamesh Kulkarni writes: >> >> > >> Hi, >> >> > >> For the following test-case: >> >> > >> >> >> > >> int16x8_t foo(int16_t x, int16_t y) >> >> > >> { >> >> > >> return (int16x8_t) { x, y, x, y, x, y, x, y }; >> >> > >> } >> >> > >> >> >> > >> Code gen at -O3: >> >> > >> foo: >> >> > >> dupv0.8h, w0 >> >> > >> ins v0.h[1], w1 >> >> > >> ins v0.h[3], w1 >> >> > >> ins v0.h[5], w1 >> >> > >> ins v0.h[7], w1 >> >> > >> ret >> >> > >> >> >> > >> For 16 elements, it results in 8 ins instructions which might not be >> >> > >> optimal perhaps. >> >> > >> I guess, the above code-gen would be equivalent to the following ? >> >> > >> dup v0.8h, w0 >> >> > >> dup v1.8h, w1 >> >> > >> zip1 v0.8h, v0.8h, v1.8h >> >> > >> >> >> > >> I have attached patch to do the same, if number of elements >= 8, >> >> > >> which should be possibly better compared to current code-gen ? >> >> > >> Patch passes bootstrap+test on aarch64-linux-gnu. >> >> > >> Does the patch look OK ? >> >> > >> >> >> > >> Thanks, >> >> > >> Prathamesh >> >> > >> >> >> > >> diff --git a/gcc/config/aarch64/aarch64.cc >> >> > >> b/gcc/config/aarch64/aarch64.cc >> >> > >> index c91df6f5006..e5dea70e363 100644 >> >> > >> --- a/gcc/config/aarch64/aarch64.cc >> >> > >> +++ b/gcc/config/aarch64/aarch64.cc >> >> > >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx >> >> > >> vals) >> >> > >>return; >> >> > >> } >> >> > >> >> >> > >> + /* Check for interleaving case. >> >> > >> + For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}. >> >> > >> + Generate following code: >> >> > >> + dup v0.h, x >> >> > >> + dup v1.h, y >> >> > >> + zip1 v0.h, v0.h, v1.h >> >> > >> + for "large enough" initializer. */ >> >> > >> + >> >> > >> + if (n_elts >= 8) >> >> > >> +{ >> >> > >> + int i; >> >> > >> + for (i = 2; i < n_elts; i++) >> >> > >> +if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % >> >> > >> 2))) >> >> > >> + break; >> >> > >> + >> >> > >> + if (i == n_elts) >> >> > >> +{ >> >> > >> + machine_mode mode = GET_MODE (target); >> >> > >> + rtx dest[2]; >> >> > >> + >> >> > >> + for (int i = 0; i < 2; i++) >> >> > >> +{ >> >> > >> + rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP >> >> > >> (vals, 0, i)); >> >> > > >> >> > > Formatting nit: long line. >> >> > > >> >> > >> + dest[i] = gen_reg_rtx (mode); >> >> > >> + aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x)); >> >> > >> +} >> >> > > >> >> > > This could probably be written: >> >> > > >> >> > > for (int i = 0; i < 2; i++) >> >> > > { >> >> > > rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, >> >> > > i)); >> >> > > dest[i] = force_reg (GET_MODE_INNER (mode), x); >> >> > >> >> > Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry. >> >> Thanks, I have pushed the change in >> >> 769370f3e2e04823c8a621d8ffa756dd83ebf21e after running >> >> bootstrap+test on aarch64-linux-gnu. >> > Hi Richard, >> > I have attached a patch that extends the transform if one half is dup >> > and other is set of constants. 
>> > For eg: >> > int8x16_t f(int8_t x) >> > { >> > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 }; >> > } >> > >> > code-gen trunk: >> > f: >> > adrpx1, .LC0 >> > ldr q0, [x1, #:lo12:.LC0] >> > ins v0.b[0], w0 >> > ins v0.b[2], w0 >> > ins v0.b[4], w0 >> > ins v0.b[6], w0 >> > ins v0.b[8], w0 >> > ins v0.b[10], w0 >> > ins v0.b[12], w0 >> > ins v0.b[14], w0 >> > ret >> > >> > code-gen with patch: >> > f: >> > dup v0.16b, w0 >> > adrpx0, .LC0 >> > ldr q1, [x0, #:lo12:.LC0] >> > zip1v0.16b, v0.16b, v1.16b >> > ret >> > >> > Bootstrapped+tested on aarch64-linux-gnu. >> > Does it look OK ? >> >> Looks like a nice improvement. It'll need to wait for GCC 14 now though. >> >> However, rather than handle this case specially, I think we should instead >> take a divide-and-conquer approach: split the initialiser into even and >> odd elements, find the best way of loading each part, then compare the >> cost of these sequences + ZIP with the cost of the fallback code (the code >> later in aarch64_expand_vector_init). >> >> For example, doing that would allow: >> >> { x, y, 0, y, 0, y, 0, y, 0, y } >> >> to be loaded more easily, even though the even elements aren't wholly >>
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Thu, 12 Jan 2023 at 21:21, Richard Sandiford wrote: > > Prathamesh Kulkarni writes: > > On Tue, 6 Dec 2022 at 07:01, Prathamesh Kulkarni > > wrote: > >> > >> On Mon, 5 Dec 2022 at 16:50, Richard Sandiford > >> wrote: > >> > > >> > Richard Sandiford via Gcc-patches writes: > >> > > Prathamesh Kulkarni writes: > >> > >> Hi, > >> > >> For the following test-case: > >> > >> > >> > >> int16x8_t foo(int16_t x, int16_t y) > >> > >> { > >> > >> return (int16x8_t) { x, y, x, y, x, y, x, y }; > >> > >> } > >> > >> > >> > >> Code gen at -O3: > >> > >> foo: > >> > >> dupv0.8h, w0 > >> > >> ins v0.h[1], w1 > >> > >> ins v0.h[3], w1 > >> > >> ins v0.h[5], w1 > >> > >> ins v0.h[7], w1 > >> > >> ret > >> > >> > >> > >> For 16 elements, it results in 8 ins instructions which might not be > >> > >> optimal perhaps. > >> > >> I guess, the above code-gen would be equivalent to the following ? > >> > >> dup v0.8h, w0 > >> > >> dup v1.8h, w1 > >> > >> zip1 v0.8h, v0.8h, v1.8h > >> > >> > >> > >> I have attached patch to do the same, if number of elements >= 8, > >> > >> which should be possibly better compared to current code-gen ? > >> > >> Patch passes bootstrap+test on aarch64-linux-gnu. > >> > >> Does the patch look OK ? > >> > >> > >> > >> Thanks, > >> > >> Prathamesh > >> > >> > >> > >> diff --git a/gcc/config/aarch64/aarch64.cc > >> > >> b/gcc/config/aarch64/aarch64.cc > >> > >> index c91df6f5006..e5dea70e363 100644 > >> > >> --- a/gcc/config/aarch64/aarch64.cc > >> > >> +++ b/gcc/config/aarch64/aarch64.cc > >> > >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx > >> > >> vals) > >> > >>return; > >> > >> } > >> > >> > >> > >> + /* Check for interleaving case. > >> > >> + For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}. > >> > >> + Generate following code: > >> > >> + dup v0.h, x > >> > >> + dup v1.h, y > >> > >> + zip1 v0.h, v0.h, v1.h > >> > >> + for "large enough" initializer. */ > >> > >> + > >> > >> + if (n_elts >= 8) > >> > >> +{ > >> > >> + int i; > >> > >> + for (i = 2; i < n_elts; i++) > >> > >> +if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % > >> > >> 2))) > >> > >> + break; > >> > >> + > >> > >> + if (i == n_elts) > >> > >> +{ > >> > >> + machine_mode mode = GET_MODE (target); > >> > >> + rtx dest[2]; > >> > >> + > >> > >> + for (int i = 0; i < 2; i++) > >> > >> +{ > >> > >> + rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP > >> > >> (vals, 0, i)); > >> > > > >> > > Formatting nit: long line. > >> > > > >> > >> + dest[i] = gen_reg_rtx (mode); > >> > >> + aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x)); > >> > >> +} > >> > > > >> > > This could probably be written: > >> > > > >> > > for (int i = 0; i < 2; i++) > >> > > { > >> > > rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, > >> > > i)); > >> > > dest[i] = force_reg (GET_MODE_INNER (mode), x); > >> > > >> > Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry. > >> Thanks, I have pushed the change in > >> 769370f3e2e04823c8a621d8ffa756dd83ebf21e after running > >> bootstrap+test on aarch64-linux-gnu. > > Hi Richard, > > I have attached a patch that extends the transform if one half is dup > > and other is set of constants. 
> > For eg: > > int8x16_t f(int8_t x) > > { > > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 }; > > } > > > > code-gen trunk: > > f: > > adrpx1, .LC0 > > ldr q0, [x1, #:lo12:.LC0] > > ins v0.b[0], w0 > > ins v0.b[2], w0 > > ins v0.b[4], w0 > > ins v0.b[6], w0 > > ins v0.b[8], w0 > > ins v0.b[10], w0 > > ins v0.b[12], w0 > > ins v0.b[14], w0 > > ret > > > > code-gen with patch: > > f: > > dup v0.16b, w0 > > adrpx0, .LC0 > > ldr q1, [x0, #:lo12:.LC0] > > zip1v0.16b, v0.16b, v1.16b > > ret > > > > Bootstrapped+tested on aarch64-linux-gnu. > > Does it look OK ? > > Looks like a nice improvement. It'll need to wait for GCC 14 now though. > > However, rather than handle this case specially, I think we should instead > take a divide-and-conquer approach: split the initialiser into even and > odd elements, find the best way of loading each part, then compare the > cost of these sequences + ZIP with the cost of the fallback code (the code > later in aarch64_expand_vector_init). > > For example, doing that would allow: > > { x, y, 0, y, 0, y, 0, y, 0, y } > > to be loaded more easily, even though the even elements aren't wholly > constant. Hi Richard, I have attached a prototype patch based on the above approach. It subsumes specializing for above {x, y, x, y, x, y, x, y} case by generating same sequence
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
Prathamesh Kulkarni writes: > On Tue, 6 Dec 2022 at 07:01, Prathamesh Kulkarni > wrote: >> >> On Mon, 5 Dec 2022 at 16:50, Richard Sandiford >> wrote: >> > >> > Richard Sandiford via Gcc-patches writes: >> > > Prathamesh Kulkarni writes: >> > >> Hi, >> > >> For the following test-case: >> > >> >> > >> int16x8_t foo(int16_t x, int16_t y) >> > >> { >> > >> return (int16x8_t) { x, y, x, y, x, y, x, y }; >> > >> } >> > >> >> > >> Code gen at -O3: >> > >> foo: >> > >> dupv0.8h, w0 >> > >> ins v0.h[1], w1 >> > >> ins v0.h[3], w1 >> > >> ins v0.h[5], w1 >> > >> ins v0.h[7], w1 >> > >> ret >> > >> >> > >> For 16 elements, it results in 8 ins instructions which might not be >> > >> optimal perhaps. >> > >> I guess, the above code-gen would be equivalent to the following ? >> > >> dup v0.8h, w0 >> > >> dup v1.8h, w1 >> > >> zip1 v0.8h, v0.8h, v1.8h >> > >> >> > >> I have attached patch to do the same, if number of elements >= 8, >> > >> which should be possibly better compared to current code-gen ? >> > >> Patch passes bootstrap+test on aarch64-linux-gnu. >> > >> Does the patch look OK ? >> > >> >> > >> Thanks, >> > >> Prathamesh >> > >> >> > >> diff --git a/gcc/config/aarch64/aarch64.cc >> > >> b/gcc/config/aarch64/aarch64.cc >> > >> index c91df6f5006..e5dea70e363 100644 >> > >> --- a/gcc/config/aarch64/aarch64.cc >> > >> +++ b/gcc/config/aarch64/aarch64.cc >> > >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx >> > >> vals) >> > >>return; >> > >> } >> > >> >> > >> + /* Check for interleaving case. >> > >> + For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}. >> > >> + Generate following code: >> > >> + dup v0.h, x >> > >> + dup v1.h, y >> > >> + zip1 v0.h, v0.h, v1.h >> > >> + for "large enough" initializer. */ >> > >> + >> > >> + if (n_elts >= 8) >> > >> +{ >> > >> + int i; >> > >> + for (i = 2; i < n_elts; i++) >> > >> +if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2))) >> > >> + break; >> > >> + >> > >> + if (i == n_elts) >> > >> +{ >> > >> + machine_mode mode = GET_MODE (target); >> > >> + rtx dest[2]; >> > >> + >> > >> + for (int i = 0; i < 2; i++) >> > >> +{ >> > >> + rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP >> > >> (vals, 0, i)); >> > > >> > > Formatting nit: long line. >> > > >> > >> + dest[i] = gen_reg_rtx (mode); >> > >> + aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x)); >> > >> +} >> > > >> > > This could probably be written: >> > > >> > > for (int i = 0; i < 2; i++) >> > > { >> > > rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i)); >> > > dest[i] = force_reg (GET_MODE_INNER (mode), x); >> > >> > Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry. >> Thanks, I have pushed the change in >> 769370f3e2e04823c8a621d8ffa756dd83ebf21e after running >> bootstrap+test on aarch64-linux-gnu. > Hi Richard, > I have attached a patch that extends the transform if one half is dup > and other is set of constants. > For eg: > int8x16_t f(int8_t x) > { > return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 }; > } > > code-gen trunk: > f: > adrpx1, .LC0 > ldr q0, [x1, #:lo12:.LC0] > ins v0.b[0], w0 > ins v0.b[2], w0 > ins v0.b[4], w0 > ins v0.b[6], w0 > ins v0.b[8], w0 > ins v0.b[10], w0 > ins v0.b[12], w0 > ins v0.b[14], w0 > ret > > code-gen with patch: > f: > dup v0.16b, w0 > adrpx0, .LC0 > ldr q1, [x0, #:lo12:.LC0] > zip1v0.16b, v0.16b, v1.16b > ret > > Bootstrapped+tested on aarch64-linux-gnu. > Does it look OK ? Looks like a nice improvement. It'll need to wait for GCC 14 now though. 
However, rather than handle this case specially, I think we should instead take a divide-and-conquer approach: split the initialiser into even and odd elements, find the best way of loading each part, then compare the cost of these sequences + ZIP with the cost of the fallback code (the code later in aarch64_expand_vector_init). For example, doing that would allow: { x, y, 0, y, 0, y, 0, y, 0, y } to be loaded more easily, even though the even elements aren't wholly constant. Thanks, Richard > > Thanks, > Prathamesh >> > >> Thanks, >> Prathamesh >> > >> > > } >> > > >> > > which avoids forcing constant elements into a register before the >> > > duplication. >> > > OK with that change if it works. >> > > >> > > Thanks, >> > > Richard >> > > >> > >> + >> > >> + rtvec v = gen_rtvec (2, dest[0], dest[1]); >> > >> + emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1)); >> > >> + return; >> > >> +} >> > >> +} >> > >> + >> > >>enum insn_code icode
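Written by hand with Neon intrinsics, the even/odd split for an 8-element variant of that example, { x, y, 0, y, 1, y, 2, y }, would look like this (an illustration of the idea only, not the eventual implementation; the cost comparison against the fallback is omitted):

  #include <arm_neon.h>

  /* Even lanes are { x, 0, 1, 2 }, odd lanes are all y: build each
     half cheaply, then interleave the low halves with zip1.  */
  int16x8_t split_even_odd (int16_t x, int16_t y)
  {
    int16x4_t even = vset_lane_s16 (x, (int16x4_t) { 0, 0, 1, 2 }, 0);
    int16x8_t odd = vdupq_n_s16 (y);
    return vzip1q_s16 (vcombine_s16 (even, even), odd);
  }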
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Tue, 6 Dec 2022 at 07:01, Prathamesh Kulkarni wrote:
> On Mon, 5 Dec 2022 at 16:50, Richard Sandiford wrote:
> >
> > Richard Sandiford via Gcc-patches writes:
> > > Prathamesh Kulkarni writes:
> > >> Hi,
> > >> For the following test-case:
> > >>
> > >> int16x8_t foo(int16_t x, int16_t y)
> > >> {
> > >>   return (int16x8_t) { x, y, x, y, x, y, x, y };
> > >> }
> > >>
> > >> Code gen at -O3:
> > >> foo:
> > >>         dup     v0.8h, w0
> > >>         ins     v0.h[1], w1
> > >>         ins     v0.h[3], w1
> > >>         ins     v0.h[5], w1
> > >>         ins     v0.h[7], w1
> > >>         ret
> > >>
> > >> For 16 elements, it results in 8 ins instructions which might not be
> > >> optimal perhaps.
> > >> I guess, the above code-gen would be equivalent to the following ?
> > >>         dup     v0.8h, w0
> > >>         dup     v1.8h, w1
> > >>         zip1    v0.8h, v0.8h, v1.8h
> > >>
> > >> I have attached patch to do the same, if number of elements >= 8,
> > >> which should be possibly better compared to current code-gen ?
> > >> Patch passes bootstrap+test on aarch64-linux-gnu.
> > >> Does the patch look OK ?
> > >>
> > >> Thanks,
> > >> Prathamesh
> > >>
> > >> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > >> index c91df6f5006..e5dea70e363 100644
> > >> --- a/gcc/config/aarch64/aarch64.cc
> > >> +++ b/gcc/config/aarch64/aarch64.cc
> > >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> > >>        return;
> > >>      }
> > >>
> > >> +  /* Check for interleaving case.
> > >> +     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> > >> +     Generate following code:
> > >> +     dup v0.h, x
> > >> +     dup v1.h, y
> > >> +     zip1 v0.h, v0.h, v1.h
> > >> +     for "large enough" initializer.  */
> > >> +
> > >> +  if (n_elts >= 8)
> > >> +    {
> > >> +      int i;
> > >> +      for (i = 2; i < n_elts; i++)
> > >> +        if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> > >> +          break;
> > >> +
> > >> +      if (i == n_elts)
> > >> +        {
> > >> +          machine_mode mode = GET_MODE (target);
> > >> +          rtx dest[2];
> > >> +
> > >> +          for (int i = 0; i < 2; i++)
> > >> +            {
> > >> +              rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
> > >
> > > Formatting nit: long line.
> > >
> > >> +              dest[i] = gen_reg_rtx (mode);
> > >> +              aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
> > >> +            }
> > >
> > > This could probably be written:
> > >
> > >     for (int i = 0; i < 2; i++)
> > >       {
> > >         rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> > >         dest[i] = force_reg (GET_MODE_INNER (mode), x);
> >
> > Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.
> Thanks, I have pushed the change in
> 769370f3e2e04823c8a621d8ffa756dd83ebf21e after running
> bootstrap+test on aarch64-linux-gnu.

Hi Richard,
I have attached a patch that extends the transform to the case where one
half is a dup and the other half is a set of constants.
For example:

int8x16_t f(int8_t x)
{
  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
}

code-gen trunk:
f:
        adrp    x1, .LC0
        ldr     q0, [x1, #:lo12:.LC0]
        ins     v0.b[0], w0
        ins     v0.b[2], w0
        ins     v0.b[4], w0
        ins     v0.b[6], w0
        ins     v0.b[8], w0
        ins     v0.b[10], w0
        ins     v0.b[12], w0
        ins     v0.b[14], w0
        ret

code-gen with patch:
f:
        dup     v0.16b, w0
        adrp    x0, .LC0
        ldr     q1, [x0, #:lo12:.LC0]
        zip1    v0.16b, v0.16b, v1.16b
        ret

Bootstrapped+tested on aarch64-linux-gnu.
Does it look OK?

Thanks,
Prathamesh

>
> Thanks,
> Prathamesh
>
> > >         }
> > >
> > > which avoids forcing constant elements into a register before the
> > > duplication.
> > > OK with that change if it works.
> > >
> > > Thanks,
> > > Richard
> > >
> > >> +
> > >> +          rtvec v = gen_rtvec (2, dest[0], dest[1]);
> > >> +          emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> > >> +          return;
> > >> +        }
> > >> +    }
> > >> +
> > >>    enum insn_code icode = optab_handler (vec_set_optab, mode);
> > >>    gcc_assert (icode != CODE_FOR_nothing);
> > >>
> > >> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> > >> new file mode 100644
> > >> index 000..ee775048589
> > >> --- /dev/null
> > >> +++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> > >> @@ -0,0 +1,37 @@
> > >> +/* { dg-do compile } */
> > >> +/* { dg-options "-O3" } */
> > >> +/* { dg-final { check-function-bodies "**" "" "" } } */
> > >> +
> > >> +#include <arm_neon.h>
> > >> +
> > >> +/*
> > >> +** foo:
> > >> +** ...
> > >> +** dup v[0-9]+\.8h, w[0-9]+
> > >> +** dup v[0-9]+\.8h, w[0-9]+
> > >> +** zip1 v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> > >> +** ...
> > >> +** ret
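As a source-level illustration of what this extended transform computes for f, here is a minimal sketch in ACLE intrinsics; f_sketch and the explicit constant vector are illustrative only (the patch matches the initializer's RTL directly and materialises the constants from a literal pool such as .LC0):

#include <arm_neon.h>

/* zip1 on .16b interleaves the low eight lanes of each input, so the
   high half of the constant operand is a don't-care (zeros here).  */
int8x16_t f_sketch (int8_t x)
{
  const int8x16_t c = { 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0 };
  return vzip1q_s8 (vdupq_n_s8 (x), c);   /* dup + ldr + zip1 */
}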
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Mon, 5 Dec 2022 at 16:50, Richard Sandiford wrote:
>
> Richard Sandiford via Gcc-patches writes:
> > Prathamesh Kulkarni writes:
> >> Hi,
> >> For the following test-case:
> >>
> >> int16x8_t foo(int16_t x, int16_t y)
> >> {
> >>   return (int16x8_t) { x, y, x, y, x, y, x, y };
> >> }
> >>
> >> Code gen at -O3:
> >> foo:
> >>         dup     v0.8h, w0
> >>         ins     v0.h[1], w1
> >>         ins     v0.h[3], w1
> >>         ins     v0.h[5], w1
> >>         ins     v0.h[7], w1
> >>         ret
> >>
> >> For 16 elements, it results in 8 ins instructions which might not be
> >> optimal perhaps.
> >> I guess, the above code-gen would be equivalent to the following ?
> >>         dup     v0.8h, w0
> >>         dup     v1.8h, w1
> >>         zip1    v0.8h, v0.8h, v1.8h
> >>
> >> I have attached patch to do the same, if number of elements >= 8,
> >> which should be possibly better compared to current code-gen ?
> >> Patch passes bootstrap+test on aarch64-linux-gnu.
> >> Does the patch look OK ?
> >>
> >> Thanks,
> >> Prathamesh
> >>
> >> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> >> index c91df6f5006..e5dea70e363 100644
> >> --- a/gcc/config/aarch64/aarch64.cc
> >> +++ b/gcc/config/aarch64/aarch64.cc
> >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >>        return;
> >>      }
> >>
> >> +  /* Check for interleaving case.
> >> +     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> >> +     Generate following code:
> >> +     dup v0.h, x
> >> +     dup v1.h, y
> >> +     zip1 v0.h, v0.h, v1.h
> >> +     for "large enough" initializer.  */
> >> +
> >> +  if (n_elts >= 8)
> >> +    {
> >> +      int i;
> >> +      for (i = 2; i < n_elts; i++)
> >> +        if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> >> +          break;
> >> +
> >> +      if (i == n_elts)
> >> +        {
> >> +          machine_mode mode = GET_MODE (target);
> >> +          rtx dest[2];
> >> +
> >> +          for (int i = 0; i < 2; i++)
> >> +            {
> >> +              rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
> >
> > Formatting nit: long line.
> >
> >> +              dest[i] = gen_reg_rtx (mode);
> >> +              aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
> >> +            }
> >
> > This could probably be written:
> >
> >     for (int i = 0; i < 2; i++)
> >       {
> >         rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> >         dest[i] = force_reg (GET_MODE_INNER (mode), x);
>
> Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.

Thanks, I have pushed the change in
769370f3e2e04823c8a621d8ffa756dd83ebf21e after running
bootstrap+test on aarch64-linux-gnu.

Thanks,
Prathamesh

> >         }
> >
> > which avoids forcing constant elements into a register before the
> > duplication.
> > OK with that change if it works.
> >
> > Thanks,
> > Richard
> >
> >> +
> >> +          rtvec v = gen_rtvec (2, dest[0], dest[1]);
> >> +          emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> >> +          return;
> >> +        }
> >> +    }
> >> +
> >>    enum insn_code icode = optab_handler (vec_set_optab, mode);
> >>    gcc_assert (icode != CODE_FOR_nothing);
> >>
> >> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> >> new file mode 100644
> >> index 000..ee775048589
> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> >> @@ -0,0 +1,37 @@
> >> +/* { dg-do compile } */
> >> +/* { dg-options "-O3" } */
> >> +/* { dg-final { check-function-bodies "**" "" "" } } */
> >> +
> >> +#include <arm_neon.h>
> >> +
> >> +/*
> >> +** foo:
> >> +** ...
> >> +** dup v[0-9]+\.8h, w[0-9]+
> >> +** dup v[0-9]+\.8h, w[0-9]+
> >> +** zip1 v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> >> +** ...
> >> +** ret
> >> +*/
> >> +
> >> +int16x8_t foo(int16_t x, int y)
> >> +{
> >> +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y};
> >> +  return v;
> >> +}
> >> +
> >> +/*
> >> +** foo2:
> >> +** ...
> >> +** dup v[0-9]+\.8h, w[0-9]+
> >> +** movi v[0-9]+\.8h, 0x1
> >> +** zip1 v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> >> +** ...
> >> +** ret
> >> +*/
> >> +
> >> +int16x8_t foo2(int16_t x)
> >> +{
> >> +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
> >> +  return v;
> >> +}
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
Richard Sandiford via Gcc-patches writes:
> Prathamesh Kulkarni writes:
>> Hi,
>> For the following test-case:
>>
>> int16x8_t foo(int16_t x, int16_t y)
>> {
>>   return (int16x8_t) { x, y, x, y, x, y, x, y };
>> }
>>
>> Code gen at -O3:
>> foo:
>>         dup     v0.8h, w0
>>         ins     v0.h[1], w1
>>         ins     v0.h[3], w1
>>         ins     v0.h[5], w1
>>         ins     v0.h[7], w1
>>         ret
>>
>> For 16 elements, it results in 8 ins instructions which might not be
>> optimal perhaps.
>> I guess, the above code-gen would be equivalent to the following ?
>>         dup     v0.8h, w0
>>         dup     v1.8h, w1
>>         zip1    v0.8h, v0.8h, v1.8h
>>
>> I have attached patch to do the same, if number of elements >= 8,
>> which should be possibly better compared to current code-gen ?
>> Patch passes bootstrap+test on aarch64-linux-gnu.
>> Does the patch look OK ?
>>
>> Thanks,
>> Prathamesh
>>
>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> index c91df6f5006..e5dea70e363 100644
>> --- a/gcc/config/aarch64/aarch64.cc
>> +++ b/gcc/config/aarch64/aarch64.cc
>> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>>        return;
>>      }
>>
>> +  /* Check for interleaving case.
>> +     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
>> +     Generate following code:
>> +     dup v0.h, x
>> +     dup v1.h, y
>> +     zip1 v0.h, v0.h, v1.h
>> +     for "large enough" initializer.  */
>> +
>> +  if (n_elts >= 8)
>> +    {
>> +      int i;
>> +      for (i = 2; i < n_elts; i++)
>> +        if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
>> +          break;
>> +
>> +      if (i == n_elts)
>> +        {
>> +          machine_mode mode = GET_MODE (target);
>> +          rtx dest[2];
>> +
>> +          for (int i = 0; i < 2; i++)
>> +            {
>> +              rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
>
> Formatting nit: long line.
>
>> +              dest[i] = gen_reg_rtx (mode);
>> +              aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
>> +            }
>
> This could probably be written:
>
>     for (int i = 0; i < 2; i++)
>       {
>         rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
>         dest[i] = force_reg (GET_MODE_INNER (mode), x);

Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.

>       }
>
> which avoids forcing constant elements into a register before the duplication.
> OK with that change if it works.
>
> Thanks,
> Richard
>
>> +
>> +          rtvec v = gen_rtvec (2, dest[0], dest[1]);
>> +          emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
>> +          return;
>> +        }
>> +    }
>> +
>>    enum insn_code icode = optab_handler (vec_set_optab, mode);
>>    gcc_assert (icode != CODE_FOR_nothing);
>>
>> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
>> new file mode 100644
>> index 000..ee775048589
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
>> @@ -0,0 +1,37 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O3" } */
>> +/* { dg-final { check-function-bodies "**" "" "" } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +** foo:
>> +** ...
>> +** dup v[0-9]+\.8h, w[0-9]+
>> +** dup v[0-9]+\.8h, w[0-9]+
>> +** zip1 v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
>> +** ...
>> +** ret
>> +*/
>> +
>> +int16x8_t foo(int16_t x, int y)
>> +{
>> +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y};
>> +  return v;
>> +}
>> +
>> +/*
>> +** foo2:
>> +** ...
>> +** dup v[0-9]+\.8h, w[0-9]+
>> +** movi v[0-9]+\.8h, 0x1
>> +** zip1 v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
>> +** ...
>> +** ret
>> +*/
>> +
>> +int16x8_t foo2(int16_t x)
>> +{
>> +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
>> +  return v;
>> +}
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
Prathamesh Kulkarni writes:
> Hi,
> For the following test-case:
>
> int16x8_t foo(int16_t x, int16_t y)
> {
>   return (int16x8_t) { x, y, x, y, x, y, x, y };
> }
>
> Code gen at -O3:
> foo:
>         dup     v0.8h, w0
>         ins     v0.h[1], w1
>         ins     v0.h[3], w1
>         ins     v0.h[5], w1
>         ins     v0.h[7], w1
>         ret
>
> For 16 elements, it results in 8 ins instructions which might not be
> optimal perhaps.
> I guess, the above code-gen would be equivalent to the following ?
>         dup     v0.8h, w0
>         dup     v1.8h, w1
>         zip1    v0.8h, v0.8h, v1.8h
>
> I have attached patch to do the same, if number of elements >= 8,
> which should be possibly better compared to current code-gen ?
> Patch passes bootstrap+test on aarch64-linux-gnu.
> Does the patch look OK ?
>
> Thanks,
> Prathamesh
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index c91df6f5006..e5dea70e363 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>        return;
>      }
>
> +  /* Check for interleaving case.
> +     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> +     Generate following code:
> +     dup v0.h, x
> +     dup v1.h, y
> +     zip1 v0.h, v0.h, v1.h
> +     for "large enough" initializer.  */
> +
> +  if (n_elts >= 8)
> +    {
> +      int i;
> +      for (i = 2; i < n_elts; i++)
> +        if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> +          break;
> +
> +      if (i == n_elts)
> +        {
> +          machine_mode mode = GET_MODE (target);
> +          rtx dest[2];
> +
> +          for (int i = 0; i < 2; i++)
> +            {
> +              rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));

Formatting nit: long line.

> +              dest[i] = gen_reg_rtx (mode);
> +              aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
> +            }

This could probably be written:

    for (int i = 0; i < 2; i++)
      {
        rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
        dest[i] = force_reg (GET_MODE_INNER (mode), x);

Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.

      }

which avoids forcing constant elements into a register before the duplication.
OK with that change if it works.

Thanks,
Richard

> +
> +          rtvec v = gen_rtvec (2, dest[0], dest[1]);
> +          emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> +          return;
> +        }
> +    }
> +
>    enum insn_code icode = optab_handler (vec_set_optab, mode);
>    gcc_assert (icode != CODE_FOR_nothing);
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> new file mode 100644
> index 000..ee775048589
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> @@ -0,0 +1,37 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foo:
> +** ...
> +** dup v[0-9]+\.8h, w[0-9]+
> +** dup v[0-9]+\.8h, w[0-9]+
> +** zip1 v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> +** ...
> +** ret
> +*/
> +
> +int16x8_t foo(int16_t x, int y)
> +{
> +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y};
> +  return v;
> +}
> +
> +/*
> +** foo2:
> +** ...
> +** dup v[0-9]+\.8h, w[0-9]+
> +** movi v[0-9]+\.8h, 0x1
> +** zip1 v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> +** ...
> +** ret
> +*/
> +
> +int16x8_t foo2(int16_t x)
> +{
> +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
> +  return v;
> +}
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Tue, 29 Nov 2022 at 20:43, Andrew Pinski wrote:
>
> On Tue, Nov 29, 2022 at 6:40 AM Prathamesh Kulkarni via Gcc-patches wrote:
> >
> > Hi,
> > For the following test-case:
> >
> > int16x8_t foo(int16_t x, int16_t y)
> > {
> >   return (int16x8_t) { x, y, x, y, x, y, x, y };
> > }
>
> (Not to block this patch)
> Seems like this trick can be done even with less than perfect initializer too:
> e.g.
> int16x8_t foo(int16_t x, int16_t y)
> {
>   return (int16x8_t) { x, y, x, y, x, y, x, 0 };
> }
>
> Which should generate something like:
>         dup     v0.8h, w0
>         dup     v1.8h, w1
>         zip1    v0.8h, v0.8h, v1.8h
>         ins     v0.h[7], wzr

Hi Andrew,
Nice catch, thanks for the suggestions! More generally, code-gen with
constants involved seems to be sub-optimal. For example:

int16x8_t foo(int16_t x)
{
  return (int16x8_t) { x, x, x, x, x, x, x, 1 };
}

results in:
foo:
        movi    v0.8h, 0x1
        ins     v0.h[0], w0
        ins     v0.h[1], w0
        ins     v0.h[2], w0
        ins     v0.h[3], w0
        ins     v0.h[4], w0
        ins     v0.h[5], w0
        ins     v0.h[6], w0
        ret

which I suppose could instead be the following?
foo:
        dup     v0.8h, w0
        mov     w1, 0x1
        ins     v0.h[7], w1
        ret

I will try to address this in a follow-up patch.

Thanks,
Prathamesh

> Thanks,
> Andrew Pinski
>
> > Code gen at -O3:
> > foo:
> >         dup     v0.8h, w0
> >         ins     v0.h[1], w1
> >         ins     v0.h[3], w1
> >         ins     v0.h[5], w1
> >         ins     v0.h[7], w1
> >         ret
> >
> > For 16 elements, it results in 8 ins instructions which might not be
> > optimal perhaps.
> > I guess, the above code-gen would be equivalent to the following ?
> >         dup     v0.8h, w0
> >         dup     v1.8h, w1
> >         zip1    v0.8h, v0.8h, v1.8h
> >
> > I have attached patch to do the same, if number of elements >= 8,
> > which should be possibly better compared to current code-gen ?
> > Patch passes bootstrap+test on aarch64-linux-gnu.
> > Does the patch look OK ?
> >
> > Thanks,
> > Prathamesh
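The improved sequence sketched above maps naturally onto ACLE intrinsics; in the sketch below, foo_const7 is an illustrative name and the lowering noted in the comments is the hoped-for code-gen, not something the posted patch already guarantees:

#include <arm_neon.h>

/* Broadcast the variable element everywhere, then insert the single
   constant into lane 7: one dup + one mov + one ins instead of
   movi + seven ins.  */
int16x8_t foo_const7 (int16_t x)
{
  int16x8_t v = vdupq_n_s16 (x);     /* dup v0.8h, w0           */
  return vsetq_lane_s16 (1, v, 7);   /* mov w1, 1; ins v0.h[7], w1 */
}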
Re: [aarch64] Use dup and zip1 for interleaving elements in initializing vector
On Tue, Nov 29, 2022 at 6:40 AM Prathamesh Kulkarni via Gcc-patches wrote:
>
> Hi,
> For the following test-case:
>
> int16x8_t foo(int16_t x, int16_t y)
> {
>   return (int16x8_t) { x, y, x, y, x, y, x, y };
> }

(Not to block this patch)
Seems like this trick can be done even with a less-than-perfect initializer too, e.g.:

int16x8_t foo(int16_t x, int16_t y)
{
  return (int16x8_t) { x, y, x, y, x, y, x, 0 };
}

which should generate something like:
        dup     v0.8h, w0
        dup     v1.8h, w1
        zip1    v0.8h, v0.8h, v1.8h
        ins     v0.h[7], wzr

Thanks,
Andrew Pinski

> Code gen at -O3:
> foo:
>         dup     v0.8h, w0
>         ins     v0.h[1], w1
>         ins     v0.h[3], w1
>         ins     v0.h[5], w1
>         ins     v0.h[7], w1
>         ret
>
> For 16 elements, it results in 8 ins instructions which might not be
> optimal perhaps.
> I guess, the above code-gen would be equivalent to the following ?
>         dup     v0.8h, w0
>         dup     v1.8h, w1
>         zip1    v0.8h, v0.8h, v1.8h
>
> I have attached patch to do the same, if number of elements >= 8,
> which should be possibly better compared to current code-gen ?
> Patch passes bootstrap+test on aarch64-linux-gnu.
> Does the patch look OK ?
>
> Thanks,
> Prathamesh
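The suggested sequence also has a source-level analogue; the sketch below (foo_tail0 is a hypothetical name, not from any patch in this thread) expresses zip1-then-clear-lane-7 with ACLE intrinsics:

#include <arm_neon.h>

/* Build the full { x, y, ..., x, y } interleave, then overwrite
   lane 7 with 0, mirroring the trailing "ins v0.h[7], wzr".  */
int16x8_t foo_tail0 (int16_t x, int16_t y)
{
  int16x8_t v = vzip1q_s16 (vdupq_n_s16 (x), vdupq_n_s16 (y));
  return vsetq_lane_s16 (0, v, 7);
}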
[aarch64] Use dup and zip1 for interleaving elements in initializing vector
Hi,
For the following test-case:

int16x8_t foo(int16_t x, int16_t y)
{
  return (int16x8_t) { x, y, x, y, x, y, x, y };
}

Code gen at -O3:
foo:
        dup     v0.8h, w0
        ins     v0.h[1], w1
        ins     v0.h[3], w1
        ins     v0.h[5], w1
        ins     v0.h[7], w1
        ret

For 16 elements, it results in 8 ins instructions, which might not be optimal.
I guess the above code-gen would be equivalent to the following?
        dup     v0.8h, w0
        dup     v1.8h, w1
        zip1    v0.8h, v0.8h, v1.8h

I have attached a patch to do the same if the number of elements is >= 8,
which should be better than the current code-gen.
Patch passes bootstrap+test on aarch64-linux-gnu.
Does the patch look OK?

Thanks,
Prathamesh

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index c91df6f5006..e5dea70e363 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
       return;
     }

+  /* Check for interleaving case.
+     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
+     Generate following code:
+     dup v0.h, x
+     dup v1.h, y
+     zip1 v0.h, v0.h, v1.h
+     for "large enough" initializer.  */
+
+  if (n_elts >= 8)
+    {
+      int i;
+      for (i = 2; i < n_elts; i++)
+        if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
+          break;
+
+      if (i == n_elts)
+        {
+          machine_mode mode = GET_MODE (target);
+          rtx dest[2];
+
+          for (int i = 0; i < 2; i++)
+            {
+              rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
+              dest[i] = gen_reg_rtx (mode);
+              aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
+            }
+
+          rtvec v = gen_rtvec (2, dest[0], dest[1]);
+          emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+          return;
+        }
+    }
+
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);

diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
new file mode 100644
index 000..ee775048589
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** foo:
+** ...
+** dup v[0-9]+\.8h, w[0-9]+
+** dup v[0-9]+\.8h, w[0-9]+
+** zip1 v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+** ...
+** ret
+*/
+
+int16x8_t foo(int16_t x, int y)
+{
+  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y};
+  return v;
+}
+
+/*
+** foo2:
+** ...
+** dup v[0-9]+\.8h, w[0-9]+
+** movi v[0-9]+\.8h, 0x1
+** zip1 v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+** ...
+** ret
+*/
+
+int16x8_t foo2(int16_t x)
+{
+  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
+  return v;
+}
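For readers following along, the transform has a direct source-level analogue in ACLE intrinsics; the sketch below (interleave_xy is a made-up name for illustration, not part of the patch) shows why two dups plus a zip1 reproduce the interleaved initializer:

#include <arm_neon.h>

/* A minimal sketch, assuming ACLE intrinsics: vzip1q_s16 interleaves
   the low four lanes of each input, so feeding it two broadcasts
   yields { x, y, x, y, x, y, x, y }.  */
int16x8_t interleave_xy (int16_t x, int16_t y)
{
  int16x8_t vx = vdupq_n_s16 (x);   /* dup  v0.8h, w0 */
  int16x8_t vy = vdupq_n_s16 (y);   /* dup  v1.8h, w1 */
  return vzip1q_s16 (vx, vy);       /* zip1 v0.8h, v0.8h, v1.8h */
}

Compiled at -O3, this should itself lower to the dup/dup/zip1 sequence shown above.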