Hi, when the source mode is potentially larger than one vector (e.g. an LMUL2 mode for VLEN=128) we don't know which vector the subreg actually refers to. For zvl128b and LMUL=2 the subreg in (subreg:V2DI (reg:V4DI)) could actually be the full (high) vector register of a two-register group (at VLEN=128) or the higher part of a single register (at VLEN>128).
In that case we need to use a slidedown instead of moving a register directly. Regtested on rv64gcv_zvfh_zvbb -mrvv-max-lmul=m2 at vlen 128 and vlen 256. This also fixes gcc.dg/vect/bb-slp-cond-1.c gcc.dg/vect/bb-slp-pr101668.c gcc.dg/vect/pr66251.c and others from the vector test suite when run with vlen 256. Regtested on rv64gcv_zvfh_zvbb -mrvv-max-lmul=m2 and vlen 128 as well as vlen 256. Still curious what the CI says. Regards Robin gcc/ChangeLog: PR target/116086 * config/riscv/riscv-v.cc (legitimize_move): Slide down instead of moving register directly. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr116086-2-run.c: New test. * gcc.target/riscv/rvv/autovec/pr116086-2.c: New test. * gcc.target/riscv/rvv/autovec/pr116086.c: New test. --- gcc/config/riscv/riscv-v.cc | 29 +++++++ .../riscv/rvv/autovec/pr116086-2-run.c | 5 ++ .../gcc.target/riscv/rvv/autovec/pr116086-2.c | 18 +++++ .../gcc.target/riscv/rvv/autovec/pr116086.c | 75 +++++++++++++++++++ 4 files changed, 127 insertions(+) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2-run.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086.c diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index e290675bbf0..f475fa32173 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -1550,6 +1550,35 @@ legitimize_move (rtx dest, rtx *srcp) if (riscv_v_ext_vls_mode_p (mode)) { + /* When the source mode is larger than one vector (like an LMUL2 mode + for VLEN=128) we don't know which vector the subreg actually + refers to. + For zvl128 the subreg in (subreg:V2DI (reg:V4DI)) could actually be + the full (high) vector register of a register group (at VLEN=128) or + the higher part of a single register (at VLEN>128). 
*/ + if (SUBREG_P (src) && SUBREG_BYTE (src).to_constant () > 0) + { + rtx reg = SUBREG_REG (src); + machine_mode reg_mode = GET_MODE (reg); + + if (cmp_lmul_gt_one (reg_mode) + && GET_MODE_SIZE (reg_mode).to_constant () + > GET_MODE_SIZE (mode).to_constant () + && GET_MODE_INNER (reg_mode) == GET_MODE_INNER (mode)) + { + int slide = (SUBREG_BYTE (src).to_constant () + / GET_MODE_SIZE (mode).to_constant ()) * + GET_MODE_NUNITS (mode).to_constant (); + rtx tmp = gen_reg_rtx (reg_mode); + rtx slide_ops[] = {tmp, reg, GEN_INT (slide)}; + insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, + reg_mode); + emit_vlmax_insn (icode, BINARY_OP, slide_ops); + emit_move_insn (dest, gen_lowpart (mode, tmp)); + return true; + } + } + if (GET_MODE_NUNITS (mode).to_constant () <= 31) { /* For NUNITS <= 31 VLS modes, we don't need extrac diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2-run.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2-run.c new file mode 100644 index 00000000000..2d523f98f7b --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2-run.c @@ -0,0 +1,5 @@ +/* { dg-do run } */ +/* { dg-require-effective-target riscv_v } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -mrvv-max-lmul=m2" } */ + +#include "pr116086-2.c" diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2.c new file mode 100644 index 00000000000..8b5ea6be955 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -mrvv-max-lmul=m2" } */ + +long a; +long b; +long c[80]; +int main() { + for (int d = 0; d < 16; d++) + c[d] = a; + for (int d = 16; d < 80; d++) + c[d] = c[d - 2]; + for (int d = 0; d < 80; d += 8) + b += c[d]; + if (b != 0) + __builtin_abort (); +} + +/* { dg-final { scan-assembler-times "vmv1r" 0 } } */ diff --git 
a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086.c new file mode 100644 index 00000000000..cc67357b768 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086.c @@ -0,0 +1,75 @@ +/* { dg-do run } */ +/* { dg-require-effective-target riscv_v } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -mrvv-max-lmul=m2" } */ + +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; + +typedef struct +{ + uint64_t length; + uint64_t state[8]; + uint32_t curlen; + unsigned char buf[128]; +} sha512_state; + +static uint64_t load64(const unsigned char* y) +{ + uint64_t res = 0; + for(int i = 0; i != 8; ++i) + res |= (uint64_t)(y[i]) << ((7-i) * 8); + return res; +} + +static const uint64_t K[80] = +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + +__attribute__ ((noipa)) +static void sha_compress(sha512_state *md, const unsigned char *buf) +{ + uint64_t S[8], W[80]; + + for(int i = 0; i < 8; i++) + S[i] = 0; + + // Copy the state into 1024-bits into W[0..15] + for(int i = 0; i < 16; i++) + W[i] = load64(buf + (8*i)); + + // Fill W[16..79] + for(int i = 16; i < 80; i++) + W[i] = W[i - 2] + W[i - 7] + W[i - 15] + W[i - 16]; + + S[7] = W[72]; + + // Feedback + for(int i = 0; i < 8; i++) + md->state[i] = md->state[i] + S[i]; +} + +int main () +{ + sha512_state md; + md.curlen = 0; + md.length = 0; + md.state[0] = 0; + md.state[1] = 0; + md.state[2] = 0; + md.state[3] = 0; + md.state[4] = 0; + md.state[5] = 0; + md.state[6] = 0; + md.state[7] = 0; + + for (int i = 0; i < 128; i++) + md.buf[i] = 0; + + md.buf[md.curlen++] = (unsigned char)0x80; + + sha_compress (&md, md.buf); + + if (md.state[7] != 0x8000000000000000ULL) + __builtin_abort (); + + return 0; +} -- 2.45.2