Hi,

when the source mode is potentially larger than one vector (e.g. an
LMUL2 mode for VLEN=128) we don't know which vector the subreg actually
refers to.  For zvl128b and LMUL=2 the subreg in (subreg:V2DI (reg:V4DI))
could actually be a full (high) vector register of a two-register
group (at VLEN=128) or the higher part of a single register (at VLEN>128).

In that case we need to use a slidedown instead of moving a register
directly.

Regtested on rv64gcv_zvfh_zvbb -mrvv-max-lmul=m2 at vlen 128 and vlen 256.
This also fixes
  gcc.dg/vect/bb-slp-cond-1.c
  gcc.dg/vect/bb-slp-pr101668.c
  gcc.dg/vect/pr66251.c
and others from the vector test suite when run with vlen 256.

As noted above, this was regtested on rv64gcv_zvfh_zvbb -mrvv-max-lmul=m2 at
both vlen 128 and vlen 256.  Still curious what the CI says.

Regards
 Robin

gcc/ChangeLog:

        PR target/116086

        * config/riscv/riscv-v.cc (legitimize_move): Slide down instead
        of moving register directly.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/pr116086-2-run.c: New test.
        * gcc.target/riscv/rvv/autovec/pr116086-2.c: New test.
        * gcc.target/riscv/rvv/autovec/pr116086.c: New test.
---
 gcc/config/riscv/riscv-v.cc                   | 29 +++++++
 .../riscv/rvv/autovec/pr116086-2-run.c        |  5 ++
 .../gcc.target/riscv/rvv/autovec/pr116086-2.c | 18 +++++
 .../gcc.target/riscv/rvv/autovec/pr116086.c   | 75 +++++++++++++++++++
 4 files changed, 127 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2-run.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index e290675bbf0..f475fa32173 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -1550,6 +1550,35 @@ legitimize_move (rtx dest, rtx *srcp)
 
   if (riscv_v_ext_vls_mode_p (mode))
     {
+      /* When the source mode is larger than one vector (like an LMUL2 mode
+        for VLEN=128) we don't know which vector the subreg actually
+        refers to.
+        For zvl128 the subreg in (subreg:V2DI (reg:V4DI)) could actually be
+        the full (high) vector register of a register group (at VLEN=128) or
+        the higher part of a single register (at VLEN>128).  */
+      if (SUBREG_P (src) && SUBREG_BYTE (src).to_constant () > 0)
+       {
+         rtx reg = SUBREG_REG (src);
+         machine_mode reg_mode = GET_MODE (reg);
+
+         if (cmp_lmul_gt_one (reg_mode)
+             && GET_MODE_SIZE (reg_mode).to_constant ()
+             > GET_MODE_SIZE (mode).to_constant ()
+             && GET_MODE_INNER (reg_mode) == GET_MODE_INNER (mode))
+           {
+             int slide = (SUBREG_BYTE (src).to_constant ()
+               / GET_MODE_SIZE (mode).to_constant ()) *
+               GET_MODE_NUNITS (mode).to_constant ();
+             rtx tmp = gen_reg_rtx (reg_mode);
+             rtx slide_ops[] = {tmp, reg, GEN_INT (slide)};
+             insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN,
+                                                    reg_mode);
+             emit_vlmax_insn (icode, BINARY_OP, slide_ops);
+             emit_move_insn (dest, gen_lowpart (mode, tmp));
+             return true;
+           }
+       }
+
       if (GET_MODE_NUNITS (mode).to_constant () <= 31)
        {
          /* For NUNITS <= 31 VLS modes, we don't need extrac
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2-run.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2-run.c
new file mode 100644
index 00000000000..2d523f98f7b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2-run.c
@@ -0,0 +1,5 @@
+/* { dg-do run } */
+/* { dg-require-effective-target riscv_v } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -mrvv-max-lmul=m2" } */
+
+#include "pr116086-2.c"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2.c
new file mode 100644
index 00000000000..8b5ea6be955
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -mrvv-max-lmul=m2" } */
+
+long a;
+long b;
+long c[80];
+int main() {
+    for (int d = 0; d < 16; d++)
+      c[d] = a;
+    for (int d = 16; d < 80; d++)
+      c[d] = c[d - 2];
+    for (int d = 0; d < 80; d += 8)
+      b += c[d];
+    if (b != 0)
+      __builtin_abort ();
+}
+
+/* { dg-final { scan-assembler-times "vmv1r" 0 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086.c
new file mode 100644
index 00000000000..cc67357b768
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086.c
@@ -0,0 +1,75 @@
+/* { dg-do run } */
+/* { dg-require-effective-target riscv_v } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -mrvv-max-lmul=m2" } */
+
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+typedef struct
+{
+    uint64_t length;
+    uint64_t state[8];
+    uint32_t curlen;
+    unsigned char buf[128];
+} sha512_state;
+
+static uint64_t load64(const unsigned char* y)
+{
+    uint64_t res = 0;
+    for(int i = 0; i != 8; ++i)
+        res |= (uint64_t)(y[i]) << ((7-i) * 8);
+    return res;
+}
+
+static const uint64_t K[80] =
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0 };
+
+__attribute__ ((noipa))
+static void sha_compress(sha512_state *md, const unsigned char *buf)
+{
+    uint64_t S[8], W[80];
+
+    for(int i = 0; i < 8; i++)
+      S[i] = 0;
+
+    // Copy the state into 1024-bits into W[0..15]
+    for(int i = 0; i < 16; i++)
+      W[i] = load64(buf + (8*i));
+
+    // Fill W[16..79]
+    for(int i = 16; i < 80; i++)
+      W[i] = W[i - 2] + W[i - 7] + W[i - 15] + W[i - 16];
+
+    S[7] = W[72];
+
+     // Feedback
+    for(int i = 0; i < 8; i++)
+      md->state[i] = md->state[i] + S[i];
+}
+
+int main ()
+{
+  sha512_state md;
+  md.curlen = 0;
+  md.length = 0;
+  md.state[0] = 0;
+  md.state[1] = 0;
+  md.state[2] = 0;
+  md.state[3] = 0;
+  md.state[4] = 0;
+  md.state[5] = 0;
+  md.state[6] = 0;
+  md.state[7] = 0;
+
+  for (int i = 0; i < 128; i++)
+    md.buf[i] = 0;
+
+  md.buf[md.curlen++] = (unsigned char)0x80;
+
+  sha_compress (&md, md.buf);
+
+  if (md.state[7] != 0x8000000000000000ULL)
+    __builtin_abort ();
+
+  return 0;
+}
-- 
2.45.2

Reply via email to