On 6/18/23 17:06, Juzhe-Zhong wrote:
This patch is a propsal patch is **NOT** ready to push since
after this patch the total machine modes will exceed 255 which will create ICE
in LTO:
   internal compiler error: in bp_pack_int_in_range, at data-streamer.h:290
Right. Note that an ack from Jakub or Richi will be sufficient for the LTO fixes to go forward.



The reason we need to add VLS modes for following reason:
1. Enhance GNU vectors codegen:
    For example:
      typedef int32_t vnx8si __attribute__ ((vector_size (32)));

      __attribute__ ((noipa)) void
      f_vnx8si (int32_t * in, int32_t * out)
      {
        vnx8si v = *(vnx8si*)in;
        *(vnx8si *) out = v;
      }

compile option: --param=riscv-autovec-preference=scalable
     before this patch:
     f_vnx8si:
         ld      a2,0(a0)
         ld      a3,8(a0)
         ld      a4,16(a0)
         ld      a5,24(a0)
         addi    sp,sp,-32
         sd      a2,0(a1)
         sd      a3,8(a1)
         sd      a4,16(a1)
         sd      a5,24(a1)
         addi    sp,sp,32
         jr      ra

After this patch:
        f_vnx8si:
         vsetivli        zero,8,e32,m2,ta,ma
         vle32.v v2,0(a0)
         vse32.v v2,0(a1)
         ret

2. Ehance VLA SLP:
void
f (uint8_t *restrict a, uint8_t *restrict b, uint8_t *restrict c)
{
   for (int i = 0; i < 100; ++i)
     {
       a[i * 8] = b[i * 8] + c[i * 8];
       a[i * 8 + 1] = b[i * 8] + c[i * 8 + 1];
       a[i * 8 + 2] = b[i * 8 + 2] + c[i * 8 + 2];
       a[i * 8 + 3] = b[i * 8 + 2] + c[i * 8 + 3];
       a[i * 8 + 4] = b[i * 8 + 4] + c[i * 8 + 4];
       a[i * 8 + 5] = b[i * 8 + 4] + c[i * 8 + 5];
       a[i * 8 + 6] = b[i * 8 + 6] + c[i * 8 + 6];
       a[i * 8 + 7] = b[i * 8 + 6] + c[i * 8 + 7];
     }
}


..
Loop body:
  ...
  vrgatherei16.vv...
  ...

Tail:
  lbu     a4,792(a1)
         lbu     a5,792(a2)
         addw    a5,a5,a4
         sb      a5,792(a0)
         lbu     a5,793(a2)
         addw    a5,a5,a4
         sb      a5,793(a0)
         lbu     a4,794(a1)
         lbu     a5,794(a2)
         addw    a5,a5,a4
         sb      a5,794(a0)
         lbu     a5,795(a2)
         addw    a5,a5,a4
         sb      a5,795(a0)
         lbu     a4,796(a1)
         lbu     a5,796(a2)
         addw    a5,a5,a4
         sb      a5,796(a0)
         lbu     a5,797(a2)
         addw    a5,a5,a4
         sb      a5,797(a0)
         lbu     a4,798(a1)
         lbu     a5,798(a2)
         addw    a5,a5,a4
         sb      a5,798(a0)
         lbu     a5,799(a2)
         addw    a5,a5,a4
         sb      a5,799(a0)
         ret

The tail elements need VLS modes to vectorize like ARM SVE:

f:
         mov     x3, 0
         cntb    x5
         mov     x4, 792
         whilelo p7.b, xzr, x4
.L2:
         ld1b    z31.b, p7/z, [x1, x3]
         ld1b    z30.b, p7/z, [x2, x3]
         trn1    z31.b, z31.b, z31.b
         add     z31.b, z31.b, z30.b
         st1b    z31.b, p7, [x0, x3]
         add     x3, x3, x5
         whilelo p7.b, x3, x4
         b.any   .L2
Tail:
         ldr     b31, [x1, 792]
         ldr     b27, [x1, 794]
         ldr     b28, [x1, 796]
         dup     v31.8b, v31.b[0]
         ldr     b29, [x1, 798]
         ldr     d30, [x2, 792]
         ins     v31.b[2], v27.b[0]
         ins     v31.b[3], v27.b[0]
         ins     v31.b[4], v28.b[0]
         ins     v31.b[5], v28.b[0]
         ins     v31.b[6], v29.b[0]
         ins     v31.b[7], v29.b[0]
         add     v31.8b, v30.8b, v31.8b
         str     d31, [x0, 792]
         ret

Notice ARM SVE use ADVSIMD modes (Neon) to vectorize the tail.




gcc/ChangeLog:

         * config/riscv/riscv-modes.def (VECTOR_BOOL_MODE): Add VLS modes for 
GNU vectors.
         (ADJUST_ALIGNMENT): Ditto.
         (ADJUST_BYTESIZE): Ditto.

         (ADJUST_PRECISION): Ditto.
         (VECTOR_MODES): Ditto.
         * config/riscv/riscv-protos.h (riscv_v_ext_vls_mode_p): Ditto.
         (get_regno_alignment): Ditto.
         * config/riscv/riscv-v.cc (INCLUDE_ALGORITHM): Ditto.
         (const_vlmax_p): Ditto.
         (legitimize_move): Ditto.
         (get_vlmul): Ditto.
         (get_regno_alignment): Ditto.
         (get_ratio): Ditto.
         (get_vector_mode): Ditto.
         * config/riscv/riscv-vector-switch.def (VLS_ENTRY): Ditto.
         * config/riscv/riscv.cc (riscv_v_ext_vls_mode_p): Ditto.
         (VLS_ENTRY): Ditto.
         (riscv_v_ext_mode_p): Ditto.
         (riscv_hard_regno_nregs): Ditto.
         (riscv_hard_regno_mode_ok): Ditto.
         * config/riscv/riscv.md: Ditto.
         * config/riscv/vector-iterators.md: Ditto.
         * config/riscv/vector.md: Ditto.
         * config/riscv/autovec-vls.md: New file.

---
So I expected we were going to have to define some static length patterns at some point. So this isn't a huge surprise.






diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 6421e933ca9..6fc1c433069 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -25,6 +25,7 @@
     the vector.md.  */
  #define RVV_INSN_OPERANDS_MAX 11
+#define INCLUDE_ALGORITHM
I must have missed something in this patch. I didn't see anything obvious which needed INCLUDE_ALGORITHM.


  /* Return true if it is either RVV vector mode or RVV tuple mode.  */
static bool
  riscv_v_ext_mode_p (machine_mode mode)
  {
-  return riscv_v_ext_vector_mode_p (mode) || riscv_v_ext_tuple_mode_p (mode);
+  return riscv_v_ext_vector_mode_p (mode) || riscv_v_ext_tuple_mode_p (mode)
+        || riscv_v_ext_vls_mode_p (mode);
  }
Formatting nit. When you wrap a line, go ahead and add the parenthesis and indent the operator appropriately.

If you need INCLUDE_ALGORITHM, then fine, just assume I missed whatever caused you to need it. If not remove it. Fix the formatting nit and this is good for the trunk. Go ahead and post the V2, but consider it pre-approved.

jeff

Reply via email to