Re: [PATCH] RISC-V: Add VLS modes for GNU vectors

2023-06-19 Thread Jeff Law via Gcc-patches




On 6/19/23 15:45, 钟居哲 wrote:

Hi, Jeff.
Thanks for comment.

I add INCLUDE_ALGORITHM since I use std:min.
I failed to compile when I didn't add INCLUDE_ALGORITHM.

Is INCLUDE_ALGORITHM expensive that you don't want it?


It just stood out as unexpected.  THere's no concerns with std::min and 
the like.


Jeff



Re: Re: [PATCH] RISC-V: Add VLS modes for GNU vectors

2023-06-19 Thread 钟居哲
Hi, Jeff.
Thanks for comment.

I add INCLUDE_ALGORITHM since I use std:min.
I failed to compile when I didn't add INCLUDE_ALGORITHM.

Is INCLUDE_ALGORITHM expensive that you don't want it?


juzhe.zh...@rivai.ai
 
From: Jeff Law
Date: 2023-06-20 02:25
To: Juzhe-Zhong; gcc-patches
CC: kito.cheng; palmer; rdapp.gcc
Subject: Re: [PATCH] RISC-V: Add VLS modes for GNU vectors
 
 
On 6/18/23 17:06, Juzhe-Zhong wrote:
> This patch is a propsal patch is **NOT** ready to push since
> after this patch the total machine modes will exceed 255 which will create ICE
> in LTO:
>internal compiler error: in bp_pack_int_in_range, at data-streamer.h:290
Right.  Note that an ack from Jakub or Richi will be sufficient for the 
LTO fixes to go forward.
 
 
> 
> The reason we need to add VLS modes for following reason:
> 1. Enhance GNU vectors codegen:
> For example:
>   typedef int32_t vnx8si __attribute__ ((vector_size (32)));
> 
>   __attribute__ ((noipa)) void
>   f_vnx8si (int32_t * in, int32_t * out)
>   {
> vnx8si v = *(vnx8si*)in;
> *(vnx8si *) out = v;
>   }
> 
> compile option: --param=riscv-autovec-preference=scalable
>  before this patch:
>  f_vnx8si:
>  ld  a2,0(a0)
>  ld  a3,8(a0)
>  ld  a4,16(a0)
>  ld  a5,24(a0)
>  addisp,sp,-32
>  sd  a2,0(a1)
>  sd  a3,8(a1)
>  sd  a4,16(a1)
>  sd  a5,24(a1)
>  addisp,sp,32
>  jr  ra
> 
> After this patch:
> f_vnx8si:
>  vsetivlizero,8,e32,m2,ta,ma
>  vle32.v v2,0(a0)
>  vse32.v v2,0(a1)
>  ret
> 
> 2. Ehance VLA SLP:
> void
> f (uint8_t *restrict a, uint8_t *restrict b, uint8_t *restrict c)
> {
>for (int i = 0; i < 100; ++i)
>  {
>a[i * 8] = b[i * 8] + c[i * 8];
>a[i * 8 + 1] = b[i * 8] + c[i * 8 + 1];
>a[i * 8 + 2] = b[i * 8 + 2] + c[i * 8 + 2];
>a[i * 8 + 3] = b[i * 8 + 2] + c[i * 8 + 3];
>a[i * 8 + 4] = b[i * 8 + 4] + c[i * 8 + 4];
>a[i * 8 + 5] = b[i * 8 + 4] + c[i * 8 + 5];
>a[i * 8 + 6] = b[i * 8 + 6] + c[i * 8 + 6];
>a[i * 8 + 7] = b[i * 8 + 6] + c[i * 8 + 7];
>  }
> }
> 
> 
> ..
> Loop body:
>   ...
>   vrgatherei16.vv...
>   ...
> 
> Tail:
>   lbu a4,792(a1)
>  lbu a5,792(a2)
>  addwa5,a5,a4
>  sb  a5,792(a0)
>  lbu a5,793(a2)
>  addwa5,a5,a4
>  sb  a5,793(a0)
>  lbu a4,794(a1)
>  lbu a5,794(a2)
>  addwa5,a5,a4
>  sb  a5,794(a0)
>  lbu a5,795(a2)
>  addwa5,a5,a4
>  sb  a5,795(a0)
>  lbu a4,796(a1)
>  lbu a5,796(a2)
>  addwa5,a5,a4
>  sb  a5,796(a0)
>  lbu a5,797(a2)
>  addwa5,a5,a4
>  sb  a5,797(a0)
>  lbu a4,798(a1)
>  lbu a5,798(a2)
>  addwa5,a5,a4
>  sb  a5,798(a0)
>  lbu a5,799(a2)
>  addwa5,a5,a4
>  sb  a5,799(a0)
>  ret
> 
> The tail elements need VLS modes to vectorize like ARM SVE:
> 
> f:
>  mov x3, 0
>  cntbx5
>  mov x4, 792
>  whilelo p7.b, xzr, x4
> .L2:
>  ld1bz31.b, p7/z, [x1, x3]
>  ld1bz30.b, p7/z, [x2, x3]
>  trn1z31.b, z31.b, z31.b
>  add z31.b, z31.b, z30.b
>  st1bz31.b, p7, [x0, x3]
>  add x3, x3, x5
>  whilelo p7.b, x3, x4
>  b.any   .L2
> Tail:
>  ldr b31, [x1, 792]
>  ldr b27, [x1, 794]
>  ldr b28, [x1, 796]
>  dup v31.8b, v31.b[0]
>  ldr b29, [x1, 798]
>  ldr d30, [x2, 792]
>  ins v31.b[2], v27.b[0]
>  ins v31.b[3], v27.b[0]
>  ins v31.b[4], v28.b[0]
>  ins v31.b[5], v28.b[0]
>  ins v31.b[6], v29.b[0]
>  ins v31.b[7], v29.b[0]
>  add v31.8b, v30.8b, v31.8b
>  str d31, [x0, 792]
>  ret
> 
> Notice ARM SVE use ADVSIMD modes (Neon) to vectorize the tail.
 
 
 
> 
> gcc/ChangeLog:
> 
>  * config/riscv/riscv-modes.def (VECTOR_BOOL_MODE): Add VLS modes for 
> GNU vectors.
>  (ADJUST_ALIGNMENT): Ditto.
>  (ADJUST_BYTESIZE): Ditto.
> 
>  (ADJUST_PRECISION): Ditto.
>  (VECTOR_MODES): Ditto.
>  * config/riscv/riscv-protos.h (riscv_v_ext_vls_mode_p):

Re: [PATCH] RISC-V: Add VLS modes for GNU vectors

2023-06-19 Thread Jeff Law via Gcc-patches




On 6/18/23 17:06, Juzhe-Zhong wrote:

This patch is a propsal patch is **NOT** ready to push since
after this patch the total machine modes will exceed 255 which will create ICE
in LTO:
   internal compiler error: in bp_pack_int_in_range, at data-streamer.h:290
Right.  Note that an ack from Jakub or Richi will be sufficient for the 
LTO fixes to go forward.





The reason we need to add VLS modes for following reason:
1. Enhance GNU vectors codegen:
For example:
  typedef int32_t vnx8si __attribute__ ((vector_size (32)));

  __attribute__ ((noipa)) void
  f_vnx8si (int32_t * in, int32_t * out)
  {
vnx8si v = *(vnx8si*)in;
*(vnx8si *) out = v;
  }

compile option: --param=riscv-autovec-preference=scalable
 before this patch:
 f_vnx8si:
 ld  a2,0(a0)
 ld  a3,8(a0)
 ld  a4,16(a0)
 ld  a5,24(a0)
 addisp,sp,-32
 sd  a2,0(a1)
 sd  a3,8(a1)
 sd  a4,16(a1)
 sd  a5,24(a1)
 addisp,sp,32
 jr  ra

After this patch:
f_vnx8si:
 vsetivlizero,8,e32,m2,ta,ma
 vle32.v v2,0(a0)
 vse32.v v2,0(a1)
 ret

2. Ehance VLA SLP:
void
f (uint8_t *restrict a, uint8_t *restrict b, uint8_t *restrict c)
{
   for (int i = 0; i < 100; ++i)
 {
   a[i * 8] = b[i * 8] + c[i * 8];
   a[i * 8 + 1] = b[i * 8] + c[i * 8 + 1];
   a[i * 8 + 2] = b[i * 8 + 2] + c[i * 8 + 2];
   a[i * 8 + 3] = b[i * 8 + 2] + c[i * 8 + 3];
   a[i * 8 + 4] = b[i * 8 + 4] + c[i * 8 + 4];
   a[i * 8 + 5] = b[i * 8 + 4] + c[i * 8 + 5];
   a[i * 8 + 6] = b[i * 8 + 6] + c[i * 8 + 6];
   a[i * 8 + 7] = b[i * 8 + 6] + c[i * 8 + 7];
 }
}


..
Loop body:
  ...
  vrgatherei16.vv...
  ...

Tail:
  lbu a4,792(a1)
 lbu a5,792(a2)
 addwa5,a5,a4
 sb  a5,792(a0)
 lbu a5,793(a2)
 addwa5,a5,a4
 sb  a5,793(a0)
 lbu a4,794(a1)
 lbu a5,794(a2)
 addwa5,a5,a4
 sb  a5,794(a0)
 lbu a5,795(a2)
 addwa5,a5,a4
 sb  a5,795(a0)
 lbu a4,796(a1)
 lbu a5,796(a2)
 addwa5,a5,a4
 sb  a5,796(a0)
 lbu a5,797(a2)
 addwa5,a5,a4
 sb  a5,797(a0)
 lbu a4,798(a1)
 lbu a5,798(a2)
 addwa5,a5,a4
 sb  a5,798(a0)
 lbu a5,799(a2)
 addwa5,a5,a4
 sb  a5,799(a0)
 ret

The tail elements need VLS modes to vectorize like ARM SVE:

f:
 mov x3, 0
 cntbx5
 mov x4, 792
 whilelo p7.b, xzr, x4
.L2:
 ld1bz31.b, p7/z, [x1, x3]
 ld1bz30.b, p7/z, [x2, x3]
 trn1z31.b, z31.b, z31.b
 add z31.b, z31.b, z30.b
 st1bz31.b, p7, [x0, x3]
 add x3, x3, x5
 whilelo p7.b, x3, x4
 b.any   .L2
Tail:
 ldr b31, [x1, 792]
 ldr b27, [x1, 794]
 ldr b28, [x1, 796]
 dup v31.8b, v31.b[0]
 ldr b29, [x1, 798]
 ldr d30, [x2, 792]
 ins v31.b[2], v27.b[0]
 ins v31.b[3], v27.b[0]
 ins v31.b[4], v28.b[0]
 ins v31.b[5], v28.b[0]
 ins v31.b[6], v29.b[0]
 ins v31.b[7], v29.b[0]
 add v31.8b, v30.8b, v31.8b
 str d31, [x0, 792]
 ret

Notice ARM SVE use ADVSIMD modes (Neon) to vectorize the tail.






gcc/ChangeLog:

 * config/riscv/riscv-modes.def (VECTOR_BOOL_MODE): Add VLS modes for 
GNU vectors.
 (ADJUST_ALIGNMENT): Ditto.
 (ADJUST_BYTESIZE): Ditto.

 (ADJUST_PRECISION): Ditto.
 (VECTOR_MODES): Ditto.
 * config/riscv/riscv-protos.h (riscv_v_ext_vls_mode_p): Ditto.
 (get_regno_alignment): Ditto.
 * config/riscv/riscv-v.cc (INCLUDE_ALGORITHM): Ditto.
 (const_vlmax_p): Ditto.
 (legitimize_move): Ditto.
 (get_vlmul): Ditto.
 (get_regno_alignment): Ditto.
 (get_ratio): Ditto.
 (get_vector_mode): Ditto.
 * config/riscv/riscv-vector-switch.def (VLS_ENTRY): Ditto.
 * config/riscv/riscv.cc (riscv_v_ext_vls_mode_p): Ditto.
 (VLS_ENTRY): Ditto.
 (riscv_v_ext_mode_p): Ditto.
 (riscv_hard_regno_nregs): Ditto.
 (riscv_hard_regno_mode_ok): Ditto.
 * config/riscv/riscv.md: Ditto.
 * config/riscv/vector-iterators.md: Ditto.
 * config/riscv/vector.md: Ditto.
 * config/riscv/autovec-vls.md: New file.

---
So I expected we were going to have to define some static length 
patterns at some point.  So this isn't a huge surprise.








diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 6421e933ca9..6fc1c433069 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/ri

[PATCH] RISC-V: Add VLS modes for GNU vectors

2023-06-18 Thread Juzhe-Zhong
This patch is a propsal patch is **NOT** ready to push since
after this patch the total machine modes will exceed 255 which will create ICE
in LTO:
  internal compiler error: in bp_pack_int_in_range, at data-streamer.h:290

The reason we need to add VLS modes for following reason:
1. Enhance GNU vectors codegen:
   For example:
 typedef int32_t vnx8si __attribute__ ((vector_size (32)));

 __attribute__ ((noipa)) void
 f_vnx8si (int32_t * in, int32_t * out)
 {
   vnx8si v = *(vnx8si*)in;
   *(vnx8si *) out = v;
 } 

compile option: --param=riscv-autovec-preference=scalable
before this patch:
f_vnx8si:
ld  a2,0(a0)
ld  a3,8(a0)
ld  a4,16(a0)
ld  a5,24(a0)
addisp,sp,-32
sd  a2,0(a1)
sd  a3,8(a1)
sd  a4,16(a1)
sd  a5,24(a1)
addisp,sp,32
jr  ra

After this patch:
   f_vnx8si:
vsetivlizero,8,e32,m2,ta,ma
vle32.v v2,0(a0)
vse32.v v2,0(a1)
ret

2. Ehance VLA SLP:
void
f (uint8_t *restrict a, uint8_t *restrict b, uint8_t *restrict c)
{
  for (int i = 0; i < 100; ++i)
{
  a[i * 8] = b[i * 8] + c[i * 8];
  a[i * 8 + 1] = b[i * 8] + c[i * 8 + 1];
  a[i * 8 + 2] = b[i * 8 + 2] + c[i * 8 + 2];
  a[i * 8 + 3] = b[i * 8 + 2] + c[i * 8 + 3];
  a[i * 8 + 4] = b[i * 8 + 4] + c[i * 8 + 4];
  a[i * 8 + 5] = b[i * 8 + 4] + c[i * 8 + 5];
  a[i * 8 + 6] = b[i * 8 + 6] + c[i * 8 + 6];
  a[i * 8 + 7] = b[i * 8 + 6] + c[i * 8 + 7];
}
}


..
Loop body:
 ...
 vrgatherei16.vv...
 ...

Tail:
 lbu a4,792(a1)
lbu a5,792(a2)
addwa5,a5,a4
sb  a5,792(a0)
lbu a5,793(a2)
addwa5,a5,a4
sb  a5,793(a0)
lbu a4,794(a1)
lbu a5,794(a2)
addwa5,a5,a4
sb  a5,794(a0)
lbu a5,795(a2)
addwa5,a5,a4
sb  a5,795(a0)
lbu a4,796(a1)
lbu a5,796(a2)
addwa5,a5,a4
sb  a5,796(a0)
lbu a5,797(a2)
addwa5,a5,a4
sb  a5,797(a0)
lbu a4,798(a1)
lbu a5,798(a2)
addwa5,a5,a4
sb  a5,798(a0)
lbu a5,799(a2)
addwa5,a5,a4
sb  a5,799(a0)
ret

The tail elements need VLS modes to vectorize like ARM SVE:

f:
mov x3, 0
cntbx5
mov x4, 792
whilelo p7.b, xzr, x4
.L2:
ld1bz31.b, p7/z, [x1, x3]
ld1bz30.b, p7/z, [x2, x3]
trn1z31.b, z31.b, z31.b
add z31.b, z31.b, z30.b
st1bz31.b, p7, [x0, x3]
add x3, x3, x5
whilelo p7.b, x3, x4
b.any   .L2
Tail:
ldr b31, [x1, 792]
ldr b27, [x1, 794]
ldr b28, [x1, 796]
dup v31.8b, v31.b[0]
ldr b29, [x1, 798]
ldr d30, [x2, 792]
ins v31.b[2], v27.b[0]
ins v31.b[3], v27.b[0]
ins v31.b[4], v28.b[0]
ins v31.b[5], v28.b[0]
ins v31.b[6], v29.b[0]
ins v31.b[7], v29.b[0]
add v31.8b, v30.8b, v31.8b
str d31, [x0, 792]
ret

Notice ARM SVE use ADVSIMD modes (Neon) to vectorize the tail.

gcc/ChangeLog:

* config/riscv/riscv-modes.def (VECTOR_BOOL_MODE): Add VLS modes for 
GNU vectors.
(ADJUST_ALIGNMENT): Ditto.
(ADJUST_BYTESIZE): Ditto.

(ADJUST_PRECISION): Ditto.
(VECTOR_MODES): Ditto.
* config/riscv/riscv-protos.h (riscv_v_ext_vls_mode_p): Ditto.
(get_regno_alignment): Ditto.
* config/riscv/riscv-v.cc (INCLUDE_ALGORITHM): Ditto.
(const_vlmax_p): Ditto.
(legitimize_move): Ditto.
(get_vlmul): Ditto.
(get_regno_alignment): Ditto.
(get_ratio): Ditto.
(get_vector_mode): Ditto.
* config/riscv/riscv-vector-switch.def (VLS_ENTRY): Ditto.
* config/riscv/riscv.cc (riscv_v_ext_vls_mode_p): Ditto.
(VLS_ENTRY): Ditto.
(riscv_v_ext_mode_p): Ditto.
(riscv_hard_regno_nregs): Ditto.
(riscv_hard_regno_mode_ok): Ditto.
* config/riscv/riscv.md: Ditto.
* config/riscv/vector-iterators.md: Ditto.
* config/riscv/vector.md: Ditto.
* config/riscv/autovec-vls.md: New file.

---
 gcc/config/riscv/autovec-vls.md  | 102 +++
 gcc/config/riscv/riscv-modes.def |  72 +
 gcc/config/riscv/riscv-protos.h  |   2 +
 gcc/config/riscv/riscv-v.cc  | 122 ++-
 gcc/config/riscv/riscv-vector-switch.def |  62 
 gcc/config/riscv/riscv.cc|  45 ++---
 gcc/config/riscv/riscv.md|   6 +-
 gcc/config/riscv/vector-iterators.md |  57 +++
 gcc/config/riscv/vector.md   | 113 +++--
 9 files change