[PATCH 00/10] vect: Reuse reduction accumulators between loops

2021-07-08 Thread Richard Sandiford via Gcc-patches
Quoting from the final patch in the series:


This patch adds support for reusing a main loop's reduction accumulator
in an epilogue loop.  This in turn lets the loops share a single piece
of vector->scalar reduction code.
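
To make that concrete, here is a minimal C sketch of the transformation,
written with GCC's generic vector extensions.  The function name, the
fixed 128-bit vector width and the scalar element loop in the epilogue
are illustrative assumptions, not the vectorizer's actual output (which
uses a predicated vector add in the epilogue):

  typedef unsigned short v8hi __attribute__ ((vector_size (16)));

  unsigned short
  add_loop_sketch (unsigned short *x, int n)
  {
    v8hi acc = { 0, 0, 0, 0, 0, 0, 0, 0 };
    int i = 0;

    /* Main loop: accumulate one full vector per iteration.  */
    for (; i + 8 <= n; i += 8)
      {
        v8hi v;
        __builtin_memcpy (&v, x + i, sizeof v);
        acc += v;
      }

    /* Epilogue: keep accumulating into the *same* vector accumulator
       (element 0 stands in for the real predicated vector add).  */
    for (; i < n; ++i)
      acc[0] += x[i];

    /* The single vector->scalar reduction, now shared by both loops.  */
    unsigned short res = 0;
    for (int j = 0; j < 8; ++j)
      res += acc[j];
    return res;
  }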

The patch has the following restrictions:

(1) The epilogue reduction can only operate on a single vector
(e.g. ncopies must be 1 for non-SLP reductions, and the group size
must be <= the element count for SLP reductions).

(2) Both loops must use the same vector mode for their accumulators.
This means that the patch is restricted to targets that support
--param vect-partial-vector-usage=1.

(3) The reduction must be a standard “tree code” reduction.

However, these restrictions could be lifted in future.  For example,
if the main loop operates on 128-bit vectors and the epilogue loop
operates on 64-bit vectors, we could in future reduce the 128-bit
vector by one stage and use the 64-bit result as the starting point
for the epilogue result.
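
As a hypothetical sketch of that extension (again using GCC's generic
vector extensions; the helper name is made up), the single reduction
stage would fold the high half of the 128-bit accumulator into the low
half:

  typedef unsigned short v8hi __attribute__ ((vector_size (16)));
  typedef unsigned short v4hi __attribute__ ((vector_size (8)));

  /* One reduction stage: add the high half of the 128-bit accumulator
     to the low half, giving the 64-bit accumulator that the epilogue
     loop would start from.  */
  static v4hi
  fold_to_half (v8hi acc)
  {
    v4hi lo = { acc[0], acc[1], acc[2], acc[3] };
    v4hi hi = { acc[4], acc[5], acc[6], acc[7] };
    return lo + hi;
  }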

The patch tries to handle chained SLP reductions, unchained SLP
reductions and non-SLP reductions.  It also handles cases in which
the epilogue loop is entered directly (rather than via the main loop)
and cases in which the epilogue loop can be skipped.
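
For reference, the three reduction shapes look roughly like the
following source forms (illustrative only; "chained" here refers to
GCC's reduction chains, in which several statements accumulate into a
single variable per iteration):

  /* Non-SLP reduction: one accumulator, one statement.  */
  unsigned short
  red_plain (unsigned short *x, int n)
  {
    unsigned short res = 0;
    for (int i = 0; i < n; ++i)
      res += x[i];
    return res;
  }

  /* Chained SLP reduction: several statements feeding one accumulator.  */
  unsigned short
  red_chain (unsigned short *x, int n)
  {
    unsigned short res = 0;
    for (int i = 0; i < n; ++i)
      {
        res += x[2 * i];
        res += x[2 * i + 1];
      }
    return res;
  }

  /* Unchained SLP reduction: parallel accumulators, combined at the end.  */
  unsigned short
  red_slp (unsigned short *x, int n)
  {
    unsigned short res0 = 0, res1 = 0;
    for (int i = 0; i < n; ++i)
      {
        res0 += x[2 * i];
        res1 += x[2 * i + 1];
      }
    return res0 + res1;
  }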


However, it ended up being difficult to implement that accumulator reuse
without some preparatory clean-ups.  Some of those clean-ups could
probably stand on their own, but others are a bit “meh” without the
final patch to justify them.

The diff below shows the effect of the patch when compiling:

  unsigned short __attribute__((noipa))
  add_loop (unsigned short *x, int n)
  {
    unsigned short res = 0;
    for (int i = 0; i < n; ++i)
      res += x[i];
    return res;
  }

with -O3 --param vect-partial-vector-usage=1 on an SVE target:

add_loop:                                      add_loop:
.LFB0:                                         .LFB0:
        .cfi_startproc                                 .cfi_startproc
        mov     x4, x0                       <
        cmp     w1, 0                                  cmp     w1, 0
        ble     .L7                                    ble     .L7
        cnth    x0                           |         cnth    x4
        sub     w2, w1, #1                             sub     w2, w1, #1
        sub     w3, w0, #1                   |         sub     w3, w4, #1
        cmp     w2, w3                                 cmp     w2, w3
        bcc     .L8                                    bcc     .L8
        sub     w0, w1, w0                   |         sub     w4, w1, w4
        mov     x3, 0                                  mov     x3, 0
        cnth    x5                                     cnth    x5
        mov     z0.b, #0                               mov     z0.b, #0
        ptrue   p0.b, all                              ptrue   p0.b, all
        .p2align 3,,7                                  .p2align 3,,7
.L4:                                           .L4:
        ld1h    z1.h, p0/z, [x4, x3, lsl 1]  |         ld1h    z1.h, p0/z, [x0, x3, lsl 1]
        mov     x2, x3                                 mov     x2, x3
        add     x3, x3, x5                             add     x3, x3, x5
        add     z0.h, z0.h, z1.h                       add     z0.h, z0.h, z1.h
        cmp     w0, w3                       |         cmp     w4, w3
        bcs     .L4                                    bcs     .L4
        uaddv   d0, p0, z0.h                 <
        umov    w0, v0.h[0]                  <
        inch    x2                                     inch    x2
        and     w0, w0, 65535                <
        cmp     w1, w2                                 cmp     w1, w2
        beq     .L2                          |         beq     .L6
.L3:                                           .L3:
        sub     w1, w1, w2                             sub     w1, w1, w2
        mov     z1.b, #0                     |         add     x2, x0, w2, uxtw 1
        whilelo p0.h, wzr, w1                          whilelo p0.h, wzr, w1
        add     x2, x4, w2, uxtw 1           |         ld1h    z1.h, p0/z, [x2]
        ptrue   p1.b, all                    |         add     z0.h, p0/m, z0.h, z1.h
        ld1h    z0.h, p0/z, [x2]             | .L6:
        sel     z0.h, p0, z0.h, z1.h         |         ptrue   p0.b, all
        uaddv   d0, p1, z0.h                 |         uaddv   d0, p0, z0.h
        fmov    x1, d0                       |         umov    w0, v0.h[0]
        add     w0, w0, w1, uxth             <
        and     w0, w0, 65535                          and     w0, w0, 65535
.L2:                                         <
        ret                                            ret
        .p2align 2,,3                                  .p2align 2,,3
.L7:                                           .L7:
        mov     w0, 0                                  mov     w0, 0
        ret                                            ret
.L8:                                           .L8:
        mov     w2, 0                                  mov     w2, 0
        mov     w0, 0                        |         mov     z0.b, #0
        b       .L3                                    b       .L3