[PATCH] VECT: Apply MASK_LEN_{LOAD_LANES, STORE_LANES} into vectorizer

2023-08-13 Thread juzhe . zhong
From: Ju-Zhe Zhong 

Hi, Richard and Richi.

This patch adds MASK_LEN_{LOAD_LANES,STORE_LANES} support to the vectorizer.

Consider this simple case:

void __attribute__ ((noinline, noclone))
foo (int *__restrict a, int *__restrict b, int *__restrict c,
     int *__restrict d, int *__restrict e, int *__restrict f,
     int *__restrict g, int *__restrict h, int *__restrict j, int n)
{
  for (int i = 0; i < n; ++i)
    {
      a[i] = j[i * 8];
      b[i] = j[i * 8 + 1];
      c[i] = j[i * 8 + 2];
      d[i] = j[i * 8 + 3];
      e[i] = j[i * 8 + 4];
      f[i] = j[i * 8 + 5];
      g[i] = j[i * 8 + 6];
      h[i] = j[i * 8 + 7];
    }
}

RVV Gimple IR:

  _79 = .SELECT_VL (ivtmp_81, POLY_INT_CST [4, 4]);
  ivtmp_125 = _79 * 32;
  vect_array.8 = .MASK_LEN_LOAD_LANES (vectp_j.6_124, 32B, { -1, ... }, _79, 0);
  vect__8.9_122 = vect_array.8[0];
  vect__8.10_121 = vect_array.8[1];
  vect__8.11_120 = vect_array.8[2];
  vect__8.12_119 = vect_array.8[3];
  vect__8.13_118 = vect_array.8[4];
  vect__8.14_117 = vect_array.8[5];
  vect__8.15_116 = vect_array.8[6];
  vect__8.16_115 = vect_array.8[7];
  vect_array.8 ={v} {CLOBBER};
  ivtmp_114 = _79 * 4;
  .MASK_LEN_STORE (vectp_a.17_113, 32B, { -1, ... }, _79, 0, vect__8.9_122);
  .MASK_LEN_STORE (vectp_b.19_109, 32B, { -1, ... }, _79, 0, vect__8.10_121);
  .MASK_LEN_STORE (vectp_c.21_105, 32B, { -1, ... }, _79, 0, vect__8.11_120);
  .MASK_LEN_STORE (vectp_d.23_101, 32B, { -1, ... }, _79, 0, vect__8.12_119);
  .MASK_LEN_STORE (vectp_e.25_97, 32B, { -1, ... }, _79, 0, vect__8.13_118);
  .MASK_LEN_STORE (vectp_f.27_93, 32B, { -1, ... }, _79, 0, vect__8.14_117);
  .MASK_LEN_STORE (vectp_g.29_89, 32B, { -1, ... }, _79, 0, vect__8.15_116);
  .MASK_LEN_STORE (vectp_h.31_85, 32B, { -1, ... }, _79, 0, vect__8.16_115);

ASM:

foo:
lw  t4,8(sp)
ld  t5,0(sp)
ble t4,zero,.L5
.L3:
vsetvli t1,t4,e8,mf4,ta,ma
vlseg8e32.v v8,(t5)
slli    t3,t1,2
slli    t6,t1,5
vse32.v v8,0(a0)
vse32.v v9,0(a1)
vse32.v v10,0(a2)
vse32.v v11,0(a3)
vse32.v v12,0(a4)
vse32.v v13,0(a5)
vse32.v v14,0(a6)
vse32.v v15,0(a7)
sub t4,t4,t1
add t5,t5,t6
add a0,a0,t3
add a1,a1,t3
add a2,a2,t3
add a3,a3,t3
add a4,a4,t3
add a5,a5,t3
add a6,a6,t3
add a7,a7,t3
bne t4,zero,.L3
.L5:
ret

The details of the approach:

Step 1 - Modify the LANES LOAD/STORE support functions (vect_load_lanes_supported/vect_store_lanes_supported):

+/* Return FN if vec_{masked_,mask_len,}load_lanes is available for COUNT
+   vectors of type VECTYPE.  MASKED_P says whether the masked form is
+   needed.  */
 
-bool
+internal_fn
 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
                           bool masked_p)
 {
-  if (masked_p)
-    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
-                                         vec_mask_load_lanes_optab,
-                                         vectype, count);
+  if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
+                                    vec_mask_len_load_lanes_optab,
+                                    vectype, count))
+    return IFN_MASK_LEN_LOAD_LANES;
+  else if (masked_p)
+    {
+      if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
+                                        vec_mask_load_lanes_optab,
+                                        vectype, count))
+        return IFN_MASK_LOAD_LANES;
+    }
   else
-    return vect_lanes_optab_supported_p ("vec_load_lanes",
-                                         vec_load_lanes_optab,
-                                         vectype, count);
+    {
+      if (vect_lanes_optab_supported_p ("vec_load_lanes",
+                                        vec_load_lanes_optab,
+                                        vectype, count))
+        return IFN_LOAD_LANES;
+    }
+  return IFN_LAST;
 }
 
Instead of returning TRUE or FALSE to say whether the target supports the LANES LOAD/STORE,
the function now returns the internal_fn of the LANES LOAD/STORE variant the target supports.
If the target supports none of the LANES LOAD/STORE optabs, it returns IFN_LAST.
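
For callers, the practical effect is that the result can be kept in an internal_fn
variable and compared against a specific lanes IFN later, instead of a plain yes/no
test.  A minimal caller sketch (illustrative only; the variable name lanes_ifn and
the surrounding dispatch are not taken verbatim from the patch):

  /* Sketch: record which lanes IFN the target provides and dispatch on it.  */
  internal_fn lanes_ifn
    = vect_store_lanes_supported (vectype, group_size, masked_p);
  if (lanes_ifn == IFN_MASK_LEN_STORE_LANES)
    {
      /* Length-controlled lanes store: also needs len/bias operands.  */
    }
  else if (lanes_ifn == IFN_MASK_STORE_LANES || lanes_ifn == IFN_STORE_LANES)
    {
      /* Masked or plain lanes store, as before.  */
    }
  else
    /* No lanes support at all; fall back to another strategy.  */
    gcc_assert (lanes_ifn == IFN_LAST);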

Step 2 - Build MASK_LEN_{LOAD_LANES,STORE_LANES} Gimple IR:

+         if (vect_store_lanes_supported (vectype, group_size, false)
+             == IFN_MASK_LEN_STORE_LANES)
+           {
+             if (loop_lens)
+               final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+                                              ncopies, vectype, j, 1);
+             else
+               final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
+             signed char biasval
+               = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+             bias = build_int_cst (intQI_type_node, biasval);
+             if (!final_mask)
+               {
+                 mask_vectype = t
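
For orientation, a rough sketch of how the final lanes store call could then be
assembled once final_mask, final_len and bias are computed.  This is a hypothetical
illustration only, not the patch's actual hunk: the operand shape follows the
.MASK_LEN_LOAD_LANES/.MASK_LEN_STORE calls in the GIMPLE dump above, and names such
as dataref_ptr, alias_ptr and vec_array are assumed.

  /* Hypothetical sketch: emit
     .MASK_LEN_STORE_LANES (ptr, alias align, mask, len, bias, vec_array).  */
  gcall *call
    = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
                                  dataref_ptr, alias_ptr, final_mask,
                                  final_len, bias, vec_array);
  gimple_call_set_nothrow (call, true);
  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);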

Re: Re: [PATCH] VECT: Apply MASK_LEN_{LOAD_LANES, STORE_LANES} into vectorizer

2023-08-15 Thread juzhe.zh...@rivai.ai
Hi, Richi.

> +   if (vect_store_lanes_supported (vectype, group_size, false)
> +   == IFN_MASK_LEN_STORE_LANES)

>> can you use the previously computed 'ifn' here please?

Do you mean rewriting the code as follows?

internal_fn lanes_ifn = vect_store_lanes_supported (vectype, group_size, false);

if (lanes_ifn == IFN_MASK_LEN_STORE_LANES)

>> I think the patch needs refreshing after r14-3214-ga74d0d36a3f337.

Yeah, I'm working on it and will test on both x86 and ARM.

Thanks.


juzhe.zh...@rivai.ai
 

Re: Re: [PATCH] VECT: Apply MASK_LEN_{LOAD_LANES,STORE_LANES} into vectorizer

2023-08-15 Thread Richard Biener via Gcc-patches
On Tue, 15 Aug 2023, juzhe.zh...@rivai.ai wrote:

> Hi, Richi.
> 
> > + if (vect_store_lanes_supported (vectype, group_size, false)
> > + == IFN_MASK_LEN_STORE_LANES)
> 
> >> can you use the previously computed 'ifn' here please?
> 
> Do you mean rewriting the code as follows?
> 
> internal_fn lanes_ifn = vect_store_lanes_supported (vectype, group_size, false);
> 
> if (lanes_ifn == IFN_MASK_LEN_STORE_LANES)

The vect_store_lanes_supported check is performed during analysis already,
and ideally we'd not re-do it, so please save the result in a variable at
that point.
 
Re: Re: [PATCH] VECT: Apply MASK_LEN_{LOAD_LANES, STORE_LANES} into vectorizer

2023-08-15 Thread juzhe.zh...@rivai.ai
Hi, Richi.

I realize this code performs the analysis for loads/stores:

+  internal_fn lanes_ifn;
   if (!get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask, vls_type,
                             ncopies, &memory_access_type, &poffset,
-                            &alignment_support_scheme, &misalignment, &gs_info))
+                            &alignment_support_scheme, &misalignment, &gs_info,
+                            &lanes_ifn))

This function generates the gather/scatter info "gs_info" during analysis.

Following the same approach, I add "&lanes_ifn" here, which computes the IFN for the lanes load/store.

Does that sound reasonable?

Thanks.


juzhe.zh...@rivai.ai
 

Re: Re: [PATCH] VECT: Apply MASK_LEN_{LOAD_LANES,STORE_LANES} into vectorizer

2023-08-15 Thread Richard Biener via Gcc-patches
On Tue, 15 Aug 2023, juzhe.zh...@rivai.ai wrote:

> Hi, Richi.
> 
> I realize this code performs the analysis for loads/stores:
> 
> +  internal_fn lanes_ifn;
>    if (!get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask, vls_type,
>                              ncopies, &memory_access_type, &poffset,
> -                            &alignment_support_scheme, &misalignment, &gs_info))
> +                            &alignment_support_scheme, &misalignment, &gs_info,
> +                            &lanes_ifn))
> 
> This function generates the gather/scatter info "gs_info" during analysis.
> 
> Following the same approach, I add "&lanes_ifn" here, which computes the IFN for the lanes load/store.
> 
> Does that sound reasonable?

Ah, OK.  I guess re-computing it is OK then (once).

Richard.
