Re: [5/6] Account for the cost of generating loop masks

2019-11-06 Thread Richard Biener
On Tue, Nov 5, 2019 at 3:31 PM Richard Sandiford wrote:
>
> We didn't take the cost of generating loop masks into account, and so
> tended to underestimate the cost of loops that need multiple masks.

OK.

>
> 2019-11-05  Richard Sandiford  
>
> gcc/
> * tree-vect-loop.c (vect_estimate_min_profitable_iters): Include
> the cost of generating loop masks.
>
> gcc/testsuite/
> * gcc.target/aarch64/sve/mask_struct_store_3.c: Add
> -fno-vect-cost-model.
> * gcc.target/aarch64/sve/mask_struct_store_3_run.c: Likewise.
> * gcc.target/aarch64/sve/peel_ind_3.c: Likewise.
> * gcc.target/aarch64/sve/peel_ind_3_run.c: Likewise.
>
> Index: gcc/tree-vect-loop.c
> ===
> --- gcc/tree-vect-loop.c  2019-11-05 14:19:58.781197820 +
> +++ gcc/tree-vect-loop.c  2019-11-05 14:20:40.188909187 +
> @@ -3435,6 +3435,32 @@ vect_estimate_min_profitable_iters (loop
>   si->kind, si->stmt_info, si->misalign,
>   vect_epilogue);
> }
> +
> +  /* Calculate how many masks we need to generate.  */
> +  unsigned int num_masks = 0;
> +  rgroup_masks *rgm;
> +  unsigned int num_vectors_m1;
> +  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
> +   if (rgm->mask_type)
> + num_masks += num_vectors_m1 + 1;
> +  gcc_assert (num_masks > 0);
> +
> +  /* In the worst case, we need to generate each mask in the prologue
> +and in the loop body.  One of the loop body mask instructions
> +replaces the comparison in the scalar loop, and since we don't
> +count the scalar comparison against the scalar body, we shouldn't
> +count that vector instruction against the vector body either.
> +
> +Sometimes we can use unpacks instead of generating prologue
> +masks and sometimes the prologue mask will fold to a constant,
> +so the actual prologue cost might be smaller.  However, it's
> +simpler and safer to use the worst-case cost; if this ends up
> +being the tie-breaker between vectorizing or not, then it's
> +probably better not to vectorize.  */
> +  (void) add_stmt_cost (target_cost_data, num_masks, vector_stmt,
> +   NULL, 0, vect_prologue);
> +  (void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt,
> +   NULL, 0, vect_body);
>  }
>else if (npeel < 0)
>  {
> Index: gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c
> ===
> --- gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c  2019-03-08 18:14:29.768994780 +
> +++ gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c  2019-11-05 14:20:40.184909216 +
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
> +/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */
>
>  #include <stdint.h>
>
> Index: gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c
> ===
> --- gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c  2019-03-08 18:14:29.772994767 +
> +++ gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c  2019-11-05 14:20:40.184909216 +
> @@ -1,5 +1,5 @@
>  /* { dg-do run { target aarch64_sve_hw } } */
> -/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
> +/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */
>
>  #include "mask_struct_store_3.c"
>
> Index: gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c
> ===
> --- gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c   2019-03-08 18:14:29.776994751 +
> +++ gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c   2019-11-05 14:20:40.184909216 +
> @@ -1,7 +1,7 @@
>  /* { dg-do compile } */
>  /* Pick an arbitrary target for which unaligned accesses are more
> expensive.  */
> -/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx" } */
> +/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx -fno-vect-cost-model" } */
>
>  #define N 32
>  #define MAX_START 8
> Index: gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c
> ===
> --- gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c   2019-03-08 18:14:29.784994721 +
> +++ gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c   2019-11-05 14:20:40.184909216 +
> @@ -1,6 +1,6 @@
>  /* { dg-do run { target aarch64_sve_hw } } */
> -/* { dg-options "-O3 -mtune=thunderx" } */
> -/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
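> +/* { dg-options "-O3 -mtune=thunderx -fno-vect-cost-model" } */
> +/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256 -fno-vect-cost-model" { target aarch64_sve256_hw } } */
>
>  #include "peel_ind_3.c"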

[5/6] Account for the cost of generating loop masks

2019-11-05 Thread Richard Sandiford
We didn't take the cost of generating loop masks into account, and so
tended to underestimate the cost of loops that need multiple masks.


2019-11-05  Richard Sandiford  

gcc/
* tree-vect-loop.c (vect_estimate_min_profitable_iters): Include
the cost of generating loop masks.

gcc/testsuite/
* gcc.target/aarch64/sve/mask_struct_store_3.c: Add
-fno-vect-cost-model.
* gcc.target/aarch64/sve/mask_struct_store_3_run.c: Likewise.
* gcc.target/aarch64/sve/peel_ind_3.c: Likewise.
* gcc.target/aarch64/sve/peel_ind_3_run.c: Likewise.

Index: gcc/tree-vect-loop.c
===
--- gcc/tree-vect-loop.c  2019-11-05 14:19:58.781197820 +
+++ gcc/tree-vect-loop.c  2019-11-05 14:20:40.188909187 +
@@ -3435,6 +3435,32 @@ vect_estimate_min_profitable_iters (loop
  si->kind, si->stmt_info, si->misalign,
  vect_epilogue);
}
+
+  /* Calculate how many masks we need to generate.  */
+  unsigned int num_masks = 0;
+  rgroup_masks *rgm;
+  unsigned int num_vectors_m1;
+  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
+   if (rgm->mask_type)
+ num_masks += num_vectors_m1 + 1;
+  gcc_assert (num_masks > 0);
+
+  /* In the worst case, we need to generate each mask in the prologue
+and in the loop body.  One of the loop body mask instructions
+replaces the comparison in the scalar loop, and since we don't
+count the scalar comparison against the scalar body, we shouldn't
+count that vector instruction against the vector body either.
+
+Sometimes we can use unpacks instead of generating prologue
+masks and sometimes the prologue mask will fold to a constant,
+so the actual prologue cost might be smaller.  However, it's
+simpler and safer to use the worst-case cost; if this ends up
+being the tie-breaker between vectorizing or not, then it's
+probably better not to vectorize.  */
+  (void) add_stmt_cost (target_cost_data, num_masks, vector_stmt,
+   NULL, 0, vect_prologue);
+  (void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt,
+   NULL, 0, vect_body);
 }
   else if (npeel < 0)
 {
Index: gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c
===
--- gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c  2019-03-08 18:14:29.768994780 +
+++ gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c  2019-11-05 14:20:40.184909216 +
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */
 
 #include <stdint.h>
 
Index: gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c
===
--- gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c  2019-03-08 18:14:29.772994767 +
+++ gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c  2019-11-05 14:20:40.184909216 +
@@ -1,5 +1,5 @@
 /* { dg-do run { target aarch64_sve_hw } } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */
 
 #include "mask_struct_store_3.c"
 
Index: gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c
===
--- gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c   2019-03-08 18:14:29.776994751 +
+++ gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c   2019-11-05 14:20:40.184909216 +
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* Pick an arbitrary target for which unaligned accesses are more
expensive.  */
-/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx" } */
+/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx -fno-vect-cost-model" } */
 
 #define N 32
 #define MAX_START 8
Index: gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c
===
--- gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c   2019-03-08 18:14:29.784994721 +
+++ gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c   2019-11-05 14:20:40.184909216 +
@@ -1,6 +1,6 @@
 /* { dg-do run { target aarch64_sve_hw } } */
-/* { dg-options "-O3 -mtune=thunderx" } */
-/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -mtune=thunderx -fno-vect-cost-model" } */
+/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256 -fno-vect-cost-model" { target aarch64_sve256_hw } } */
 
 #include "peel_ind_3.c"