https://gcc.gnu.org/g:9ae83078fe45d093bbaa02b8348f2407fe0c62d6

commit r14-9344-g9ae83078fe45d093bbaa02b8348f2407fe0c62d6
Author: Robin Dapp <rd...@ventanamicro.com>
Date:   Mon Jan 15 17:34:58 2024 +0100

    RISC-V: Adjust vec unit-stride load/store costs.
    
    Scalar loads and stores provide offset addressing while unit-stride
    vector instructions do not.  The offset must be loaded into a
    general-purpose register before it can be used.  In order to account
    for this, this patch adds an address arithmetic heuristic that keeps
    track of data reference operands.  If we haven't seen an operand
    before, we add the cost of a scalar statement.
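
    To illustrate the heuristic with a minimal, self-contained sketch
    (plain std::set and std::string stand in for GCC's hash_set/pair_hash
    and tree operands; the helper names and cost value below are
    illustrative, not part of the patch): remember every (base, offset)
    address pair already costed and charge one extra scalar statement
    only the first time a new pair shows up.

    /* Minimal sketch of the address-arithmetic heuristic.  */
    #include <iostream>
    #include <set>
    #include <string>
    #include <utility>

    static const int SCALAR_STMT_COST = 1;  /* illustrative value only */

    struct addr_cost_tracker
    {
      /* (base, offset) pairs already accounted for.  */
      std::set<std::pair<std::string, std::string>> memrefs;

      /* Extra cost of a unit-stride vector load/store whose address is
         BASE + OFFSET: the first time we see the pair we assume an extra
         scalar add/sub is needed to materialize the address.  */
      int extra_cost (const std::string &base, const std::string &offset)
      {
        if (memrefs.count ({base, offset}))
          return 0;
        memrefs.insert ({base, offset});
        return SCALAR_STMT_COST;
      }
    };

    int main ()
    {
      addr_cost_tracker t;
      std::cout << t.extra_cost ("grid", "i*8") << "\n";    /* 1: first use pays */
      std::cout << t.extra_cost ("grid", "i*8") << "\n";    /* 0: seen before */
      std::cout << t.extra_cost ("grid", "i*8+32") << "\n"; /* 1: new pair */
      return 0;
    }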
    
    This helps to get rid of an lbm regression when vectorizing (roughly
    0.5% fewer dynamic instructions).  gcc5 improves by 0.2% and deepsjeng
    by 0.25%.  wrf and nab degrade by 0.1%.  This is because we now adjust
    the cost of SLP-vectorized as well as loop-vectorized instructions,
    whereas before we would only adjust loop-vectorized instructions.
    Considering the higher scalar_to_vec cost (3 vs 1) for all
    vectorization types causes some snippets not to get vectorized
    anymore.  Given these costs the decision looks correct but appears
    worse when just counting dynamic instructions.
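
    As a hedged illustration of how the higher scalar_to_vec cost can flip
    a vectorization decision (all numbers below are made up and not taken
    from the benchmarks or the real cost model; only the scalar_to_vec
    1 -> 3 change mirrors the message above):

    #include <iostream>

    int main ()
    {
      const int scalar_cost = 12;      /* hypothetical scalar loop body cost */
      const int vec_base_cost = 8;     /* hypothetical vector cost w/o moves */
      const int n_scalar_to_vec = 2;   /* hypothetical GPR->vector moves */
      const int s2v_costs[] = {1, 3};  /* old vs. new scalar_to_vec cost */

      for (int s2v : s2v_costs)
        {
          int vec_cost = vec_base_cost + n_scalar_to_vec * s2v;
          std::cout << "scalar_to_vec=" << s2v << ": vector " << vec_cost
                    << (vec_cost < scalar_cost ? " < " : " >= ") << scalar_cost
                    << " scalar -> "
                    << (vec_cost < scalar_cost ? "vectorize" : "stay scalar")
                    << "\n";
        }
      return 0;
    }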
    
    In total SPECint 2017 executes 4 bln fewer dynamic instructions and
    SPECfp 2017 0.7 bln fewer.
    
    gcc/ChangeLog:
    
            * config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Move...
            (costs::adjust_stmt_cost): ... to here and add vec_load/vec_store
            offset handling.
            (costs::add_stmt_cost): Also adjust cost for statements without
            stmt_info.
            * config/riscv/riscv-vector-costs.h: Define zero constant.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.dg/vect/costmodel/riscv/rvv/vse-slp-1.c: New test.
            * gcc.dg/vect/costmodel/riscv/rvv/vse-slp-2.c: New test.

Diff:
---
 gcc/config/riscv/riscv-vector-costs.cc             | 86 +++++++++++++++++++---
 gcc/config/riscv/riscv-vector-costs.h              | 10 +++
 .../gcc.dg/vect/costmodel/riscv/rvv/vse-slp-1.c    | 51 +++++++++++++
 .../gcc.dg/vect/costmodel/riscv/rvv/vse-slp-2.c    | 51 +++++++++++++
 4 files changed, 188 insertions(+), 10 deletions(-)

diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 7c9840df4e9..adf9c197df5 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -42,6 +42,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "backend.h"
 #include "tree-data-ref.h"
 #include "tree-ssa-loop-niter.h"
+#include "tree-hash-traits.h"
 
 /* This file should be included last.  */
 #include "riscv-vector-costs.h"
@@ -1047,18 +1048,81 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
    top of riscv_builtin_vectorization_cost handling which doesn't have any
    information on statement operation codes etc.  */
 
-static unsigned
-adjust_stmt_cost (enum vect_cost_for_stmt kind, tree vectype, int stmt_cost)
+unsigned
+costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
+                        stmt_vec_info stmt_info,
+                        slp_tree, tree vectype, int stmt_cost)
 {
   const cpu_vector_cost *costs = get_vector_costs ();
   switch (kind)
     {
     case scalar_to_vec:
-      return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR
-                                                 : costs->regmove->GR2VR);
+      stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR
+                   : costs->regmove->GR2VR);
+      break;
     case vec_to_scalar:
-      return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->VR2FR
-                                                 : costs->regmove->VR2GR);
+      stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->VR2FR
+                   : costs->regmove->VR2GR);
+      break;
+    case vector_load:
+    case vector_store:
+       {
+         /* Unit-stride vector loads and stores do not have offset addressing
+            as opposed to scalar loads and stores.
+            If the address depends on a variable we need an additional
+            add/sub for each load/store in the worst case.  */
+         if (stmt_info && stmt_info->stmt)
+           {
+             data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+             class loop *father = stmt_info->stmt->bb->loop_father;
+             if (!loop && father && !father->inner && father->superloops)
+               {
+                 tree ref;
+                 if (TREE_CODE (dr->ref) != MEM_REF
+                     || !(ref = TREE_OPERAND (dr->ref, 0))
+                     || TREE_CODE (ref) != SSA_NAME)
+                   break;
+
+                 if (SSA_NAME_IS_DEFAULT_DEF (ref))
+                   break;
+
+                 if (memrefs.contains ({ref, cst0}))
+                   break;
+
+                 memrefs.add ({ref, cst0});
+
+                 /* In case we have not seen REF before and the base address
+                    is a pointer operation try a bit harder.  */
+                 tree base = DR_BASE_ADDRESS (dr);
+                 if (TREE_CODE (base) == POINTER_PLUS_EXPR
+                     || TREE_CODE (base) == POINTER_DIFF_EXPR)
+                   {
+                     /* Deconstruct BASE's first operand.  If it is a binary
+                        operation, i.e. a base and an "offset" store this
+                        pair.  Only increase the stmt_cost if we haven't seen
+                        it before.  */
+                     tree argp = TREE_OPERAND (base, 1);
+                     typedef std::pair<tree, tree> addr_pair;
+                     addr_pair pair;
+                     if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
+                       {
+                         tree argp0 = tree_strip_nop_conversions
+                           (TREE_OPERAND (argp, 0));
+                         tree argp1 = TREE_OPERAND (argp, 1);
+                         pair = addr_pair (argp0, argp1);
+                         if (memrefs.contains (pair))
+                           break;
+
+                         memrefs.add (pair);
+                         stmt_cost += builtin_vectorization_cost (scalar_stmt,
+                                                                  NULL_TREE, 0);
+                       }
+                   }
+               }
+           }
+         break;
+       }
+
     default:
       break;
     }
@@ -1067,7 +1131,7 @@ adjust_stmt_cost (enum vect_cost_for_stmt kind, tree vectype, int stmt_cost)
 
 unsigned
 costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
-                     stmt_vec_info stmt_info, slp_tree, tree vectype,
+                     stmt_vec_info stmt_info, slp_tree node, tree vectype,
                      int misalign, vect_cost_model_location where)
 {
   int stmt_cost
@@ -1080,6 +1144,7 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
       if (loop_vinfo)
        analyze_loop_vinfo (loop_vinfo);
 
+      memrefs.empty ();
       m_analyzed_vinfo = true;
     }
 
@@ -1092,11 +1157,12 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
         as one iteration of the VLA loop.  */
       if (where == vect_body && m_unrolled_vls_niters)
        m_unrolled_vls_stmts += count * m_unrolled_vls_niters;
-
-      if (vectype)
-       stmt_cost = adjust_stmt_cost (kind, vectype, stmt_cost);
     }
 
+  if (vectype)
+    stmt_cost = adjust_stmt_cost (kind, loop_vinfo, stmt_info, node, vectype,
+                                 stmt_cost);
+
   return record_stmt_cost (stmt_info, where, count * stmt_cost);
 }
 
diff --git a/gcc/config/riscv/riscv-vector-costs.h b/gcc/config/riscv/riscv-vector-costs.h
index 4e2bbfd5ca9..ca0ef1199b2 100644
--- a/gcc/config/riscv/riscv-vector-costs.h
+++ b/gcc/config/riscv/riscv-vector-costs.h
@@ -85,6 +85,12 @@ private:
   unsigned HOST_WIDE_INT m_unrolled_vls_niters = 0;
   unsigned HOST_WIDE_INT m_unrolled_vls_stmts = 0;
 
+  tree cst0 = build_int_cst (integer_type_node, 0);
+
+  /* Store the memory references already processed.  */
+  typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
+  hash_set <tree_pair_hash> memrefs;
+
   void analyze_loop_vinfo (loop_vec_info);
   void record_potential_vls_unrolling (loop_vec_info);
   bool prefer_unrolled_loop () const;
@@ -98,6 +104,10 @@ private:
   void record_potential_unexpected_spills (loop_vec_info);
 
   void adjust_vect_cost_per_loop (loop_vec_info);
+  unsigned adjust_stmt_cost (enum vect_cost_for_stmt kind,
+                            loop_vec_info,
+                            stmt_vec_info stmt_info, slp_tree,
+                            tree vectype, int stmt_cost);
 };
 
 } // namespace riscv_vector
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vse-slp-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vse-slp-1.c
new file mode 100644
index 00000000000..530146a6d31
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vse-slp-1.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fdump-tree-slp1-details" } */
+
+#define f1 (1.0 / 3.0)
+#define f2 (1.0 / 18.0)
+#define f3 (1.0 / 36.0)
+
+#define SIZE_X 10
+#define SIZE_Y 10
+#define SIZE_Z 10
+
+typedef enum {C = 0,
+              N, S, E, W, T, B,
+              NE, NW, SE, SW,
+              NT, NB, ST, SB,
+              ET, EB, WT, WB,
+              FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;
+
+#define CALC_INDEX(x,y,z,e) ((e)+N_CELL_ENTRIES*((x)+ \
+                             (y)*SIZE_X+(z)*SIZE_X*SIZE_Y))
+#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX(dx, dy, dz, e)+(i)])
+#define LOCAL(g,e)       (GRID_ENTRY_SWEEP (g, 0, 0, 0, e))
+
+void foo (double *grid)
+{
+    for( int i = CALC_INDEX(0, 0, -2, 0); \
+       i < CALC_INDEX(0, 0, SIZE_Z + 2, 0); \
+       i += N_CELL_ENTRIES ) {
+       LOCAL (grid, C ) = f1;
+       LOCAL (grid, N ) = f2;
+       LOCAL (grid, S ) = f2;
+       LOCAL (grid, E ) = f2;
+       LOCAL (grid, W ) = f2;
+       LOCAL (grid, T ) = f2;
+       LOCAL (grid, B ) = f2;
+       LOCAL (grid, NE) = f3;
+       LOCAL (grid, NW) = f3;
+       LOCAL (grid, SE) = f3;
+       LOCAL (grid, SW) = f3;
+       LOCAL (grid, NT) = f3;
+       LOCAL (grid, NB) = f3;
+       LOCAL (grid, ST) = f3;
+       LOCAL (grid, SB) = f3;
+       LOCAL (grid, ET) = f3;
+       LOCAL (grid, EB) = f3;
+       LOCAL (grid, WT) = f3;
+       LOCAL (grid, WB) = f3;
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized using SLP" 0 "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vse-slp-2.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vse-slp-2.c
new file mode 100644
index 00000000000..7650a0e40fc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vse-slp-2.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fdump-tree-slp1-details" } */
+
+#define f1 3
+#define f2 4
+#define f3 5
+
+#define SIZE_X 10
+#define SIZE_Y 10
+#define SIZE_Z 10
+
+typedef enum {C = 0,
+              N, S, E, W, T, B,
+              NE, NW, SE, SW,
+              NT, NB, ST, SB,
+              ET, EB, WT, WB,
+              FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;
+
+#define CALC_INDEX(x,y,z,e) ((e)+N_CELL_ENTRIES*((x)+ \
+                             (y)*SIZE_X+(z)*SIZE_X*SIZE_Y))
+#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX(dx, dy, dz, e)+(i)])
+#define LOCAL(g,e)       (GRID_ENTRY_SWEEP (g, 0, 0, 0, e))
+
+void foo (unsigned long *grid)
+{
+    for( int i = CALC_INDEX(0, 0, -2, 0); \
+       i < CALC_INDEX(0, 0, SIZE_Z + 2, 0); \
+       i += N_CELL_ENTRIES ) {
+       LOCAL (grid, C ) = f1;
+       LOCAL (grid, N ) = f2;
+       LOCAL (grid, S ) = f2;
+       LOCAL (grid, E ) = f2;
+       LOCAL (grid, W ) = f2;
+       LOCAL (grid, T ) = f2;
+       LOCAL (grid, B ) = f2;
+       LOCAL (grid, NE) = f3;
+       LOCAL (grid, NW) = f3;
+       LOCAL (grid, SE) = f3;
+       LOCAL (grid, SW) = f3;
+       LOCAL (grid, NT) = f3;
+       LOCAL (grid, NB) = f3;
+       LOCAL (grid, ST) = f3;
+       LOCAL (grid, SB) = f3;
+       LOCAL (grid, ET) = f3;
+       LOCAL (grid, EB) = f3;
+       LOCAL (grid, WT) = f3;
+       LOCAL (grid, WB) = f3;
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized using SLP" 0 "slp1" } } */
