> +  /* Segment load/store permute cost.  */
> +  const int segment_permute_2;
> +  const int segment_permute_4;
> +  const int segment_permute_8;
> 
> Why do we only have 2/4/8?  I think we should have 2/3/4/5/6/7/8.

No idea why I posted that (wrong) version; I had used it for some
local testing.  Attached is the proper version, still called v3...

Regards
 Robin

Subject: [PATCH v3] RISC-V: Add initial cost handling for segment loads/stores.

This patch makes segment loads and stores more expensive.  It adds
the cost fields segment_permute_2 through segment_permute_8 to the
common vector costs and handles them in adjust_stmt_cost.
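
As an illustration (not part of the patch; the names are arbitrary),
a loop like the following can be vectorized with a group size of 2,
i.e. a segment load such as vlseg2e32.v that deinterleaves the even
and odd elements into two vectors:

  void
  f (int *restrict a, int *restrict b, int *src, int n)
  {
    for (int i = 0; i < n; i++)
      {
        a[i] = src[2 * i];
        b[i] = src[2 * i + 1];
      }
  }

The vectorizer emits one vector load statement per vector in the
group, and with this patch each of them is charged an additional
segment_permute_2.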

gcc/ChangeLog:

        * config/riscv/riscv-protos.h (struct common_vector_cost): Add
        segment_permute_2 to segment_permute_8 cost fields.
        * config/riscv/riscv-vector-costs.cc (costs::adjust_stmt_cost):
        Handle segment loads/stores.
        * config/riscv/riscv.cc (rvv_vls_vector_cost, rvv_vla_vector_cost):
        Initialize segment_permute_[2-8] to 1.
---
 gcc/config/riscv/riscv-protos.h        |   9 ++
 gcc/config/riscv/riscv-vector-costs.cc | 163 ++++++++++++++++++-------
 gcc/config/riscv/riscv.cc              |  14 +++
 3 files changed, 144 insertions(+), 42 deletions(-)

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 80efdf2b7e5..90d1fcbb3b1 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -218,6 +218,15 @@ struct common_vector_cost
   const int gather_load_cost;
   const int scatter_store_cost;
 
+  /* Segment load/store permute cost.  */
+  const int segment_permute_2;
+  const int segment_permute_3;
+  const int segment_permute_4;
+  const int segment_permute_5;
+  const int segment_permute_6;
+  const int segment_permute_7;
+  const int segment_permute_8;
+
   /* Cost of a vector-to-scalar operation.  */
   const int vec_to_scalar_cost;
 
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index adf9c197df5..f4da213fe14 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1043,6 +1043,25 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
   return vector_costs::better_main_loop_than_p (other);
 }
 
+/* Return the group size, i.e. the number of vectors to be loaded or
+   stored by a segmented load/store instruction.  Return 0 if it is
+   not a segmented load/store.  */
+static int
+segment_loadstore_group_size (enum vect_cost_for_stmt kind,
+                             stmt_vec_info stmt_info)
+{
+  if (stmt_info
+      && (kind == vector_load || kind == vector_store)
+      && STMT_VINFO_DATA_REF (stmt_info))
+    {
+      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+      if (stmt_info
+         && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
+       return DR_GROUP_SIZE (stmt_info);
+    }
+  return 0;
+}
+
 /* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
    For some statement, we would like to further fine-grain tweak the cost on
    top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1067,55 +1086,115 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
     case vector_load:
     case vector_store:
        {
-         /* Unit-stride vector loads and stores do not have offset addressing
-            as opposed to scalar loads and stores.
-            If the address depends on a variable we need an additional
-            add/sub for each load/store in the worst case.  */
-         if (stmt_info && stmt_info->stmt)
+         if (stmt_info && stmt_info->stmt && STMT_VINFO_DATA_REF (stmt_info))
            {
-             data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
-             class loop *father = stmt_info->stmt->bb->loop_father;
-             if (!loop && father && !father->inner && father->superloops)
+             /* Segment loads and stores.  When the group size is > 1
+                the vectorizer will add a vector load/store statement for
+                each vector in the group.  Here we additionally add a
+                permute cost for each of them.  */
+             /* TODO: Indexed and ordered/unordered cost.  */
+             int group_size = segment_loadstore_group_size (kind, stmt_info);
+             if (group_size > 1)
+               {
+                 switch (group_size)
+                   {
+                   case 2:
+                     if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+                       stmt_cost += costs->vla->segment_permute_2;
+                     else
+                       stmt_cost += costs->vls->segment_permute_2;
+                     break;
+                   case 3:
+                     if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+                       stmt_cost += costs->vla->segment_permute_3;
+                     else
+                       stmt_cost += costs->vls->segment_permute_3;
+                     break;
+                   case 4:
+                     if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+                       stmt_cost += costs->vla->segment_permute_4;
+                     else
+                       stmt_cost += costs->vls->segment_permute_4;
+                     break;
+                   case 5:
+                     if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+                       stmt_cost += costs->vla->segment_permute_5;
+                     else
+                       stmt_cost += costs->vls->segment_permute_5;
+                     break;
+                   case 6:
+                     if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+                       stmt_cost += costs->vla->segment_permute_6;
+                     else
+                       stmt_cost += costs->vls->segment_permute_6;
+                     break;
+                   case 7:
+                     if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+                       stmt_cost += costs->vla->segment_permute_7;
+                     else
+                       stmt_cost += costs->vls->segment_permute_7;
+                     break;
+                   case 8:
+                     if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+                       stmt_cost += costs->vla->segment_permute_8;
+                     else
+                       stmt_cost += costs->vls->segment_permute_8;
+                     break;
+                   default:
+                     break;
+                   }
+               }
+             else
                {
-                 tree ref;
-                 if (TREE_CODE (dr->ref) != MEM_REF
-                     || !(ref = TREE_OPERAND (dr->ref, 0))
-                     || TREE_CODE (ref) != SSA_NAME)
-                   break;
+                 /* Unit-stride vector loads and stores do not have offset
+                    addressing as opposed to scalar loads and stores.
+                    If the address depends on a variable we need an additional
+                    add/sub for each load/store in the worst case.  */
+                 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+                 class loop *father = stmt_info->stmt->bb->loop_father;
+                 if (!loop && father && !father->inner && father->superloops)
+                   {
+                     tree ref;
+                     if (TREE_CODE (dr->ref) != MEM_REF
+                         || !(ref = TREE_OPERAND (dr->ref, 0))
+                         || TREE_CODE (ref) != SSA_NAME)
+                       break;
 
-                 if (SSA_NAME_IS_DEFAULT_DEF (ref))
-                   break;
+                     if (SSA_NAME_IS_DEFAULT_DEF (ref))
+                       break;
 
-                 if (memrefs.contains ({ref, cst0}))
-                   break;
+                     if (memrefs.contains ({ref, cst0}))
+                       break;
 
-                 memrefs.add ({ref, cst0});
+                     memrefs.add ({ref, cst0});
 
-                 /* In case we have not seen REF before and the base address
-                    is a pointer operation try a bit harder.  */
-                 tree base = DR_BASE_ADDRESS (dr);
-                 if (TREE_CODE (base) == POINTER_PLUS_EXPR
-                     || TREE_CODE (base) == POINTER_DIFF_EXPR)
-                   {
-                     /* Deconstruct BASE's first operand.  If it is a binary
-                        operation, i.e. a base and an "offset" store this
-                        pair.  Only increase the stmt_cost if we haven't seen
-                        it before.  */
-                     tree argp = TREE_OPERAND (base, 1);
-                     typedef std::pair<tree, tree> addr_pair;
-                     addr_pair pair;
-                     if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
+                     /* In case we have not seen REF before and the base
+                        address is a pointer operation try a bit harder.  */
+                     tree base = DR_BASE_ADDRESS (dr);
+                     if (TREE_CODE (base) == POINTER_PLUS_EXPR
+                         || TREE_CODE (base) == POINTER_DIFF_EXPR)
                        {
-                         tree argp0 = tree_strip_nop_conversions
-                           (TREE_OPERAND (argp, 0));
-                         tree argp1 = TREE_OPERAND (argp, 1);
-                         pair = addr_pair (argp0, argp1);
-                         if (memrefs.contains (pair))
-                           break;
-
-                         memrefs.add (pair);
-                         stmt_cost += builtin_vectorization_cost (scalar_stmt,
-                                                                  NULL_TREE, 0);
+                         /* Deconstruct BASE's first operand.  If it is a
+                            binary operation, i.e. a base and an "offset"
+                            store this pair.  Only increase the stmt_cost if
+                            we haven't seen it before.  */
+                         tree argp = TREE_OPERAND (base, 1);
+                         typedef std::pair<tree, tree> addr_pair;
+                         addr_pair pair;
+                         if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
+                           {
+                             tree argp0 = tree_strip_nop_conversions
+                               (TREE_OPERAND (argp, 0));
+                             tree argp1 = TREE_OPERAND (argp, 1);
+                             pair = addr_pair (argp0, argp1);
+                             if (memrefs.contains (pair))
+                               break;
+
+                             memrefs.add (pair);
+                             stmt_cost
+                               += builtin_vectorization_cost (scalar_stmt,
+                                                              NULL_TREE, 0);
+                           }
                        }
                    }
                }
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 5e984ee2a55..874a42873d7 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -365,6 +365,13 @@ static const common_vector_cost rvv_vls_vector_cost = {
   1, /* fp_stmt_cost  */
   1, /* gather_load_cost  */
   1, /* scatter_store_cost  */
+  1, /* segment_permute (2) */
+  1, /* segment_permute (3) */
+  1, /* segment_permute (4) */
+  1, /* segment_permute (5) */
+  1, /* segment_permute (6) */
+  1, /* segment_permute (7) */
+  1, /* segment_permute (8) */
   1, /* vec_to_scalar_cost  */
   1, /* scalar_to_vec_cost  */
   1, /* permute_cost  */
@@ -381,6 +388,13 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
     1, /* fp_stmt_cost  */
     1, /* gather_load_cost  */
     1, /* scatter_store_cost  */
+    1, /* segment_permute (2) */
+    1, /* segment_permute (3) */
+    1, /* segment_permute (4) */
+    1, /* segment_permute (5) */
+    1, /* segment_permute (6) */
+    1, /* segment_permute (7) */
+    1, /* segment_permute (8) */
     1, /* vec_to_scalar_cost  */
     1, /* scalar_to_vec_cost  */
     1, /* permute_cost  */
-- 
2.43.2
