This patch adds two new files to support the vector cost model and modifies the Makefile fragment to build the cost model C++ file. Because of its large size, the patch is provided as an attachment.
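
For context on how this is intended to be used: the new riscv_vector_costs class plugs into the vectorizer through the create_costs target hook. A minimal sketch of the wiring follows; the hook definition itself is not part of this patch, and the function name is illustrative only:

/* Illustrative wiring only; not part of this patch.  */
static vector_costs *
riscv_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
{
  /* riscv_vector_costs is defined in riscv-vector-cost.h below.  */
  return new riscv_vector_costs (vinfo, costing_for_scalar);
}

#undef TARGET_VECTORIZE_CREATE_COSTS
#define TARGET_VECTORIZE_CREATE_COSTS riscv_vectorize_create_costs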

gcc/ChangeLog:

    * config.gcc: Add riscv-vector-cost.o to extra_objs for RISC-V
    targets.
    * config/riscv/riscv-vector-cost.cc: New file for the RISC-V vector
    cost model.
    * config/riscv/riscv-vector-cost.h: New header for the RISC-V vector
    cost model.
    * config/riscv/t-riscv: Add make rule for riscv-vector-cost.o.

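As a note for reviewers, the new header defines per-insn scale tables that feed the cost classes. A hypothetical generic tuning could instantiate them as below; the table name and all values are placeholders for illustration and are not part of this patch:

/* Placeholder scale factors for a hypothetical generic tuning.  */
static const vector_insn_scale_table generic_vector_insn_scale_table = {
  1, /* load */
  1, /* store */
  1, /* alu */
  2, /* mult */
  1, /* mov */
  2, /* dup */
  2, /* extract */
  2, /* if_then_else */
};

static const vector_insn_cost_table
  generic_vector_cost_table (&generic_vector_insn_scale_table);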

From eb995818cd5f77f85e8df93b690b00ce1fd1aa35 Mon Sep 17 00:00:00 2001
From: Michael Collison <colli...@rivosinc.com>
Date: Thu, 2 Mar 2023 12:27:36 -0500
Subject: [PATCH] Autovectorization patch set 2

---
 gcc/config.gcc                        |   2 +-
 gcc/config/riscv/riscv-vector-cost.cc | 620 ++++++++++++++++++++++++++
 gcc/config/riscv/riscv-vector-cost.h  | 400 +++++++++++++++++
 gcc/config/riscv/t-riscv              |   5 +
 4 files changed, 1026 insertions(+), 1 deletion(-)
 create mode 100644 gcc/config/riscv/riscv-vector-cost.cc
 create mode 100644 gcc/config/riscv/riscv-vector-cost.h

diff --git a/gcc/config.gcc b/gcc/config.gcc
index c070e6ecd2e..a4017777187 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -530,7 +530,7 @@ pru-*-*)
 riscv*)
 	cpu_type=riscv
 	extra_objs="riscv-builtins.o riscv-c.o riscv-sr.o riscv-shorten-memrefs.o riscv-selftests.o riscv-v.o riscv-vsetvl.o"
-	extra_objs="${extra_objs} riscv-vector-builtins.o riscv-vector-builtins-shapes.o riscv-vector-builtins-bases.o"
+	extra_objs="${extra_objs} riscv-vector-cost.o riscv-vector-builtins.o riscv-vector-builtins-shapes.o riscv-vector-builtins-bases.o"
 	d_target_objs="riscv-d.o"
 	extra_headers="riscv_vector.h"
 	target_gtfiles="$target_gtfiles \$(srcdir)/config/riscv/riscv-vector-builtins.cc"
diff --git a/gcc/config/riscv/riscv-vector-cost.cc b/gcc/config/riscv/riscv-vector-cost.cc
new file mode 100644
index 00000000000..5a33b20843a
--- /dev/null
+++ b/gcc/config/riscv/riscv-vector-cost.cc
@@ -0,0 +1,620 @@
+/* Cost model implementation for RISC-V 'V' Extension for GNU compiler.
+   Copyright (C) 2022-2023 Free Software Foundation, Inc.
+   Contributed by Juzhe Zhong (juzhe.zh...@rivai.ai), RiVAI Technologies Ltd.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#define INCLUDE_STRING
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "backend.h"
+#include "rtl.h"
+#include "regs.h"
+#include "insn-config.h"
+#include "insn-attr.h"
+#include "recog.h"
+#include "rtlanal.h"
+#include "output.h"
+#include "alias.h"
+#include "tree.h"
+#include "stringpool.h"
+#include "attribs.h"
+#include "varasm.h"
+#include "stor-layout.h"
+#include "calls.h"
+#include "function.h"
+#include "explow.h"
+#include "memmodel.h"
+#include "emit-rtl.h"
+#include "reload.h"
+#include "tm_p.h"
+#include "target.h"
+#include "basic-block.h"
+#include "expr.h"
+#include "optabs.h"
+#include "bitmap.h"
+#include "df.h"
+#include "diagnostic.h"
+#include "builtins.h"
+#include "predict.h"
+#include "tree-pass.h"
+#include "opts.h"
+#include "langhooks.h"
+#include "rtl-iter.h"
+#include "gimple.h"
+#include "cfghooks.h"
+#include "cfgloop.h"
+#include "fold-const.h"
+#include "gimple-iterator.h"
+#include "tree-vectorizer.h"
+#include "tree-ssa-loop-niter.h"
+#include "riscv-vector-builtins.h"
+
+/* This file should be included last.  */
+#include "riscv-vector-cost.h"
+#include "target-def.h"
+
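+/* Compute into *COST the cost of vector rtx X of mode MODE, using the
+   per-insn scale tables.  Return true if *COST is the complete cost,
+   false if the caller should also recurse into the operands.  SPEED
+   is true when optimizing for speed rather than size.  */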
+bool vector_insn_cost_table::get_cost(rtx x, machine_mode mode, int *cost,
+                                      bool speed) const {
+  rtx op0, op1, op2;
+  enum rtx_code code = GET_CODE(x);
+  scalar_int_mode int_mode;
+
+  /* By default, assume that everything has equivalent cost to the
+     cheapest instruction.  Any additional costs are applied as a delta
+     above this default.  */
+  *cost = COSTS_N_INSNS(1);
+
+  switch (code) {
+  case SET:
+    /* The cost depends entirely on the operands to SET.  */
+    *cost = 0;
+    op0 = SET_DEST(x);
+    op1 = SET_SRC(x);
+
+    switch (GET_CODE(op0)) {
+    case MEM:
+      if (speed) {
+        *cost += store->cost(x, mode);
+      }
+
+      return true;
+
+    case SUBREG:
+      if (!REG_P(SUBREG_REG(op0)))
+        *cost += rtx_cost(SUBREG_REG(op0), VOIDmode, SET, 0, speed);
+
+      /* Fall through.  */
+    case REG:
+      /* The cost is one per vector-register copied.  */
+      if (VECTOR_MODE_P(GET_MODE(op0))) {
+        *cost = mov->cost(x, mode);
+      } else
+        /* Cost is just the cost of the RHS of the set.  */
+        *cost += rtx_cost(op1, mode, SET, 1, speed);
+      return true;
+
+    case ZERO_EXTRACT:
+    case SIGN_EXTRACT:
+      /* Bit-field insertion.  Strip any redundant widening of
+         the RHS to meet the width of the target.  */
+      if (SUBREG_P(op1))
+        op1 = SUBREG_REG(op1);
+      if ((GET_CODE(op1) == ZERO_EXTEND || GET_CODE(op1) == SIGN_EXTEND) &&
+          CONST_INT_P(XEXP(op0, 1)) &&
+          is_a<scalar_int_mode>(GET_MODE(XEXP(op1, 0)), &int_mode) &&
+          GET_MODE_BITSIZE(int_mode) >= INTVAL(XEXP(op0, 1)))
+        op1 = XEXP(op1, 0);
+
+      if (CONST_INT_P(op1)) {
+        /* MOV immediate is assumed to always be cheap.  */
+        *cost = COSTS_N_INSNS(1);
+      } else {
+        /* BFM.  */
+        if (speed)
+          *cost += alu->cost(x, mode);
+        *cost += rtx_cost(op1, VOIDmode, (enum rtx_code)code, 1, speed);
+      }
+
+      return true;
+
+    default:
+      /* We can't make sense of this, assume default cost.  */
+      *cost = COSTS_N_INSNS(1);
+      return false;
+    }
+    return false;
+
+  case MEM:
+    if (speed) {
+      *cost += load->cost(x, mode);
+    }
+
+    return true;
+
+  case NEG:
+    op0 = XEXP(x, 0);
+
+    if (GET_MODE_CLASS(mode) == MODE_INT) {
+      if (GET_RTX_CLASS(GET_CODE(op0)) == RTX_COMPARE ||
+          GET_RTX_CLASS(GET_CODE(op0)) == RTX_COMM_COMPARE) {
+        /* CSETM.  */
+        *cost += rtx_cost(XEXP(op0, 0), VOIDmode, NEG, 0, speed);
+        return true;
+      }
+
+      /* Cost this as SUB wzr, X.  */
+      op0 = CONST0_RTX(mode);
+      op1 = XEXP(x, 0);
+      goto cost_minus;
+    }
+
+    if (speed) {
+      /* FNEG.  */
+      *cost += alu->cost(x, mode);
+    }
+    return false;
+
+  case COMPARE:
+    op0 = XEXP(x, 0);
+    op1 = XEXP(x, 1);
+
+    if (op1 == const0_rtx && GET_CODE(op0) == AND) {
+      x = op0;
+      mode = GET_MODE(op0);
+      goto cost_logic;
+    }
+
+    if (GET_MODE_CLASS(GET_MODE(op0)) == MODE_INT) {
+      /* TODO: A write to the CC flags possibly costs extra, this
+	 needs encoding in the cost tables.  */
+
+      mode = GET_MODE(op0);
+      /* ANDS.  */
+      if (GET_CODE(op0) == AND) {
+        x = op0;
+        goto cost_logic;
+      }
+
+      if (GET_CODE(op0) == PLUS) {
+        /* ADDS (and CMN alias).  */
+        x = op0;
+        goto cost_plus;
+      }
+
+      if (GET_CODE(op0) == MINUS) {
+        /* SUBS.  */
+        x = op0;
+        goto cost_minus;
+      }
+
+      if (GET_CODE(op0) == ZERO_EXTRACT && op1 == const0_rtx &&
+          CONST_INT_P(XEXP(op0, 1)) && CONST_INT_P(XEXP(op0, 2))) {
+        /* COMPARE of ZERO_EXTRACT form of TST-immediate.
+	   Handle it here directly rather than going to cost_logic
+	   since we know the immediate generated for the TST is valid
+	   so we can avoid creating an intermediate rtx for it only
+	   for costing purposes.  */
+        if (speed)
+          *cost += alu->cost(x, mode);
+
+        *cost += rtx_cost(XEXP(op0, 0), GET_MODE(op0), ZERO_EXTRACT, 0, speed);
+        return true;
+      }
+
+      if (GET_CODE(op1) == NEG) {
+        /* CMN.  */
+        if (speed)
+          *cost += alu->cost(x, mode);
+
+        *cost += rtx_cost(op0, mode, COMPARE, 0, speed);
+        *cost += rtx_cost(XEXP(op1, 0), mode, NEG, 1, speed);
+        return true;
+      }
+
+      /* CMP.
+
+	 Compare can freely swap the order of operands, and
+         canonicalization puts the more complex operation first.
+         But the integer MINUS logic expects the shift/extend
+         operation in op1.  */
+      if (!(REG_P(op0) || (SUBREG_P(op0) && REG_P(SUBREG_REG(op0))))) {
+        op0 = XEXP(x, 1);
+        op1 = XEXP(x, 0);
+      }
+      goto cost_minus;
+    }
+
+    if (VECTOR_MODE_P(mode)) {
+      /* Vector compare.  */
+      if (speed)
+        *cost += alu->cost(x, mode);
+
+      return false;
+    }
+    return false;
+
+  case MINUS: {
+    op0 = XEXP(x, 0);
+    op1 = XEXP(x, 1);
+
+    cost_minus:
+    *cost += rtx_cost(op0, mode, MINUS, 0, speed);
+
+    return true;
+  }
+
+  case PLUS: {
+    op0 = XEXP(x, 0);
+    op1 = XEXP(x, 1);
+
+    cost_plus:
+    if (GET_RTX_CLASS(GET_CODE(op0)) == RTX_COMPARE ||
+        GET_RTX_CLASS(GET_CODE(op0)) == RTX_COMM_COMPARE) {
+      /* CSINC.  */
+      *cost += rtx_cost(XEXP(op0, 0), mode, PLUS, 0, speed);
+      *cost += rtx_cost(op1, mode, PLUS, 1, speed);
+      return true;
+    }
+
+    *cost += rtx_cost(op1, mode, PLUS, 1, speed);
+
+    return true;
+  }
+
+  case BSWAP:
+    *cost = COSTS_N_INSNS(1);
+
+    if (speed) {
+      *cost += alu->cost(x, mode);
+    }
+    return false;
+
+  case IOR:
+    *cost = COSTS_N_INSNS(1);
+
+    if (speed) {
+      *cost += alu->cost(x, mode);
+    }
+    return true;
+
+  case XOR:
+  case AND:
+  cost_logic:
+    if (speed)
+      *cost += alu->cost(x, mode);
+    return true;
+
+  case NOT:
+    *cost += alu->cost(x, mode);
+    return false;
+
+  case ZERO_EXTEND:
+
+    op0 = XEXP(x, 0);
+    /* If a value is written in SI mode, then zero extended to DI
+       mode, the operation will in general be free as a write to
+       a 'w' register implicitly zeroes the upper bits of an 'x'
+       register.  However, if this is
+
+       (set (reg) (zero_extend (reg)))
+
+       we must cost the explicit register move.  */
+    if (mode == DImode && GET_MODE(op0) == SImode) {
+      int op_cost = rtx_cost(op0, VOIDmode, ZERO_EXTEND, 0, speed);
+
+      /* If OP_COST is non-zero, then the cost of the zero extend
+         is effectively the cost of the inner operation.  Otherwise
+         we have a MOV instruction and we take the cost from the MOV
+         itself.  This is true independently of whether we are
+         optimizing for space or time.  */
+      if (op_cost)
+        *cost = op_cost;
+
+      return true;
+    } else if (MEM_P(op0)) {
+      /* All loads can zero extend to any size for free.  */
+      *cost = rtx_cost(op0, VOIDmode, ZERO_EXTEND, 0, speed);
+      return true;
+    }
+
+    if (speed) {
+      /* UMOV.  */
+      *cost += alu->cost(x, mode);
+    }
+    return false;
+
+  case SIGN_EXTEND:
+    if (MEM_P(XEXP(x, 0))) {
+      if (speed) {
+        *cost += load->cost(x, mode);
+      }
+      return true;
+    }
+
+    if (speed) {
+      *cost += alu->cost(x, mode);
+    }
+    return false;
+
+  case ASHIFT:
+    op0 = XEXP(x, 0);
+    op1 = XEXP(x, 1);
+
+    if (CONST_INT_P(op1)) {
+      if (speed) {
+        *cost += alu->cost(x, mode);
+      }
+
+      /* We can incorporate zero/sign extend for free.  */
+      if (GET_CODE(op0) == ZERO_EXTEND || GET_CODE(op0) == SIGN_EXTEND)
+        op0 = XEXP(op0, 0);
+
+      *cost += rtx_cost(op0, VOIDmode, ASHIFT, 0, speed);
+      return true;
+    } else {
+      if (speed)
+        /* Vector shift (register).  */
+        *cost += alu->cost(x, mode);
+      return false; /* All arguments need to be in registers.  */
+    }
+
+  case ROTATE:
+  case ROTATERT:
+  case LSHIFTRT:
+  case ASHIFTRT:
+    op0 = XEXP(x, 0);
+    op1 = XEXP(x, 1);
+
+    if (CONST_INT_P(op1)) {
+      /* ASR (immediate) and friends.  */
+      if (speed) {
+        *cost += alu->cost(x, mode);
+      }
+
+      *cost += rtx_cost(op0, mode, (enum rtx_code)code, 0, speed);
+      return true;
+    } else {
+      if (VECTOR_MODE_P(mode)) {
+        if (speed)
+          /* Vector shift (register).  */
+          *cost += alu->cost(x, mode);
+      }
+      return false; /* All arguments need to be in registers.  */
+    }
+
+  case SYMBOL_REF:
+    return true;
+
+  case HIGH:
+  case LO_SUM:
+    /* ADRP/ADD (immediate).  */
+    if (speed)
+      *cost += alu->cost(x, mode);
+    return true;
+
+  case ZERO_EXTRACT:
+  case SIGN_EXTRACT:
+    /* UBFX/SBFX.  */
+    if (speed) {
+      *cost += alu->cost(x, mode);
+    }
+
+    /* We can trust that the immediates used will be correct (there
+       are no by-register forms), so we need only cost op0.  */
+    *cost += rtx_cost(XEXP(x, 0), VOIDmode, (enum rtx_code)code, 0, speed);
+    return true;
+
+  case MULT:
+    *cost += mult->cost(x, mode);
+    return true;
+
+  case MOD:
+  case UMOD:
+    if (speed) {
+      /* Slightly prefer UMOD over SMOD.  */
+      *cost += alu->cost(x, mode);
+    }
+    return false; /* All arguments need to be in registers.  */
+
+  case DIV:
+  case UDIV:
+  case SQRT:
+    if (speed) {
+      *cost += alu->cost(x, mode);
+    }
+    return false; /* All arguments need to be in registers.  */
+
+  case IF_THEN_ELSE:
+    if (speed) {
+      *cost += if_then_else->cost(x, mode);
+    }
+    return true;
+
+  case EQ:
+  case NE:
+  case GT:
+  case GTU:
+  case LT:
+  case LTU:
+  case GE:
+  case GEU:
+  case LE:
+  case LEU:
+
+    return false; /* All arguments must be in registers.  */
+
+  case FMA:
+    op0 = XEXP(x, 0);
+    op1 = XEXP(x, 1);
+    op2 = XEXP(x, 2);
+
+    if (speed) {
+      *cost += alu->cost(x, mode);
+    }
+
+    /* FMSUB, FNMADD, and FNMSUB are free.  */
+    if (GET_CODE(op0) == NEG)
+      op0 = XEXP(op0, 0);
+
+    if (GET_CODE(op2) == NEG)
+      op2 = XEXP(op2, 0);
+
+    /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
+       and the by-element operand as operand 0.  */
+    if (GET_CODE(op1) == NEG)
+      op1 = XEXP(op1, 0);
+
+    /* Catch vector-by-element operations.  The by-element operand can
+       either be (vec_duplicate (vec_select (x))) or just
+       (vec_select (x)), depending on whether we are multiplying by
+       a vector or a scalar.
+
+       Canonicalization is not very good in these cases, FMA4 will put the
+       by-element operand as operand 0, FNMA4 will have it as operand 1.  */
+    if (GET_CODE(op0) == VEC_DUPLICATE)
+      op0 = XEXP(op0, 0);
+    else if (GET_CODE(op1) == VEC_DUPLICATE)
+      op1 = XEXP(op1, 0);
+
+    if (GET_CODE(op0) == VEC_SELECT)
+      op0 = XEXP(op0, 0);
+    else if (GET_CODE(op1) == VEC_SELECT)
+      op1 = XEXP(op1, 0);
+
+    /* If the remaining parameters are not registers,
+       get the cost to put them into registers.  */
+    *cost += rtx_cost(op0, mode, FMA, 0, speed);
+    *cost += rtx_cost(op1, mode, FMA, 1, speed);
+    *cost += rtx_cost(op2, mode, FMA, 2, speed);
+    return true;
+
+  case FLOAT:
+  case UNSIGNED_FLOAT:
+    return false;
+
+  case FLOAT_EXTEND:
+    if (speed) {
+      /* Vector extend.  */
+      *cost += alu->cost(x, mode);
+    }
+    return false;
+
+  case FLOAT_TRUNCATE:
+    if (speed) {
+      /* Vector truncate.  */
+      *cost += alu->cost(x, mode);
+    }
+    return false;
+
+  case FIX:
+  case UNSIGNED_FIX:
+    x = XEXP(x, 0);
+    if (speed) {
+      *cost += alu->cost(x, mode);
+    }
+
+    *cost += rtx_cost(x, VOIDmode, (enum rtx_code)code, 0, speed);
+    return true;
+
+  case ABS:
+    /* ABS (vector).  */
+    if (speed)
+      *cost += alu->cost(x, mode);
+    return false;
+
+  case SMAX:
+  case SMIN:
+    if (speed) {
+      *cost += alu->cost(x, mode);
+    }
+    return false;
+
+  case UNSPEC:
+    break;
+
+  case TRUNCATE:
+    break;
+  case CONST_VECTOR: {
+    *cost = mov->cost(x, mode);
+    break;
+  }
+  case VEC_CONCAT:
+    /* Depending on the operation, either DUP or INS.
+       For now, keep default costing.  */
+    break;
+  case VEC_DUPLICATE:
+    /* Load using a DUP.  */
+    *cost = dup->cost(x, mode);
+    return false;
+  case VEC_SELECT: {
+    rtx op0 = XEXP(x, 0);
+    *cost = rtx_cost(op0, GET_MODE(op0), VEC_SELECT, 0, speed);
+
+    /* A lowpart select is free; a highpart select costs a DUP, and
+       anything else costs an extract.  */
+    rtx op1 = XEXP(x, 1);
+    if (vec_series_lowpart_p(mode, GET_MODE(op1), op1))
+      ;
+    else if (vec_series_highpart_p(mode, GET_MODE(op1), op1))
+      *cost = dup->cost(x, mode);
+    else
+      *cost = extract->cost(x, mode);
+    return true;
+  }
+  default:
+    break;
+  }
+
+  if (dump_file)
+    fprintf(dump_file, "\nFailed to cost RTX.  Assuming default cost.\n");
+
+  return true;
+}
+
+extern int riscv_builtin_vectorization_cost (enum vect_cost_for_stmt, tree, int);
+
+riscv_vector_costs::riscv_vector_costs(vec_info *vinfo, bool costing_for_scalar)
+  : vector_costs(vinfo, costing_for_scalar) {}
+
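+/* Implement vector_costs::add_stmt_cost, deferring the per-statement
+   cost to the scalar hook riscv_builtin_vectorization_cost.  */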
+unsigned riscv_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
+					    stmt_vec_info stmt_info, slp_tree,
+					    tree vectype, int misalign,
+					    vect_cost_model_location where) {
+  int stmt_cost
+    = riscv_builtin_vectorization_cost (kind, vectype, misalign);
+  return record_stmt_cost(stmt_info, where, count * stmt_cost);
+}
+
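+/* Implement vector_costs::finish_cost.  */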
+void riscv_vector_costs::finish_cost(const vector_costs *uncast_scalar_costs) {
+  auto *scalar_costs =
+    static_cast<const riscv_vector_costs *>(uncast_scalar_costs);
+  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info>(m_vinfo);
+  if (loop_vinfo)
+    m_costs[vect_body] = 1;
+  vector_costs::finish_cost(scalar_costs);
+}
+
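+/* Implement vector_costs::better_main_loop_than_p.  */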
+bool riscv_vector_costs::better_main_loop_than_p(
+						 const vector_costs *uncast_other) const {
+  auto other = static_cast<const riscv_vector_costs *>(uncast_other);
+
+  return vector_costs::better_main_loop_than_p(other);
+}
diff --git a/gcc/config/riscv/riscv-vector-cost.h b/gcc/config/riscv/riscv-vector-cost.h
new file mode 100644
index 00000000000..ef398915a18
--- /dev/null
+++ b/gcc/config/riscv/riscv-vector-cost.h
@@ -0,0 +1,400 @@
+/* Cost model definitions for RISC-V 'V' Extension for GNU compiler.
+   Copyright (C) 2022-2023 Free Software Foundation, Inc.
+   Contributed by Juzhe Zhong (juzhe.zh...@rivai.ai), RiVAI Technologies Ltd.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_RISCV_VECTOR_COST_H
+#define GCC_RISCV_VECTOR_COST_H
+
+enum vector_tune_type {
+  VECTOR_TUNE_GENERIC,
+};
+
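+/* Scale factors applied to the base per-insn rtx cost, one entry per
+   cost class defined below.  */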
+struct vector_insn_scale_table {
+  const int load;
+  const int store;
+  const int alu;
+  const int mult;
+  const int mov;
+  const int dup;
+  const int extract;
+  const int if_then_else;
+};
+
+struct vector_stmt_scale_table {
+  const int scalar_int_stmt_cost;       /* Cost of any int scalar operation,
+                                         excluding load and store.  */
+  const int scalar_fp_stmt_cost;        /* Cost of any fp scalar operation,
+                                         excluding load and store.  */
+  const int scalar_load_cost;           /* Cost of scalar load.  */
+  const int scalar_store_cost;          /* Cost of scalar store.  */
+  const int vec_int_stmt_cost;          /* Cost of any int vector operation,
+                                         excluding load, store, permute,
+                                         vector-to-scalar and
+                                         scalar-to-vector operation.  */
+  const int vec_fp_stmt_cost;           /* Cost of any fp vector operation,
+                                         excluding load, store, permute,
+                                         vector-to-scalar and
+                                         scalar-to-vector operation.  */
+  const int vec_permute_cost;           /* Cost of permute operation.  */
+  const int vec_to_scalar_cost;         /* Cost of vec-to-scalar operation.  */
+  const int scalar_to_vec_cost;         /* Cost of scalar-to-vector
+                                         operation.  */
+  const int vec_align_load_cost;        /* Cost of aligned vector load.  */
+  const int vec_unalign_load_cost;      /* Cost of unaligned vector load.  */
+  const int vec_unalign_store_cost;     /* Cost of unaligned vector store.  */
+  const int vec_store_cost;             /* Cost of vector store.  */
+  const int cond_taken_branch_cost;     /* Cost of taken branch.  */
+  const int cond_not_taken_branch_cost; /* Cost of not taken branch.  */
+};
+
+/* Information about vector code that we're in the process of costing.  */
+class riscv_vector_costs : public vector_costs {
+public:
+  riscv_vector_costs(vec_info *, bool);
+
+  unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
+			      stmt_vec_info stmt_info, slp_tree, tree vectype,
+			      int misalign,
+			      vect_cost_model_location where) override;
+  void finish_cost(const vector_costs *) override;
+  bool better_main_loop_than_p(const vector_costs *other) const override;
+};
+
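+/* Base class for the per-insn cost helpers.  cost () charges one unit
+   per vector register touched (segment fields times the LMUL register
+   group size), scaled by the tune-table factor returned by scale ().  */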
+template <typename T> class vector_insn_cost {
+public:
+  vector_insn_cost(const T *_scale_table) : m_scale_table(_scale_table) {}
+  virtual ~vector_insn_cost() {}
+
+  virtual int scale(RTX_CODE) const { return 1; }
+
+  virtual unsigned cost(rtx x, machine_mode mode) const {
+    return riscv_vector::riscv_classify_nf(mode)
+           * riscv_vector::riscv_vlmul_regsize(mode)
+           * scale(x == NULL_RTX ? UNKNOWN : GET_CODE(x));
+  }
+
+protected:
+  const T *m_scale_table;
+};
+
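+/* Base class for cost tables.  get_cost () fills in a cost for X and
+   returns whether that cost is complete, following the rtx_costs
+   convention.  */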
+template <typename T> class vector_cost_table {
+public:
+  vector_cost_table(const T *) {}
+  virtual ~vector_cost_table() {}
+
+  virtual bool get_cost(rtx, machine_mode, int *, bool) const { return true; }
+};
+
+class vector_alu_cost : public vector_insn_cost<vector_insn_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override { return m_scale_table->alu; }
+};
+
+class vector_load_cost : public vector_insn_cost<vector_insn_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override { return m_scale_table->load; }
+};
+
+class vector_store_cost : public vector_insn_cost<vector_insn_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override { return m_scale_table->store; }
+};
+
+class vector_mult_cost : public vector_insn_cost<vector_insn_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override { return m_scale_table->mult; }
+};
+
+class vector_mov_cost : public vector_insn_cost<vector_insn_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override { return m_scale_table->mov; }
+};
+
+class vector_dup_cost : public vector_insn_cost<vector_insn_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override { return m_scale_table->dup; }
+};
+
+class vector_extract_cost : public vector_insn_cost<vector_insn_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override { return m_scale_table->extract; }
+};
+
+class vector_if_then_else_cost
+    : public vector_insn_cost<vector_insn_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override {
+    return m_scale_table->if_then_else;
+  }
+};
+
+class vector_insn_cost_table
+    : public vector_cost_table<vector_insn_scale_table> {
+public:
+  vector_insn_cost_table(const vector_insn_scale_table *_scale_table)
+      : vector_cost_table(_scale_table) {
+    load = new vector_load_cost(_scale_table);
+    store = new vector_store_cost(_scale_table);
+    alu = new vector_alu_cost(_scale_table);
+    mult = new vector_mult_cost(_scale_table);
+    mov = new vector_mov_cost(_scale_table);
+    dup = new vector_dup_cost(_scale_table);
+    extract = new vector_extract_cost(_scale_table);
+    if_then_else = new vector_if_then_else_cost(_scale_table);
+  }
+
+  bool get_cost(rtx, machine_mode, int *, bool) const override;
+
+public:
+  const vector_insn_cost<vector_insn_scale_table> *load;
+  const vector_insn_cost<vector_insn_scale_table> *store;
+  const vector_insn_cost<vector_insn_scale_table> *alu;
+  const vector_insn_cost<vector_insn_scale_table> *mult;
+  const vector_insn_cost<vector_insn_scale_table> *mov;
+  const vector_insn_cost<vector_insn_scale_table> *dup;
+  const vector_insn_cost<vector_insn_scale_table> *extract;
+  const vector_insn_cost<vector_insn_scale_table> *if_then_else;
+};
+
+// ==================== Vector stmt cost ====================
+class vector_scalar_int_cost
+    : public vector_insn_cost<vector_stmt_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override {
+    return m_scale_table->scalar_int_stmt_cost;
+  }
+};
+
+class vector_scalar_fp_cost : public vector_insn_cost<vector_stmt_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override {
+    return m_scale_table->scalar_fp_stmt_cost;
+  }
+};
+
+class vector_scalar_load_cost
+    : public vector_insn_cost<vector_stmt_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override {
+    return m_scale_table->scalar_load_cost;
+  }
+};
+
+class vector_scalar_store_cost
+    : public vector_insn_cost<vector_stmt_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override {
+    return m_scale_table->scalar_store_cost;
+  }
+};
+
+class vector_vec_int_cost : public vector_insn_cost<vector_stmt_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override {
+    return m_scale_table->vec_int_stmt_cost;
+  }
+};
+
+class vector_vec_fp_cost : public vector_insn_cost<vector_stmt_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override {
+    return m_scale_table->vec_fp_stmt_cost;
+  }
+};
+
+class vector_vec_permute_cost
+    : public vector_insn_cost<vector_stmt_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override {
+    return m_scale_table->vec_permute_cost;
+  }
+};
+
+class vector_vec_to_scalar_cost
+    : public vector_insn_cost<vector_stmt_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override {
+    return m_scale_table->vec_to_scalar_cost;
+  }
+};
+
+class vector_scalar_to_vec_cost
+    : public vector_insn_cost<vector_stmt_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override {
+    return m_scale_table->scalar_to_vec_cost;
+  }
+};
+
+class vector_vec_align_load_cost
+    : public vector_insn_cost<vector_stmt_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override {
+    return m_scale_table->vec_align_load_cost;
+  }
+};
+
+class vector_vec_unalign_load_cost
+    : public vector_insn_cost<vector_stmt_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override {
+    return m_scale_table->vec_unalign_load_cost;
+  }
+};
+
+class vector_vec_unalign_store_cost
+    : public vector_insn_cost<vector_stmt_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override {
+    return m_scale_table->vec_unalign_store_cost;
+  }
+};
+
+class vector_vec_store_cost : public vector_insn_cost<vector_stmt_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override {
+    return m_scale_table->vec_store_cost;
+  }
+};
+
+class vector_cond_taken_branch_cost
+    : public vector_insn_cost<vector_stmt_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override {
+    return m_scale_table->cond_taken_branch_cost;
+  }
+};
+
+class vector_cond_not_taken_branch_cost
+    : public vector_insn_cost<vector_stmt_scale_table> {
+public:
+  // Inherit the constructors from vector_insn_cost.
+  using vector_insn_cost::vector_insn_cost;
+
+  int scale(RTX_CODE) const override {
+    return m_scale_table->cond_not_taken_branch_cost;
+  }
+};
+
+class vector_stmt_cost_table
+    : public vector_cost_table<vector_stmt_scale_table> {
+public:
+  vector_stmt_cost_table(const vector_stmt_scale_table *_scale_table)
+      : vector_cost_table(_scale_table) {
+    scalar_int = new vector_scalar_int_cost(_scale_table);
+    scalar_fp = new vector_scalar_fp_cost(_scale_table);
+    scalar_load = new vector_scalar_load_cost(_scale_table);
+    scalar_store = new vector_scalar_store_cost(_scale_table);
+    vec_int = new vector_vec_int_cost(_scale_table);
+    vec_fp = new vector_vec_fp_cost(_scale_table);
+    vec_permute = new vector_vec_permute_cost(_scale_table);
+    vec_to_scalar = new vector_vec_to_scalar_cost(_scale_table);
+    scalar_to_vec = new vector_scalar_to_vec_cost(_scale_table);
+    vec_align_load = new vector_vec_align_load_cost(_scale_table);
+    vec_unalign_load = new vector_vec_unalign_load_cost(_scale_table);
+    vec_unalign_store = new vector_vec_unalign_store_cost(_scale_table);
+    vec_store = new vector_vec_store_cost(_scale_table);
+    cond_taken_branch = new vector_cond_taken_branch_cost(_scale_table);
+    cond_not_taken_branch = new vector_cond_not_taken_branch_cost(_scale_table);
+  }
+
+public:
+  const vector_insn_cost<vector_stmt_scale_table> *scalar_int;
+  const vector_insn_cost<vector_stmt_scale_table> *scalar_fp;
+  const vector_insn_cost<vector_stmt_scale_table> *scalar_load;
+  const vector_insn_cost<vector_stmt_scale_table> *scalar_store;
+  const vector_insn_cost<vector_stmt_scale_table> *vec_int;
+  const vector_insn_cost<vector_stmt_scale_table> *vec_fp;
+  const vector_insn_cost<vector_stmt_scale_table> *vec_permute;
+  const vector_insn_cost<vector_stmt_scale_table> *vec_to_scalar;
+  const vector_insn_cost<vector_stmt_scale_table> *scalar_to_vec;
+  const vector_insn_cost<vector_stmt_scale_table> *vec_align_load;
+  const vector_insn_cost<vector_stmt_scale_table> *vec_unalign_load;
+  const vector_insn_cost<vector_stmt_scale_table> *vec_unalign_store;
+  const vector_insn_cost<vector_stmt_scale_table> *vec_store;
+  const vector_insn_cost<vector_stmt_scale_table> *cond_taken_branch;
+  const vector_insn_cost<vector_stmt_scale_table> *cond_not_taken_branch;
+};
+
+#endif // GCC_RISCV_VECTOR_COST_H
diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv
index d30e0235356..095169741bb 100644
--- a/gcc/config/riscv/t-riscv
+++ b/gcc/config/riscv/t-riscv
@@ -51,6 +51,11 @@ riscv-c.o: $(srcdir)/config/riscv/riscv-c.cc $(CONFIG_H) $(SYSTEM_H) \
 	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
 		$(srcdir)/config/riscv/riscv-c.cc
 
+riscv-vector-cost.o: $(srcdir)/config/riscv/riscv-vector-cost.cc $(CONFIG_H) $(SYSTEM_H) \
+    coretypes.h $(TM_H) $(TREE_H) output.h $(C_COMMON_H) $(TARGET_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/riscv/riscv-vector-cost.cc
+
 riscv-vsetvl.o: $(srcdir)/config/riscv/riscv-vsetvl.cc \
   $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) $(REGS_H) \
   $(TARGET_H) tree-pass.h df.h rtl-ssa.h cfgcleanup.h insn-config.h \
-- 
2.34.1
