Implementation of a new RISC-V optimization pass that folds memory offset
calculations into memory access instructions, plus documentation and testcases.
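
For example, for code like the following (this is the new
fold-mem-offsets-1.c testcase included below), the pass can fold the
stack offset additions into the load/store instructions so that the
array accesses no longer need negative offsets from a separate base:

    void sink(int arr[2]);

    void
    foo(int a, int b, int i)
    {
      int arr[2] = {a, b};
      arr[i]++;
      sink(arr);
    }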

gcc/ChangeLog:

        * config.gcc: Add riscv-fold-mem-offsets.o to extra_objs.
        * config/riscv/riscv-passes.def (INSERT_PASS_AFTER): Schedule a new
        pass.
        * config/riscv/riscv-protos.h (make_pass_fold_mem_offsets): Declare.
        * config/riscv/riscv.opt: New options.
        * config/riscv/t-riscv: New build rule.
        * doc/invoke.texi: Document new option.
        * config/riscv/riscv-fold-mem-offsets.cc: New file.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/fold-mem-offsets-1.c: New test.
        * gcc.target/riscv/fold-mem-offsets-2.c: New test.
        * gcc.target/riscv/fold-mem-offsets-3.c: New test.

Signed-off-by: Manolis Tsamis <manolis.tsa...@vrull.eu>
---

 gcc/config.gcc                                |   2 +-
 gcc/config/riscv/riscv-fold-mem-offsets.cc    | 637 ++++++++++++++++++
 gcc/config/riscv/riscv-passes.def             |   1 +
 gcc/config/riscv/riscv-protos.h               |   1 +
 gcc/config/riscv/riscv.opt                    |   4 +
 gcc/config/riscv/t-riscv                      |   4 +
 gcc/doc/invoke.texi                           |   8 +
 .../gcc.target/riscv/fold-mem-offsets-1.c     |  16 +
 .../gcc.target/riscv/fold-mem-offsets-2.c     |  24 +
 .../gcc.target/riscv/fold-mem-offsets-3.c     |  17 +
 10 files changed, 713 insertions(+), 1 deletion(-)
 create mode 100644 gcc/config/riscv/riscv-fold-mem-offsets.cc
 create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c

diff --git a/gcc/config.gcc b/gcc/config.gcc
index d88071773c9..5dffd21b4c8 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -529,7 +529,7 @@ pru-*-*)
        ;;
 riscv*)
        cpu_type=riscv
-       extra_objs="riscv-builtins.o riscv-c.o riscv-sr.o riscv-shorten-memrefs.o riscv-selftests.o riscv-v.o riscv-vsetvl.o"
+       extra_objs="riscv-builtins.o riscv-c.o riscv-sr.o riscv-shorten-memrefs.o riscv-fold-mem-offsets.o riscv-selftests.o riscv-v.o riscv-vsetvl.o"
 extra_objs="${extra_objs} riscv-vector-builtins.o riscv-vector-builtins-shapes.o riscv-vector-builtins-bases.o"
        extra_objs="${extra_objs} thead.o"
        d_target_objs="riscv-d.o"
diff --git a/gcc/config/riscv/riscv-fold-mem-offsets.cc b/gcc/config/riscv/riscv-fold-mem-offsets.cc
new file mode 100644
index 00000000000..81325bb3beb
--- /dev/null
+++ b/gcc/config/riscv/riscv-fold-mem-offsets.cc
@@ -0,0 +1,637 @@
+/* Fold memory offsets pass for RISC-V.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define IN_TARGET_CODE 1
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "rtl.h"
+#include "tree.h"
+#include "expr.h"
+#include "backend.h"
+#include "regs.h"
+#include "target.h"
+#include "memmodel.h"
+#include "emit-rtl.h"
+#include "insn-config.h"
+#include "recog.h"
+#include "predict.h"
+#include "df.h"
+#include "tree-pass.h"
+#include "cfgrtl.h"
+
+/* This pass tries to optimize memory offset calculations by moving them
+   from add immediate instructions to the memory loads/stores.
+   For example it can transform this:
+
+     addi t4,sp,16
+     add  t2,a6,t4
+     slli t3,t2,1
+     ld   a2,0(t3)
+     addi a2,a2,1
+     sd   a2,8(t2)
+
+   into the following (one instruction less):
+
+     add  t2,a6,sp
+     slli t3,t2,1
+     ld   a2,32(t3)
+     addi a2,a2,1
+     sd   a2,24(t2)
+
+   Usually, the code generated by the previous passes already places the
+   offsets in the memory instructions, but this pass is still beneficial
+   because:
+
+    - There are cases where add instructions are added in a late rtl pass
+      and the rest of the pipeline cannot eliminate them.  Specifically,
+      arrays and structs allocated on the stack can result in multiple
+      unnecessary add instructions that cannot be eliminated easily
+      otherwise.
+
+    - The existing mechanisms that move offsets to memory instructions
+      usually apply only to specific patterns or have other limitations.
+      This pass is very generic and can fold offsets through complex
+      calculations with multiple memory uses and partially overlapping
+      calculations.  As a result it can eliminate more instructions than
+      what is possible otherwise.
+
+   This pass operates on single basic blocks and consists of 4 phases:
+
+    - Phase 1 (Analysis): Find "foldable" instructions.
+      Foldable instructions are those that we know how to propagate
+      a constant addition through (add, slli, mv, ...) and only have other
+      foldable instructions for uses.  In that phase a DFS traversal on the
+      definition tree is performed and foldable instructions are marked on
+      a bitmap.  The add immediate instructions that are reachable in this
+      DFS are candidates for removal since all the intermediate
+      calculations affected by them are also foldable.
+
+    - Phase 2 (Validity): Traverse again, this time calculating the
+      offsets that would result from folding all add immediate instructions
+      found.  Also keep track of which instructions will be folded for this
+      particular offset because folding can be partially or completely
+      shared across a number of different memory instructions.  At this point,
+      since we calculated the actual offset resulting from folding, we check
+      and record whether it is a valid 12-bit immediate.
+
+    - Phase 3 (Commit offsets): Traverse again.  This time it is known if
+      a particular fold is valid so actually fold the offset by changing
+      the RTL statement.  It's important that this phase is separate from the
+      previous one because an instruction that is foldable with a valid
+      offset can result in an invalid offset for another instruction later on.
+
+    - Phase 4 (Commit instruction deletions): Scan all insns and delete
+      all add immediate instructions that were folded.  */
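+
+/* As an illustrative walk-through of the phases on the example above (a
+   sketch, not actual pass output): phase 1 starts from the ld and sd,
+   walks their address definitions and marks the slli, the add and the
+   addi as foldable.  Phase 2 computes the folded offsets, 16 << 1 == 32
+   for the ld and 8 + 16 == 24 for the sd, and checks that both fit in a
+   12-bit immediate.  Phase 3 rewrites the two memory offsets and phase 4
+   deletes the addi (emitting a plain register move first when its
+   destination differs from its source register).  */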
+
+namespace {
+
+const pass_data pass_data_fold_mem =
+{
+  RTL_PASS, /* type */
+  "fold_mem_offsets", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_fold_mem_offsets : public rtl_opt_pass
+{
+public:
+  pass_fold_mem_offsets (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_fold_mem, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+    {
+      return riscv_mfold_mem_offsets
+              && optimize >= 2;
+    }
+
+  virtual unsigned int execute (function *);
+}; // class pass_fold_mem_offsets
+
+/* Bitmap that tracks which instructions are reachable through sequences
+   of foldable instructions.  */
+static bitmap_head can_fold_insn;
+
+/* Bitmap with instructions marked for deletion due to folding.  */
+static bitmap_head pending_remove_insn;
+
+/* Bitmap with instructions that cannot be deleted because that would
+   require folding an offset that's invalid in some memory access.
+   An instruction can be in both PENDING_REMOVE_INSN and CANNOT_REMOVE_INSN
+   at the same time, in which case it cannot be safely deleted.  */
+static bitmap_head cannot_remove_insn;
+
+/* The number of folded addi instructions of the form "addi reg, sp, X".  */
+static int stats_folded_sp;
+
+/* The number of other folded addi instructions.  */
+static int stats_folded_other;
+
+enum fold_mem_phase
+{
+  FM_PHASE_ANALYSIS,
+  FM_PHASE_VALIDITY,
+  FM_PHASE_COMMIT_OFFSETS,
+  FM_PHASE_COMMIT_INSNS
+};
+
+/* Helper function for fold_offsets.
+  Get the single reaching definition of REG as used in INSN.
+  The definition must be inside the same basic block as INSN.
+  Return the definition insn or NULL if there's no definition with
+  the desired criteria.  */
+static rtx_insn*
+get_single_def_in_bb (rtx_insn *insn, rtx reg)
+{
+  df_ref use;
+  struct df_link *ref_chain, *ref_link;
+
+  FOR_EACH_INSN_USE (use, insn)
+    {
+      if (GET_CODE (DF_REF_REG (use)) == SUBREG)
+       return NULL;
+      if (REGNO (DF_REF_REG (use)) == REGNO (reg))
+       break;
+    }
+
+  if (!use)
+    return NULL;
+
+  ref_chain = DF_REF_CHAIN (use);
+
+  if (!ref_chain)
+    return NULL;
+
+  for (ref_link = ref_chain; ref_link; ref_link = ref_link->next)
+    {
+      /* Problem getting some definition for this instruction.  */
+      if (ref_link->ref == NULL)
+       return NULL;
+      if (DF_REF_INSN_INFO (ref_link->ref) == NULL)
+       return NULL;
+      if (global_regs[REGNO (reg)]
+         && !set_of (reg, DF_REF_INSN (ref_link->ref)))
+       return NULL;
+    }
+
+  if (ref_chain->next)
+    return NULL;
+
+  rtx_insn* def = DF_REF_INSN (ref_chain->ref);
+
+  if (BLOCK_FOR_INSN (def) != BLOCK_FOR_INSN (insn))
+    return NULL;
+
+  if (DF_INSN_LUID (def) > DF_INSN_LUID (insn))
+    return NULL;
+
+  return def;
+}
+
+/* Helper function for fold_offsets.
+   Get all the reaching uses of an instruction.  The uses are desired for REG
+   set in INSN.  Return use list or NULL if a use is missing or irregular.
+   If SUCCESS is not NULL then its value is set to false if there are
+   missing or irregular uses and to true otherwise.  */
+static struct df_link*
+get_uses (rtx_insn *insn, rtx reg, bool* success)
+{
+  df_ref def;
+  struct df_link *ref_chain, *ref_link;
+
+  if (success != NULL)
+    *success = false;
+
+  FOR_EACH_INSN_DEF (def, insn)
+    if (REGNO (DF_REF_REG (def)) == REGNO (reg))
+      break;
+
+  if (!def)
+    return NULL;
+
+  ref_chain = DF_REF_CHAIN (def);
+
+  for (ref_link = ref_chain; ref_link; ref_link = ref_link->next)
+    {
+      /* Problem getting some use for this instruction.  */
+      if (ref_link->ref == NULL)
+       return NULL;
+      if (DF_REF_CLASS (ref_link->ref) != DF_REF_REGULAR)
+       return NULL;
+    }
+
+  if (success != NULL)
+    *success = true;
+
+  return ref_chain;
+}
+
+/* Recursive function that computes the foldable offsets through the
+   definitions of REG in INSN given an integer scale factor SCALE.
+   Returns the offset that would have to be added if all instructions
+   in PENDING_DELETES were to be deleted.
+
+  - if ANALYZE is true then it recurses through definitions with the common
+    code and marks instructions that are eligible for folding in the bitmap
+    can_fold_insn.  An instruction is eligible if all its uses are also
+    eligible.  Initially can_fold_insn is set for memory accesses.
+
+  - if ANALYZE is false then it recurses through definitions with the common
+    code and computes and returns the offset that would result if the
+    instructions recorded in PENDING_DELETES were deleted.  */
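+
+/* For instance, in a hypothetical fragment such as
+
+     addi t0,sp,16
+     slli t1,t0,2
+     add  t2,a0,t1
+     ld   a0,0(t2)
+
+   the recursion reaches the addi through the slli with SCALE == 4, so
+   folding away the addi contributes 16 * 4 == 64 to the load's offset.  */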
+static HOST_WIDE_INT
+fold_offsets (rtx_insn* insn, rtx reg, int scale, bool analyze,
+             bitmap pending_deletes)
+{
+  rtx_insn* def = get_single_def_in_bb (insn, reg);
+
+  if (!def)
+    return 0;
+
+  rtx set = single_set (def);
+
+  if (!set)
+    return 0;
+
+  rtx src = SET_SRC (set);
+  rtx dest = SET_DEST (set);
+
+  enum rtx_code code = GET_CODE (src);
+
+  /* Return early for SRC codes that we don't know how to handle.  */
+  if (code != PLUS && code != ASHIFT && code != REG)
+    return 0;
+
+  unsigned int dest_regno = REGNO (dest);
+
+  /* We don't want to fold offsets from instructions that change some
+     particular registers with potentially global side effects.  */
+  if (!GP_REG_P (dest_regno)
+      || dest_regno == STACK_POINTER_REGNUM
+      || (frame_pointer_needed && dest_regno == HARD_FRAME_POINTER_REGNUM)
+      || dest_regno == GP_REGNUM
+      || dest_regno == THREAD_POINTER_REGNUM
+      || dest_regno == RETURN_ADDR_REGNUM)
+    return 0;
+
+  if (analyze)
+    {
+      /* We can only fold through instructions that are eventually used as
+        memory addresses and do not have other uses.  Use the same logic
+        as the offset calculation to visit instructions that can
+        propagate offsets and record in can_fold_insn those whose uses
+        always end up in memory instructions.  */
+
+      if (REG_P (dest))
+       {
+         bool success;
+         struct df_link *uses = get_uses (def, dest, &success), *ref_link;
+
+         if (!success)
+           return 0;
+
+         for (ref_link = uses; ref_link; ref_link = ref_link->next)
+           {
+             rtx_insn* use = DF_REF_INSN (ref_link->ref);
+
+             /* Ignore debug insns during analysis.  */
+             if (DEBUG_INSN_P (use))
+               continue;
+
+             if (!bitmap_bit_p (&can_fold_insn, INSN_UID (use)))
+               return 0;
+
+             rtx use_set = single_set (use);
+
+             /* Prevent folding when a memory store uses the dest register.  */
+             if (use_set
+                 && MEM_P (SET_DEST (use_set))
+                 && REG_P (SET_SRC (use_set))
+                 && REGNO (SET_SRC (use_set)) == REGNO (dest))
+               return 0;
+           }
+
+         bitmap_set_bit (&can_fold_insn, INSN_UID (def));
+       }
+    }
+
+  if (!bitmap_bit_p (&can_fold_insn, INSN_UID (def)))
+    return 0;
+
+  switch (code)
+    {
+    case PLUS:
+      {
+       /* Propagate through add.  */
+       rtx arg1 = XEXP (src, 0);
+       rtx arg2 = XEXP (src, 1);
+
+       HOST_WIDE_INT offset = 0;
+
+       if (REG_P (arg1))
+         offset += fold_offsets (def, arg1, 1, analyze, pending_deletes);
+       else if (GET_CODE (arg1) == ASHIFT && REG_P (XEXP (arg1, 0))
+                && CONST_INT_P (XEXP (arg1, 1)))
+         {
+           /* Also handle shift-and-add from the zba extension.  */
+           int shift_scale = (1 << (int) INTVAL (XEXP (arg1, 1)));
+           offset += fold_offsets (def, XEXP (arg1, 0), shift_scale, analyze,
+                                   pending_deletes);
+         }
+
+       if (REG_P (arg2))
+         offset += fold_offsets (def, arg2, 1, analyze, pending_deletes);
+       else if (CONST_INT_P (arg2) && !analyze)
+         {
+           offset += INTVAL (arg2);
+           bitmap_set_bit (pending_deletes, INSN_UID (def));
+         }
+
+       return scale * offset;
+      }
+    case ASHIFT:
+      {
+       /* Propagate through sll.  */
+       rtx arg1 = XEXP (src, 0);
+       rtx arg2 = XEXP (src, 1);
+
+       if (REG_P (arg1) && CONST_INT_P (arg2))
+         {
+           int shift_scale = (1 << (int) INTVAL (arg2));
+           return scale * fold_offsets (def, arg1, shift_scale, analyze,
+                                        pending_deletes);
+         }
+
+       return 0;
+      }
+    case REG:
+      /* Propagate through mv.  */
+      return scale * fold_offsets (def, src, 1, analyze, pending_deletes);
+    default:
+      /* Cannot propagate.  */
+      return 0;
+    }
+}
+
+/* Helper function for fold_offset_mem.
+   If INSN is a set rtx that loads from or stores to
+   some memory location that could have an offset folded
+   to it, return the rtx for the memory operand.  */
+static rtx
+get_foldable_mem_rtx (rtx_insn* insn)
+{
+  rtx set = single_set (insn);
+
+  if (set != NULL_RTX)
+    {
+      rtx src = SET_SRC (set);
+      rtx dest = SET_DEST (set);
+
+      /* We don't want folding if the memory has
+        unspec/unspec volatile in either src or dest.
+        In particular this also prevents folding
+        when atomics are involved.  */
+      if (GET_CODE (src) == UNSPEC
+         || GET_CODE (src) == UNSPEC_VOLATILE
+         || GET_CODE (dest) == UNSPEC
+         || GET_CODE (dest) == UNSPEC_VOLATILE)
+       return NULL;
+
+      if (MEM_P (src))
+       return src;
+      else if (MEM_P (dest))
+       return dest;
+      else if ((
+               GET_CODE (src) == SIGN_EXTEND
+               || GET_CODE (src) == ZERO_EXTEND
+             )
+             && MEM_P (XEXP (src, 0)))
+       return XEXP (src, 0);
+    }
+
+  return NULL;
+}
+
+/* Driver function that performs the actions defined by PHASE for INSN.  */
+static void
+fold_offset_mem (rtx_insn* insn, int phase)
+{
+  if (phase == FM_PHASE_COMMIT_INSNS)
+    {
+      if (bitmap_bit_p (&pending_remove_insn, INSN_UID (insn))
+         && !bitmap_bit_p (&cannot_remove_insn, INSN_UID (insn)))
+       {
+         rtx set = single_set (insn);
+         rtx src = SET_SRC (set);
+         rtx dest = SET_DEST (set);
+         rtx arg1 = XEXP (src, 0);
+
+         /* INSN is an add immediate addi DEST, SRC1, SRC2 that we
+            must replace with addi DEST, SRC1, 0.  */
+         if (XEXP (src, 0) == stack_pointer_rtx)
+           stats_folded_sp++;
+         else
+           stats_folded_other++;
+
+         if (dump_file)
+           {
+             fprintf (dump_file, "Instruction deleted from folding:");
+             print_rtl_single (dump_file, insn);
+           }
+
+         if (REGNO (dest) != REGNO (arg1))
+           {
+             /* If the dest register is different from the first argument
+                then the addition with constant 0 is equivalent to a move
+                instruction.  We emit the move and let the subsequent
+                pass cprop_hardreg eliminate that if possible.  */
+             rtx arg1_reg_rtx = gen_rtx_REG (GET_MODE (dest), REGNO (arg1));
+             rtx mov_rtx = gen_move_insn (dest, arg1_reg_rtx);
+             df_insn_rescan (emit_insn_after (mov_rtx, insn));
+           }
+
+         /* If the dest register is the same as the first argument
+            then the addition with constant 0 is a no-op.
+            We can now delete the original add immediate instruction.  */
+         delete_insn (insn);
+       }
+    }
+  else
+    {
+      rtx mem = get_foldable_mem_rtx (insn);
+
+      if (!mem)
+       return;
+
+      rtx mem_addr = XEXP (mem, 0);
+      rtx reg;
+      HOST_WIDE_INT cur_off;
+
+      if (REG_P (mem_addr))
+       {
+         reg = mem_addr;
+         cur_off = 0;
+       }
+      else if (GET_CODE (mem_addr) == PLUS
+              && REG_P (XEXP (mem_addr, 0))
+              && CONST_INT_P (XEXP (mem_addr, 1)))
+       {
+         reg = XEXP (mem_addr, 0);
+         cur_off = INTVAL (XEXP (mem_addr, 1));
+       }
+      else
+       return;
+
+      if (phase == FM_PHASE_ANALYSIS)
+       {
+         bitmap_set_bit (&can_fold_insn, INSN_UID (insn));
+         fold_offsets (insn, reg, 1, true, NULL);
+       }
+      else if (phase == FM_PHASE_VALIDITY)
+       {
+         bitmap_head new_pending_deletes;
+         bitmap_initialize (&new_pending_deletes, NULL);
+         HOST_WIDE_INT offset = cur_off + fold_offsets (insn, reg, 1, false,
+                                                       &new_pending_deletes);
+
+         /* Temporarily change the offset in MEM to test whether
+            it results in a valid instruction.  */
+         machine_mode mode = GET_MODE (mem_addr);
+         XEXP (mem, 0) = gen_rtx_PLUS (mode, reg, GEN_INT (offset));
+
+         bool valid_change = recog (PATTERN (insn), insn, 0) >= 0;
+
+         /* Restore the instruction.  */
+         XEXP (mem, 0) = mem_addr;
+
+         if (valid_change)
+           bitmap_ior_into (&pending_remove_insn, &new_pending_deletes);
+         else
+           bitmap_ior_into (&cannot_remove_insn, &new_pending_deletes);
+         bitmap_release (&new_pending_deletes);
+       }
+      else if (phase == FM_PHASE_COMMIT_OFFSETS)
+       {
+         bitmap_head required_deletes;
+         bitmap_initialize (&required_deletes, NULL);
+         HOST_WIDE_INT offset = cur_off + fold_offsets (insn, reg, 1, false,
+                                                        &required_deletes);
+         bool illegal = bitmap_intersect_p (&required_deletes,
+                                            &cannot_remove_insn);
+
+         if (offset == cur_off)
+           return;
+
+         gcc_assert (!bitmap_empty_p (&required_deletes));
+
+         /* We have to update CANNOT_REMOVE_INSN again if transforming
+            this instruction is illegal.  */
+         if (illegal)
+           bitmap_ior_into (&cannot_remove_insn, &required_deletes);
+         else
+           {
+             machine_mode mode = GET_MODE (mem_addr);
+             XEXP (mem, 0) = gen_rtx_PLUS (mode, reg, GEN_INT (offset));
+             df_insn_rescan (insn);
+
+             if (dump_file)
+               {
+                 fprintf (dump_file, "Memory offset changed from "
+                                     HOST_WIDE_INT_PRINT_DEC
+                                     " to "
+                                     HOST_WIDE_INT_PRINT_DEC
+                                     " for instruction:\n", cur_off, offset);
+                 print_rtl_single (dump_file, insn);
+               }
+           }
+         bitmap_release (&required_deletes);
+       }
+    }
+}
+
+unsigned int
+pass_fold_mem_offsets::execute (function *fn)
+{
+  basic_block bb;
+  rtx_insn *insn;
+
+  df_set_flags (DF_RD_PRUNE_DEAD_DEFS | DF_DEFER_INSN_RESCAN);
+  df_chain_add_problem (DF_UD_CHAIN + DF_DU_CHAIN);
+  df_analyze ();
+
+  bitmap_initialize (&can_fold_insn, NULL);
+  bitmap_initialize (&pending_remove_insn, NULL);
+  bitmap_initialize (&cannot_remove_insn, NULL);
+
+  stats_folded_sp = 0;
+  stats_folded_other = 0;
+
+  FOR_ALL_BB_FN (bb, fn)
+    {
+      /* The shorten-memrefs pass runs when a BB is optimized for size
+        and moves offsets from multiple memory instructions to a common
+        add instruction.  Disable folding if optimizing for size because
+        this pass will cancel the effects of shorten-memrefs.  */
+      if (optimize_bb_for_size_p (bb))
+       continue;
+
+      bitmap_clear (&can_fold_insn);
+      bitmap_clear (&pending_remove_insn);
+      bitmap_clear (&cannot_remove_insn);
+
+      FOR_BB_INSNS (bb, insn)
+       fold_offset_mem (insn, FM_PHASE_ANALYSIS);
+
+      FOR_BB_INSNS (bb, insn)
+       fold_offset_mem (insn, FM_PHASE_VALIDITY);
+
+      FOR_BB_INSNS (bb, insn)
+       fold_offset_mem (insn, FM_PHASE_COMMIT_OFFSETS);
+
+      FOR_BB_INSNS (bb, insn)
+       fold_offset_mem (insn, FM_PHASE_COMMIT_INSNS);
+    }
+
+  statistics_counter_event (cfun, "addi with sp fold", stats_folded_sp);
+  statistics_counter_event (cfun, "other addi fold", stats_folded_other);
+
+  bitmap_release (&can_fold_insn);
+  bitmap_release (&pending_remove_insn);
+  bitmap_release (&cannot_remove_insn);
+
+  return 0;
+}
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_fold_mem_offsets (gcc::context *ctxt)
+{
+  return new pass_fold_mem_offsets (ctxt);
+}
diff --git a/gcc/config/riscv/riscv-passes.def b/gcc/config/riscv/riscv-passes.def
index 4084122cf0a..dc08daadc66 100644
--- a/gcc/config/riscv/riscv-passes.def
+++ b/gcc/config/riscv/riscv-passes.def
@@ -18,4 +18,5 @@
    <http://www.gnu.org/licenses/>.  */
 
 INSERT_PASS_AFTER (pass_rtl_store_motion, 1, pass_shorten_memrefs);
+INSERT_PASS_AFTER (pass_regrename, 1, pass_fold_mem_offsets);
 INSERT_PASS_BEFORE (pass_fast_rtl_dce, 1, pass_vsetvl);
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 5f78fd579bb..b89a82adb0e 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -104,6 +104,7 @@ extern void riscv_parse_arch_string (const char *, struct gcc_options *, locatio
 extern bool riscv_hard_regno_rename_ok (unsigned, unsigned);
 
 rtl_opt_pass * make_pass_shorten_memrefs (gcc::context *ctxt);
+rtl_opt_pass * make_pass_fold_mem_offsets (gcc::context *ctxt);
 rtl_opt_pass * make_pass_vsetvl (gcc::context *ctxt);
 
 /* Information about one CPU we know about.  */
diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
index 63d4710cb15..5e1fbdbedcc 100644
--- a/gcc/config/riscv/riscv.opt
+++ b/gcc/config/riscv/riscv.opt
@@ -105,6 +105,10 @@ Convert BASE + LARGE_OFFSET addresses to NEW_BASE + SMALL_OFFSET to allow more
 memory accesses to be generated as compressed instructions.  Currently targets
 32-bit integer load/stores.
 
+mfold-mem-offsets
+Target Bool Var(riscv_mfold_mem_offsets) Init(1)
+Fold instructions calculating memory offsets to the memory access instruction if possible.
+
 mcmodel=
 Target RejectNegative Joined Enum(code_model) Var(riscv_cmodel) Init(TARGET_DEFAULT_CMODEL)
 Specify the code model.
diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv
index 1252d6f851a..f29cf463867 100644
--- a/gcc/config/riscv/t-riscv
+++ b/gcc/config/riscv/t-riscv
@@ -76,6 +76,10 @@ riscv-shorten-memrefs.o: $(srcdir)/config/riscv/riscv-shorten-memrefs.cc \
        $(COMPILE) $<
        $(POSTCOMPILE)
 
+riscv-fold-mem-offsets.o: $(srcdir)/config/riscv/riscv-fold-mem-offsets.cc
+       $(COMPILE) $<
+       $(POSTCOMPILE)
+
 riscv-selftests.o: $(srcdir)/config/riscv/riscv-selftests.cc \
   $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) $(TREE_H) output.h \
   $(C_COMMON_H) $(TARGET_H) $(OPTABS_H) $(EXPR_H) $(INSN_ATTR_H) $(EMIT_RTL_H)
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index ee78591c73e..39b57cab595 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1218,6 +1218,7 @@ See RS/6000 and PowerPC Options.
 -msmall-data-limit=@var{N-bytes}
 -msave-restore  -mno-save-restore
 -mshorten-memrefs  -mno-shorten-memrefs
+-mfold-mem-offsets  -mno-fold-mem-offsets
 -mstrict-align  -mno-strict-align
 -mcmodel=medlow  -mcmodel=medany
 -mexplicit-relocs  -mno-explicit-relocs
@@ -29048,6 +29049,13 @@ of 'new base + small offset'.  If the new base gets stored in a compressed
 register, then the new load/store can be compressed.  Currently targets 32-bit
 integer load/stores only.
 
+@opindex mfold-mem-offsets
+@item -mfold-mem-offsets
+@itemx -mno-fold-mem-offsets
+Do or do not attempt to move constant addition calculations used for memory
+offsets into the corresponding memory instructions.  The default is
+@option{-mfold-mem-offsets} at levels @option{-O2} and @option{-O3}.
+
 @opindex mstrict-align
 @item -mstrict-align
 @itemx -mno-strict-align
diff --git a/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c
new file mode 100644
index 00000000000..574cc92b6ab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mfold-mem-offsets" } */
+
+void sink(int arr[2]);
+
+void
+foo(int a, int b, int i)
+{
+  int arr[2] = {a, b};
+  arr[i]++;
+  sink(arr);
+}
+
+// Should compile without negative memory offsets when using -mfold-mem-offsets
+/* { dg-final { scan-assembler-not "lw\t.*,-.*\\(.*\\)" } } */
+/* { dg-final { scan-assembler-not "sw\t.*,-.*\\(.*\\)" } } */
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c
new file mode 100644
index 00000000000..e6c251d3a3c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-2.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mfold-mem-offsets" } */
+
+void sink(int arr[3]);
+
+void
+foo(int a, int b, int c, int i)
+{
+  int arr1[3] = {a, b, c};
+  int arr2[3] = {a, c, b};
+  int arr3[3] = {c, b, a};
+
+  arr1[i]++;
+  arr2[i]++;
+  arr3[i]++;
+  
+  sink(arr1);
+  sink(arr2);
+  sink(arr3);
+}
+
+// Should compile without negative memory offsets when using -mfold-mem-offsets
+/* { dg-final { scan-assembler-not "lw\t.*,-.*\\(.*\\)" } } */
+/* { dg-final { scan-assembler-not "sw\t.*,-.*\\(.*\\)" } } */
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c
new file mode 100644
index 00000000000..8586d3e3a29
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/fold-mem-offsets-3.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mfold-mem-offsets" } */
+
+void load(int arr[2]);
+
+int
+foo(long unsigned int i)
+{
+  int arr[2];
+  load(arr);
+
+  return arr[3 * i + 77];
+}
+
+// Should compile without negative memory offsets when using -mfold-mem-offsets
+/* { dg-final { scan-assembler-not "lw\t.*,-.*\\(.*\\)" } } */
+/* { dg-final { scan-assembler-not "addi\t.*,.*,77" } } */
\ No newline at end of file
-- 
2.34.1
