Hello All:

This pass is registered before ira rtl pass.
Bootstrapped and regtested for powerpc64-linux-gnu.

No regressions for spec 2017 benchmarks and improvements for some of the
FP and INT benchmarks.

Vladimir:

I have also modified the IRA and LRA register allocators; please review those changes.

Thanks & Regards
Ajit

rs6000: New pass for replacement of adjacent lxv with lxvp.

New pass that replaces two lxv loads from adjacent memory addresses
with a single lxvp load.  The pass is registered before the IRA RTL pass.

2024-01-09  Ajit Kumar Agarwal  <aagar...@linux.ibm.com>

gcc/ChangeLog:

        * config/rs6000/rs6000-passes.def: Register vecload pass before
        pass_ira.
        * config/rs6000/rs6000-vecload-opt.cc: New file implementing the
        vecload pass.
        * config.gcc: Add rs6000-vecload-opt.o to extra_objs.
        * config/rs6000/rs6000-protos.h: Add new prototype for vecload
        pass.
        * config/rs6000/rs6000.cc: Add new prototype for vecload pass.
        * config/rs6000/t-rs6000: Add new rule.
        * ira-color.cc: Form register pair with adjacent loads.
        * lra-assigns.cc: Skip modifying register pair assignment.
        * lra-int.h: Add pseudo_conflict field in lra_reg_p structure.
        * lra.cc: Initialize pseudo_conflict field.
        * ira-build.cc: Use REG_FREQ.

gcc/testsuite/ChangeLog:

        * g++.target/powerpc/vecload.C: New test.
        * g++.target/powerpc/vecload1.C: New test.
        * gcc.target/powerpc/mma-builtin-1.c: Modify test.
---
 gcc/config.gcc                                |   4 +-
 gcc/config/rs6000/rs6000-passes.def           |   1 +
 gcc/config/rs6000/rs6000-protos.h             |   5 +-
 gcc/config/rs6000/rs6000-vecload-opt.cc       | 395 ++++++++++++++++++
 gcc/config/rs6000/rs6000.cc                   |   8 +-
 gcc/config/rs6000/t-rs6000                    |   5 +
 gcc/ira-build.cc                              |   2 +-
 gcc/ira-color.cc                              | 214 +++++++++-
 gcc/lra-assigns.cc                            | 103 ++++-
 gcc/lra-int.h                                 |   1 +
 gcc/lra.cc                                    |   1 +
 gcc/testsuite/g++.target/powerpc/vecload.C    |  15 +
 gcc/testsuite/g++.target/powerpc/vecload1.C   |  22 +
 .../gcc.target/powerpc/mma-builtin-1.c        |   4 +-
 14 files changed, 766 insertions(+), 14 deletions(-)
 create mode 100644 gcc/config/rs6000/rs6000-vecload-opt.cc
 create mode 100644 gcc/testsuite/g++.target/powerpc/vecload.C
 create mode 100644 gcc/testsuite/g++.target/powerpc/vecload1.C

diff --git a/gcc/config.gcc b/gcc/config.gcc
index f0676c830e8..4cf15e807de 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -518,7 +518,7 @@ or1k*-*-*)
        ;;
 powerpc*-*-*)
        cpu_type=rs6000
-       extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
+       extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o 
rs6000-vecload-opt.o"
        extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
        extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
        extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
@@ -555,7 +555,7 @@ riscv*)
        ;;
 rs6000*-*-*)
        extra_options="${extra_options} g.opt fused-madd.opt 
rs6000/rs6000-tables.opt"
-       extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
+       extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o 
rs6000-vecload-opt.o"
        extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
        target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-logue.cc 
\$(srcdir)/config/rs6000/rs6000-call.cc"
        target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
diff --git a/gcc/config/rs6000/rs6000-passes.def 
b/gcc/config/rs6000/rs6000-passes.def
index ca899d5f7af..e6a9810ee24 100644
--- a/gcc/config/rs6000/rs6000-passes.def
+++ b/gcc/config/rs6000/rs6000-passes.def
@@ -28,6 +28,7 @@ along with GCC; see the file COPYING3.  If not see
      The power8 does not have instructions that automaticaly do the byte swaps
      for loads and stores.  */
   INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps);
+  INSERT_PASS_BEFORE (pass_ira, 1, pass_analyze_vecload);
 
   /* Pass to do the PCREL_OPT optimization that combines the load of an
      external symbol's address along with a single load or store using that
diff --git a/gcc/config/rs6000/rs6000-protos.h 
b/gcc/config/rs6000/rs6000-protos.h
index f70118ea40f..83ee773a6f8 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -343,12 +343,15 @@ namespace gcc { class context; }
 class rtl_opt_pass;
 
 extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *);
+extern rtl_opt_pass *make_pass_analyze_vecload (gcc::context *);
 extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *);
 extern bool rs6000_sum_of_two_registers_p (const_rtx expr);
 extern bool rs6000_quadword_masked_address_p (const_rtx exp);
 extern rtx rs6000_gen_lvx (enum machine_mode, rtx, rtx);
 extern rtx rs6000_gen_stvx (enum machine_mode, rtx, rtx);
-
+extern bool mode_supports_dq_form (machine_mode);
+extern bool get_memref_parts (rtx, rtx *, HOST_WIDE_INT *, HOST_WIDE_INT *);
+extern rtx adjacent_mem_locations (rtx, rtx);
 extern void rs6000_emit_xxspltidp_v2df (rtx, long value);
 extern gimple *currently_expanding_gimple_stmt;
 extern bool rs6000_opaque_type_invalid_use_p (gimple *);
diff --git a/gcc/config/rs6000/rs6000-vecload-opt.cc 
b/gcc/config/rs6000/rs6000-vecload-opt.cc
new file mode 100644
index 00000000000..f02c8337f2e
--- /dev/null
+++ b/gcc/config/rs6000/rs6000-vecload-opt.cc
@@ -0,0 +1,395 @@
+/* Subroutines used to replace adjacent lxv loads with lxvp
+   for TARGET_POWER10 and TARGET_VSX.
+
+   Copyright (C) 2020-2023 Free Software Foundation, Inc.
+   Contributed by Ajit Kumar Agarwal <aagar...@linux.ibm.com>.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#define IN_TARGET_CODE 1
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "target.h"
+#include "rtl.h"
+#include "tree-pass.h"
+#include "df.h"
+#include "dumpfile.h"
+#include "rs6000-internal.h"
+#include "rs6000-protos.h"
+
+/* Return true when every reference chained to the definitions of INSN
+   has a non-null location that is not a SUBREG.  Return false as soon
+   as one chained reference has no location or is a SUBREG.  */
+static bool
+is_feasible (rtx_insn *insn)
+{
+  df_ref def;
+  df_insn_info *info = DF_INSN_INFO_GET (insn);
+
+  FOR_EACH_INSN_INFO_DEF (def, info)
+    {
+      struct df_link *link = DF_REF_CHAIN (def);
+
+      /* Nothing to check for a definition without a chain, or whose
+         first chained reference is artificial.  */
+      if (!link || !link->ref || DF_REF_IS_ARTIFICIAL (link->ref))
+        continue;
+
+      for (; link && link->ref; link = link->next)
+        {
+          rtx *loc = DF_REF_LOC (link->ref);
+          if (!loc || *loc == NULL_RTX || GET_CODE (*loc) == SUBREG)
+            return false;
+        }
+    }
+  return true;
+}
+
+/* Call df_insn_rescan on each insn reached through the chains of the
+   definitions of INSN, as long as that insn is a single set whose
+   source is an UNSPEC.  The walk stops, and the function returns, at
+   the first chained insn that is not a single set or whose source is
+   not an UNSPEC (a VEC_SELECT source therefore also ends the walk).  */
+void
+set_rescan_for_unspec (rtx_insn *insn)
+{
+  df_ref def;
+  df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+
+  FOR_EACH_INSN_INFO_DEF (def, insn_info)
+    for (struct df_link *link = DF_REF_CHAIN (def);
+         link && link->ref; link = link->next)
+      {
+        rtx_insn *chained_insn = DF_REF_INSN (link->ref);
+        rtx set = single_set (chained_insn);
+
+        if (set == NULL_RTX || GET_CODE (SET_SRC (set)) != UNSPEC)
+          return;
+
+        df_insn_rescan (chained_insn);
+      }
+}
+
+/* Walk the chains of the definitions of INSN and return the first
+   chained insn that is a single set with an UNSPEC source.  Return
+   null if a chained insn that is not a single set is met first, or if
+   no chained insn has an UNSPEC source.  */
+rtx_insn *
+get_rtx_UNSPEC (rtx_insn *insn)
+{
+  df_ref def;
+  df_insn_info *info = DF_INSN_INFO_GET (insn);
+
+  FOR_EACH_INSN_INFO_DEF (def, info)
+    for (struct df_link *link = DF_REF_CHAIN (def);
+         link && link->ref; link = link->next)
+      {
+        rtx_insn *chained_insn = DF_REF_INSN (link->ref);
+        rtx set = single_set (chained_insn);
+
+        if (!set)
+          return nullptr;
+        if (GET_CODE (SET_SRC (set)) == UNSPEC)
+          return chained_insn;
+      }
+  return nullptr;
+}
+
+/* Try to replace the two lxv loads INSN1 and INSN2 with one lxvp load
+   emitted before INSN1.  Return true if the replacement was made.
+   Nothing is changed and false is returned when any of these holds:
+
+   - the sources of the two loads have different machine modes,
+
+   - the destination of INSN1 is TImode, or is not an Altivec/VSX
+     vector mode,
+
+   - is_feasible rejects INSN1 or INSN2,
+
+   - INSN1 carries a REG_EQUAL or REG_EQUIV note,
+
+   - an insn chained to a definition of INSN1 is not a single set with
+     an UNSPEC source (this also rejects VEC_SELECT sources),
+
+   - the machine mode of such an UNSPEC is neither XOmode nor the mode
+     of the destination of INSN1,
+
+   - an operand of such an UNSPEC is an EQ expression,
+
+   - INSN1 has exactly one chained insn and get_rtx_UNSPEC returns that
+     same insn for INSN2.  */
+
+static bool
+replace_lxv_with_lxvp (rtx_insn *insn1, rtx_insn *insn2)
+{
+  rtx body = PATTERN (insn1);
+  rtx src_exp = SET_SRC (body);
+  rtx dest_exp = SET_DEST (body);
+  rtx insn2_body = PATTERN (insn2);
+  rtx insn2_dest_exp = SET_DEST (insn2_body);
+
+  if (GET_MODE (src_exp) != GET_MODE (SET_SRC (insn2_body)))
+    return false;
+
+  if (GET_MODE (dest_exp) == TImode)
+    return false;
+
+  if (!ALTIVEC_OR_VSX_VECTOR_MODE (GET_MODE (dest_exp)))
+    return false;
+
+  if (!is_feasible (insn1) || !is_feasible (insn2))
+    return false;
+
+  for (rtx note = REG_NOTES (insn1); note; note = XEXP (note, 1))
+    if (REG_NOTE_KIND (note) == REG_EQUAL
+        || REG_NOTE_KIND (note) == REG_EQUIV)
+      return false;
+
+  /* Check every insn chained to the definitions of INSN1 and count
+     them in NO_DEP.  */
+  int no_dep = 0;
+  df_ref def;
+  df_insn_info *insn_info = DF_INSN_INFO_GET (insn1);
+
+  FOR_EACH_INSN_INFO_DEF (def, insn_info)
+    for (struct df_link *link = DF_REF_CHAIN (def);
+         link && link->ref; link = link->next)
+      {
+        rtx set = single_set (DF_REF_INSN (link->ref));
+        if (set == NULL_RTX)
+          return false;
+
+        rtx op0 = SET_SRC (set);
+        if (GET_CODE (op0) != UNSPEC)
+          return false;
+
+        if (GET_MODE (op0) != XOmode
+            && GET_MODE (op0) != GET_MODE (dest_exp))
+          return false;
+
+        int nvecs = XVECLEN (op0, 0);
+        for (int i = 0; i < nvecs; i++)
+          if (GET_CODE (XVECEXP (op0, 0, i)) == EQ)
+            return false;
+
+        ++no_dep;
+      }
+
+  rtx_insn *unspec_insn = get_rtx_UNSPEC (insn1);
+
+  if (unspec_insn && unspec_insn == get_rtx_UNSPEC (insn2) && no_dep == 1)
+    return false;
+
+  /* Make the references chained to the definitions of INSN2 refer to
+     the destination of INSN1.  */
+  insn_info = DF_INSN_INFO_GET (insn2);
+  FOR_EACH_INSN_INFO_DEF (def, insn_info)
+    {
+      struct df_link *link = DF_REF_CHAIN (def);
+      if (!link || !link->ref || DF_REF_IS_ARTIFICIAL (link->ref))
+        continue;
+      for (; link && link->ref; link = link->next)
+        {
+          rtx *loc = DF_REF_LOC (link->ref);
+          *loc = dest_exp;
+        }
+    }
+
+  /* Make the references chained to the definitions of INSN1 refer to
+     the destination of INSN2.  The rtx previously stored at each
+     location is switched to OOmode before it is replaced.  */
+  insn_info = DF_INSN_INFO_GET (insn1);
+  FOR_EACH_INSN_INFO_DEF (def, insn_info)
+    {
+      struct df_link *link = DF_REF_CHAIN (def);
+      if (!link || !link->ref || DF_REF_IS_ARTIFICIAL (link->ref))
+        continue;
+      for (; link && link->ref; link = link->next)
+        {
+          rtx *loc = DF_REF_LOC (link->ref);
+          PUT_MODE_RAW (*loc, OOmode);
+          *loc = insn2_dest_exp;
+        }
+    }
+
+  set_rescan_for_unspec (insn1);
+  set_rescan_for_unspec (insn2);
+  df_insn_rescan (insn1);
+  df_insn_rescan (insn2);
+
+  /* Build the OOmode load from the source and destination of INSN1 and
+     emit it in front of INSN1.  */
+  PUT_MODE_RAW (src_exp, OOmode);
+  PUT_MODE_RAW (dest_exp, OOmode);
+  rtx lxv = gen_rtx_SET (dest_exp, src_exp);
+  rtx_insn *new_insn = emit_insn_before (lxv, insn1);
+  set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn1));
+  df_insn_rescan (new_insn);
+
+  if (dump_file)
+    {
+      unsigned int new_uid = INSN_UID (new_insn);
+      fprintf (dump_file, "Replacing lxv %d with lxvp  %d\n",
+               INSN_UID (insn1), new_uid);
+      print_rtl_single (dump_file, new_insn);
+      print_rtl_single (dump_file, insn1);
+      print_rtl_single (dump_file, insn2);
+    }
+
+  /* Remove both original loads.  */
+  df_insn_delete (insn1);
+  remove_insn (insn1);
+  df_insn_delete (insn2);
+  remove_insn (insn2);
+  insn1->set_deleted ();
+  insn2->set_deleted ();
+  return true;
+}
+
+/* Scan the basic blocks of FUN for pairs of lxv loads from adjacent
+   memory addresses and replace each pair with one lxvp load.
+
+   A qualifying load whose address is a plain register becomes the
+   candidate first load.  The next qualifying load in the same basic
+   block is paired with it when adjacent_mem_locations reports the
+   candidate as the lower address and both loads use the same base
+   register.
+
+   Return nonzero if at least one pair was replaced.  */
+unsigned int
+rs6000_analyze_vecload (function *fun)
+{
+  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
+  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+  df_analyze ();
+  df_set_flags (DF_DEFER_INSN_RESCAN);
+
+  /* Rebuild ud- and du-chains.
+     NOTE(review): nothing modifies the insn stream between the two
+     df_analyze calls, so this second setup looks redundant -- confirm
+     before removing it.  */
+  df_remove_problem (df_chain);
+  df_process_deferred_rescans ();
+  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
+  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+  df_analyze ();
+  df_set_flags (DF_DEFER_INSN_RESCAN);
+
+  basic_block bb;
+  bool changed = false;
+  rtx_insn *insn, *curr_insn = 0;
+
+  FOR_ALL_BB_FN (bb, fun)
+    {
+      /* The candidate first load is tracked per basic block, so a load
+         is never paired with a candidate from an earlier block.  */
+      rtx_insn *insn1 = 0;
+      bool first_vec_insn = false;
+      unsigned int regno = 0;
+
+      FOR_BB_INSNS_SAFE (bb, insn, curr_insn)
+        {
+          if (LABEL_P (insn))
+            continue;
+
+          if (!NONDEBUG_INSN_P (insn) || GET_CODE (PATTERN (insn)) != SET)
+            continue;
+
+          rtx set = single_set (insn);
+          rtx src = SET_SRC (set);
+          machine_mode mode = GET_MODE (SET_DEST (set));
+
+          if (!(TARGET_VSX && TARGET_POWER10 && MEM_P (src)))
+            continue;
+
+          if (!mem_operand_ds_form (src, mode)
+              && !(mode_supports_dq_form (mode)
+                   && quad_address_p (XEXP (src, 0), mode, false)))
+            continue;
+
+          if (first_vec_insn)
+            {
+              first_vec_insn = false;
+              rtx addr = XEXP (src, 0);
+              rtx insn1_src = SET_SRC (PATTERN (insn1));
+
+              if (adjacent_mem_locations (insn1_src, src) == insn1_src)
+                {
+                  rtx base = XEXP (addr, 0);
+
+                  /* Accumulate the result so that a later failed
+                     attempt does not hide an earlier replacement.  */
+                  if (REG_P (base)
+                      && regno == REGNO (base)
+                      && replace_lxv_with_lxvp (insn1, insn))
+                    changed = true;
+                }
+            }
+
+          /* A load through a plain register address is the candidate
+             first load for the next qualifying load.  */
+          if (REG_P (XEXP (src, 0)))
+            {
+              regno = REGNO (XEXP (src, 0));
+              first_vec_insn = true;
+              insn1 = insn;
+            }
+        }
+    }
+
+  return changed;
+}
+
+/* Pass descriptor for the "vecload" RTL pass.  */
+const pass_data pass_data_analyze_vecload =
+{
+  RTL_PASS,        /* type */
+  "vecload",       /* name */
+  OPTGROUP_NONE,   /* optinfo_flags */
+  TV_NONE,         /* tv_id */
+  0,               /* properties_required */
+  0,               /* properties_provided */
+  0,               /* properties_destroyed */
+  0,               /* todo_flags_start */
+  TODO_df_finish,  /* todo_flags_finish */
+};
+
+/* RTL pass object that runs rs6000_analyze_vecload on each function.  */
+class pass_analyze_vecload : public rtl_opt_pass
+{
+public:
+  pass_analyze_vecload (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_analyze_vecload, ctxt)
+  {}
+
+  /* opt_pass methods: */
+
+  /* Run only when optimizing with both VSX and Power10 enabled.  */
+  bool gate (function *) final override
+    {
+      return (optimize > 0 && TARGET_VSX && TARGET_POWER10);
+    }
+
+  unsigned int execute (function *fun) final override
+    {
+      return rs6000_analyze_vecload (fun);
+    }
+}; // class pass_analyze_vecload
+
+/* Create a new instance of the vecload pass for context CTXT.  */
+rtl_opt_pass *
+make_pass_analyze_vecload (gcc::context *ctxt)
+{
+  rtl_opt_pass *pass = new pass_analyze_vecload (ctxt);
+  return pass;
+}
+
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 6b9a40fcc66..5f0ec8239c1 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -387,7 +387,7 @@ mode_supports_vmx_dform (machine_mode mode)
 /* Return true if we have D-form addressing in VSX registers.  This addressing
    is more limited than normal d-form addressing in that the offset must be
    aligned on a 16-byte boundary.  */
-static inline bool
+bool
 mode_supports_dq_form (machine_mode mode)
 {
   return ((reg_addr[mode].addr_mask[RELOAD_REG_ANY] & RELOAD_REG_QUAD_OFFSET)
@@ -1178,6 +1178,8 @@ static bool rs6000_secondary_reload_move (enum 
rs6000_reg_type,
                                          secondary_reload_info *,
                                          bool);
 rtl_opt_pass *make_pass_analyze_swaps (gcc::context*);
+rtl_opt_pass *make_pass_analyze_vecload (gcc::context*);
+
 
 /* Hash table stuff for keeping track of TOC entries.  */
 
@@ -18644,7 +18646,7 @@ set_to_load_agen (rtx_insn *out_insn, rtx_insn *in_insn)
    This function only looks for REG or REG+CONST address forms.
    REG+REG address form will return false. */
 
-static bool
+bool
 get_memref_parts (rtx mem, rtx *base, HOST_WIDE_INT *offset,
                  HOST_WIDE_INT *size)
 {
@@ -18676,7 +18678,7 @@ get_memref_parts (rtx mem, rtx *base, HOST_WIDE_INT 
*offset,
    adjacent, then return the argument that has the lower address.
    Otherwise, return NULL_RTX.  */
 
-static rtx
+rtx
 adjacent_mem_locations (rtx mem1, rtx mem2)
 {
   rtx reg1, reg2;
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index f183b42ce1d..0b6852f2d38 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -35,6 +35,11 @@ rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.cc
        $(COMPILE) $<
        $(POSTCOMPILE)
 
+rs6000-vecload-opt.o: $(srcdir)/config/rs6000/rs6000-vecload-opt.cc
+       $(COMPILE) $<
+       $(POSTCOMPILE)
+
+
 rs6000-d.o: $(srcdir)/config/rs6000/rs6000-d.cc
        $(COMPILE) $<
        $(POSTCOMPILE)
diff --git a/gcc/ira-build.cc b/gcc/ira-build.cc
index c715a834f12..fe78d967e75 100644
--- a/gcc/ira-build.cc
+++ b/gcc/ira-build.cc
@@ -1862,7 +1862,7 @@ create_insn_allocnos (rtx x, rtx outer, bool output_p)
            }
 
          ALLOCNO_NREFS (a)++;
-         ALLOCNO_FREQ (a) += REG_FREQ_FROM_BB (curr_bb);
+         ALLOCNO_FREQ (a) += REG_FREQ (regno);
          if (output_p)
            bitmap_set_bit (ira_curr_loop_tree_node->modified_regnos, regno);
        }
diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc
index 214a4f16d3c..d5f6f885957 100644
--- a/gcc/ira-color.cc
+++ b/gcc/ira-color.cc
@@ -1047,6 +1047,8 @@ setup_profitable_hard_regs (void)
        continue;
       data = ALLOCNO_COLOR_DATA (a);
       if (ALLOCNO_UPDATED_HARD_REG_COSTS (a) == NULL
+         && ALLOCNO_CLASS_COST (a) > 0
+         && ALLOCNO_MEMORY_COST (a) > 0 
          && ALLOCNO_CLASS_COST (a) > ALLOCNO_MEMORY_COST (a)
          /* Do not empty profitable regs for static chain pointer
             pseudo when non-local goto is used.  */
@@ -1131,6 +1133,8 @@ setup_profitable_hard_regs (void)
                                       hard_regno))
                continue;
              if (ALLOCNO_UPDATED_MEMORY_COST (a) < costs[j]
+                 && ALLOCNO_UPDATED_MEMORY_COST (a) > 0
+                 && costs[j] > 0
                  /* Do not remove HARD_REGNO for static chain pointer
                     pseudo when non-local goto is used.  */
                  && ! non_spilled_static_chain_regno_p (ALLOCNO_REGNO (a)))
@@ -1919,6 +1923,175 @@ spill_soft_conflicts (ira_allocno_t a, bitmap 
allocnos_to_spill,
     }
 }
 
+/* Return true if check_hard_reg_p accepts hard register HARD_REGNO for
+   allocno A, given CONFLICTING_REGS and the profitable hard registers
+   recorded in the color data of A.  */
+static bool
+pair_hard_reg_ok_p (ira_allocno_t a, int hard_regno,
+                    HARD_REG_SET *conflicting_regs)
+{
+  return check_hard_reg_p (a, hard_regno, conflicting_regs,
+                           ALLOCNO_COLOR_DATA (a)->profitable_hard_regs);
+}
+
+/* Form register pair for adjacent loads with unified load.
+
+   Look at the allocnos conflicting with A and suggest a hard register
+   for A next to the hard register of a conflicting allocno.  REGNO is
+   the hard register the caller is currently considering and
+   CONFLICTING_REGS is passed through to check_hard_reg_p.  Return the
+   suggested hard register, or -1 when there is no suggestion.
+
+   Pseudos whose DF_REG_DEF_COUNT is zero are treated specially.
+   NOTE(review): presumably these are the pseudos left without a
+   definition by the vecload pass -- confirm.  */
+static int
+form_register_pairs (ira_allocno_t a, int regno,
+                     HARD_REG_SET *conflicting_regs)
+{
+  int n = ALLOCNO_NUM_OBJECTS (a);
+  int best_hard_regno = -1;
+
+  for (int i = 0; i < n; i++)
+    {
+      ira_object_t obj = ALLOCNO_OBJECT (a, i);
+      ira_object_t conflict_obj;
+      ira_object_conflict_iterator oci;
+
+      if (OBJECT_CONFLICT_ARRAY (obj) == NULL)
+        continue;
+
+      FOR_EACH_OBJECT_CONFLICT (obj, conflict_obj, oci)
+        {
+          ira_allocno_t conflict_a = OBJECT_ALLOCNO (conflict_obj);
+          enum reg_class aclass = ALLOCNO_CLASS (a);
+          machine_mode mode = ALLOCNO_MODE (a);
+          machine_mode confl_mode = ALLOCNO_MODE (conflict_a);
+          int a_nregs = ira_reg_class_max_nregs[aclass][mode];
+          int cl = ALLOCNO_CLASS (conflict_a);
+          int conf_nregs = ira_reg_class_max_nregs[cl][confl_mode];
+          int first_class_regno = ira_class_hard_regs[aclass][0];
+
+          if (mode != confl_mode && a_nregs < conf_nregs)
+            {
+              /* A needs fewer registers than the conflicting allocno.  */
+              if (DF_REG_DEF_COUNT (ALLOCNO_REGNO (a)) == 0)
+                {
+                  int conflict_hard_regno = ALLOCNO_HARD_REGNO (conflict_a);
+
+                  if (regno < first_class_regno)
+                    regno = first_class_regno;
+
+                  if (conflict_hard_regno > 0)
+                    best_hard_regno = conflict_hard_regno + 1;
+                  else
+                    best_hard_regno = regno;
+
+                  if (conflict_hard_regno >= 0)
+                    return best_hard_regno;
+
+                  if (!pair_hard_reg_ok_p (a, best_hard_regno,
+                                           conflicting_regs))
+                    return -1;
+
+                  if (best_hard_regno % 2 != 0)
+                    return best_hard_regno;
+
+                  /* For an even register return a neighbour, staying
+                     at or above the first register of the class.  */
+                  if (best_hard_regno - 1 < first_class_regno)
+                    return best_hard_regno + 1;
+                  return best_hard_regno - 1;
+                }
+
+              if (DF_REG_DEF_COUNT (ALLOCNO_REGNO (a)) != 0
+                  && DF_REG_DEF_COUNT (ALLOCNO_REGNO (conflict_a)) == 0)
+                {
+                  best_hard_regno = ALLOCNO_HARD_REGNO (conflict_a) - 1;
+                  if (pair_hard_reg_ok_p (a, best_hard_regno,
+                                          conflicting_regs))
+                    return best_hard_regno;
+                }
+              else if (DF_REG_DEF_COUNT (ALLOCNO_REGNO (a)) != 0)
+                {
+                  best_hard_regno = ALLOCNO_HARD_REGNO (conflict_a) + 2;
+
+                  if (pair_hard_reg_ok_p (a, best_hard_regno,
+                                          conflicting_regs))
+                    return best_hard_regno;
+
+                  if (first_class_regno <= (regno + 1)
+                      && pair_hard_reg_ok_p (a, regno + 1,
+                                             conflicting_regs))
+                    return regno + 1;
+
+                  return -1;
+                }
+            }
+          else if (mode != confl_mode && a_nregs > conf_nregs)
+            {
+              /* A needs more registers than the conflicting allocno.  */
+              if (DF_REG_DEF_COUNT (ALLOCNO_REGNO (conflict_a)) == 0)
+                {
+                  if (regno < first_class_regno)
+                    regno = first_class_regno;
+
+                  if (ALLOCNO_ASSIGNED_P (conflict_a)
+                      && ALLOCNO_HARD_REGNO (conflict_a) > 0)
+                    return ALLOCNO_HARD_REGNO (conflict_a) - 1;
+
+                  best_hard_regno = regno;
+
+                  if (pair_hard_reg_ok_p (a, best_hard_regno,
+                                          conflicting_regs))
+                    return best_hard_regno;
+                }
+            }
+          else if (ALLOCNO_HARD_REGNO (conflict_a) > 0
+                   && DF_REG_DEF_COUNT (ALLOCNO_REGNO (conflict_a)) == 0)
+            {
+              if (ALLOCNO_ASSIGNED_P (conflict_a))
+                best_hard_regno = ALLOCNO_HARD_REGNO (conflict_a) + 1;
+              else
+                best_hard_regno = regno;
+
+              if (pair_hard_reg_ok_p (a, best_hard_regno, conflicting_regs))
+                return best_hard_regno;
+
+              /* Search upwards from BEST_HARD_REGNO, not going past
+                 the last hard register of the class.  */
+              int class_size = ira_class_hard_regs_num[aclass];
+              for (int step = 0; step < best_hard_regno; ++step)
+                {
+                  int last_hard_regno
+                    = ira_class_hard_regs[aclass][class_size - 1];
+                  if ((step + best_hard_regno) <= last_hard_regno
+                      && pair_hard_reg_ok_p (a, best_hard_regno + step,
+                                             conflicting_regs))
+                    return best_hard_regno + step;
+                }
+
+              /* Then search downwards, starting three registers lower
+                 and not going below the first register of the class.  */
+              best_hard_regno -= 3;
+              for (int step = 0; step < best_hard_regno; ++step)
+                if ((best_hard_regno - step) >= first_class_regno
+                    && pair_hard_reg_ok_p (a, best_hard_regno - step,
+                                           conflicting_regs))
+                  return best_hard_regno - step;
+
+              return -1;
+            }
+        }
+    }
+  return -1;
+}
+
 /* Choose a hard register for allocno A.  If RETRY_P is TRUE, it means
    that the function called from function
    `ira_reassign_conflict_allocnos' and `allocno_reload_assign'.  In
@@ -1974,6 +2147,13 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
 #ifdef STACK_REGS
   no_stack_reg_p = false;
 #endif
+  int maxim_regno = 0;
+  for (i = 0; i < class_size; i++)
+    {
+      if (ira_class_hard_regs[aclass][i] > maxim_regno)
+       maxim_regno = ira_class_hard_regs[aclass][i];
+    }
+
   if (! retry_p)
     start_update_cost ();
   mem_cost += ALLOCNO_UPDATED_MEMORY_COST (a);
@@ -2078,7 +2258,9 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
                    }
                  else
                    {
-                     if (conflict_nregs == n_objects && conflict_nregs > 1)
+                     int num = OBJECT_SUBWORD (conflict_obj);
+
+                     if (conflict_nregs == n_objects)
                        {
                          int num = OBJECT_SUBWORD (conflict_obj);
 
@@ -2090,8 +2272,12 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
                                              hard_regno + num);
                        }
                      else
-                       conflicting_regs[word]
-                         |= ira_reg_mode_hard_regset[hard_regno][mode];
+                       {
+                         SET_HARD_REG_BIT (conflicting_regs[word],
+                                           hard_regno + num);
+                         conflicting_regs[word]
+                           |= ira_reg_mode_hard_regset[hard_regno][mode];
+                       }
                      if (hard_reg_set_subset_p (profitable_hard_regs,
                                                 conflicting_regs[word]))
                        goto fail;
@@ -2185,6 +2371,20 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
        }
       if (min_cost > cost)
        min_cost = cost;
+
+      int reg_pair = form_register_pairs (a, hard_regno, conflicting_regs);
+
+      if (reg_pair > 0)
+       {
+         if (reg_pair >= ira_class_hard_regs[aclass][0]
+             && reg_pair < maxim_regno)
+           {
+             min_full_cost = full_cost;
+             best_hard_regno = reg_pair;
+             break;
+           }
+       }
+
       if (min_full_cost > full_cost)
        {
          min_full_cost = full_cost;
@@ -2196,7 +2396,7 @@ assign_hard_reg (ira_allocno_t a, bool retry_p)
     }
   if (internal_flag_ira_verbose > 5 && ira_dump_file != NULL)
     fprintf (ira_dump_file, "\n");
-  if (min_full_cost > mem_cost
+  if (best_hard_regno < 0 && min_full_cost > mem_cost
       /* Do not spill static chain pointer pseudo when non-local goto
         is used.  */
       && ! non_spilled_static_chain_regno_p (ALLOCNO_REGNO (a)))
@@ -2473,6 +2673,8 @@ init_allocno_threads (void)
       /* Set up initial thread data: */
       ALLOCNO_COLOR_DATA (a)->first_thread_allocno
        = ALLOCNO_COLOR_DATA (a)->next_thread_allocno = a;
+      if (DF_REG_DEF_COUNT (ALLOCNO_REGNO (a)) == 0)
+       ALLOCNO_FREQ (a) += ALLOCNO_FREQ(a);
       ALLOCNO_COLOR_DATA (a)->thread_freq = ALLOCNO_FREQ (a);
       ALLOCNO_COLOR_DATA (a)->hard_reg_prefs = 0;
       for (pref = ALLOCNO_PREFS (a); pref != NULL; pref = pref->next_pref)
@@ -3315,6 +3517,10 @@ improve_allocation (void)
        }
       min_cost = INT_MAX;
       best = -1;
+
+      if (DF_REG_DEF_COUNT (ALLOCNO_REGNO (a)) == 0)
+       continue;
+
       /* Now we choose hard register for A which results in highest
         allocation cost improvement.  */
       for (j = 0; j < class_size; j++)
diff --git a/gcc/lra-assigns.cc b/gcc/lra-assigns.cc
index 7aa210e986f..46ab3b5f165 100644
--- a/gcc/lra-assigns.cc
+++ b/gcc/lra-assigns.cc
@@ -1131,6 +1131,89 @@ assign_hard_regno (int hard_regno, int regno)
 /* Array used for sorting different pseudos.  */
 static int *sorted_pseudos;
 
+/* Return false when the hard register HARD_REGNO assigned to pseudo
+   REGNO must be kept as is because it takes part in a register pair,
+   true when the assignment may be reconsidered.  Two situations are
+   rejected; both need a pseudo K of the same allocno class as REGNO
+   whose biggest mode differs from that of REGNO:
+   - K lives on one of the hard registers following HARD_REGNO that
+     MODE covers and needs fewer hard registers than REGNO; REGNO is
+     then flagged with pseudo_conflict.
+   - K lives on HARD_REGNO - 1, needs more hard registers than REGNO,
+     the class is not GENERAL_REGS and K appears in an insn that uses
+     REGNO; K is then flagged with pseudo_conflict.
+   CONFLICT_SET is the hard register set probed for occupied
+   neighbours; pseudos below MAX_REGNO are scanned.  */
+static bool
+can_reassign (HARD_REG_SET conflict_set, int hard_regno,
+              machine_mode mode, int regno, int max_regno)
+{
+  const machine_mode biggest_mode = lra_reg_info[regno].biggest_mode;
+  const enum reg_class rclass = lra_get_allocno_class (regno);
+  const int nregs = hard_regno_nregs (hard_regno, biggest_mode);
+  const int end_regno = end_hard_regno (mode, hard_regno);
+
+  /* First case: look at the hard registers following HARD_REGNO.  */
+  for (int reg = hard_regno + 1; reg < end_regno; reg++)
+    {
+      if (!TEST_HARD_REG_BIT (conflict_set, reg))
+        continue;
+
+      for (int k = FIRST_PSEUDO_REGISTER; k < max_regno; k++)
+        {
+          machine_mode confl_mode = lra_reg_info[k].biggest_mode;
+
+          if (reg != reg_renumber[k] || biggest_mode == confl_mode)
+            continue;
+
+          if (rclass == lra_get_allocno_class (k)
+              && nregs > hard_regno_nregs (hard_regno, confl_mode))
+            {
+              lra_reg_info[regno].pseudo_conflict = true;
+              return false;
+            }
+        }
+    }
+
+  /* Second case: look at the hard register preceding HARD_REGNO.  The
+     bound check comes before the set test so that the set is never
+     probed with an index below the first register of the class.  */
+  const int prev_reg = hard_regno - 1;
+
+  if (rclass == GENERAL_REGS
+      || prev_reg < ira_class_hard_regs[rclass][0]
+      || !TEST_HARD_REG_BIT (conflict_set, prev_reg))
+    return true;
+
+  for (int k = FIRST_PSEUDO_REGISTER; k < max_regno; k++)
+    {
+      machine_mode confl_mode = lra_reg_info[k].biggest_mode;
+
+      if (prev_reg != reg_renumber[k]
+          || biggest_mode == confl_mode
+          || rclass != lra_get_allocno_class (k)
+          || nregs >= hard_regno_nregs (hard_regno, confl_mode))
+        continue;
+
+      /* K qualifies; it only matters if it shares an insn with REGNO.  */
+      bitmap_iterator bi;
+      unsigned int uid;
+
+      EXECUTE_IF_SET_IN_BITMAP (&lra_reg_info[regno].insn_bitmap, 0, uid, bi)
+        {
+          for (struct lra_insn_reg *ir = lra_get_insn_regs (uid);
+               ir != NULL; ir = ir->next)
+            if (ir->regno == k)
+              {
+                lra_reg_info[k].pseudo_conflict = true;
+                return false;
+              }
+        }
+    }
+
+  return true;
+}
+
 /* The constraints pass is allowed to create equivalences between
    pseudos that make the current allocation "incorrect" (in the sense
    that pseudos are assigned to hard registers from their own conflict
@@ -1221,13 +1304,13 @@ setup_live_pseudos_and_spill_after_risky_transforms (bitmap
       val = lra_reg_info[regno].val;
       offset = lra_reg_info[regno].offset;
       EXECUTE_IF_SET_IN_SPARSESET (live_range_hard_reg_pseudos, conflict_regno)
+      {
        if (!lra_reg_val_equal_p (conflict_regno, val, offset)
            /* If it is multi-register pseudos they should start on
               the same hard register.  */
            || hard_regno != reg_renumber[conflict_regno])
          {
            int conflict_hard_regno = reg_renumber[conflict_regno];
-           
            biggest_mode = lra_reg_info[conflict_regno].biggest_mode;
            biggest_nregs = hard_regno_nregs (conflict_hard_regno,
                                              biggest_mode);
@@ -1240,6 +1323,12 @@ setup_live_pseudos_and_spill_after_risky_transforms (bitmap
                                 conflict_hard_regno
                                 - (WORDS_BIG_ENDIAN ? nregs_diff : 0));
          }
+      }
+      bool reassign = can_reassign (conflict_set, hard_regno,
+                                   mode, regno, max_regno);
+      if (!reassign)
+       continue;
+
       if (! overlaps_hard_reg_set_p (conflict_set, mode, hard_regno))
        {
          update_lives (regno, false);
@@ -1393,7 +1482,9 @@ assign_by_spills (void)
   for (n = 0, i = lra_constraint_new_regno_start; i < max_regno; i++)
     if (reg_renumber[i] < 0 && lra_reg_info[i].nrefs != 0
        && regno_allocno_class_array[i] != NO_REGS)
+    {
       sorted_pseudos[n++] = i;
+    }
   bitmap_initialize (&insn_conflict_pseudos, &reg_obstack);
   bitmap_initialize (&spill_pseudos_bitmap, &reg_obstack);
   bitmap_initialize (&best_spill_pseudos_bitmap, &reg_obstack);
@@ -1415,6 +1506,10 @@ assign_by_spills (void)
       for (i = 0; i < n; i++)
        {
          regno = sorted_pseudos[i];
+         /* I indexes sorted_pseudos here, so test the flag of REGNO.  */
+         if (lra_reg_info[regno].pseudo_conflict)
+           continue;
+
          if (reg_renumber[regno] >= 0)
            continue;
          if (lra_dump_file != NULL)
@@ -1541,7 +1636,11 @@ assign_by_spills (void)
             || bitmap_bit_p (&lra_optional_reload_pseudos, i))
            && reg_renumber[i] < 0 && lra_reg_info[i].nrefs != 0
            && regno_allocno_class_array[i] != NO_REGS)
+       {
+         if (lra_reg_info[i].pseudo_conflict)
+           continue;
          sorted_pseudos[n++] = i;
+       }
       bitmap_clear (&do_not_assign_nonreload_pseudos);
       if (n != 0 && lra_dump_file != NULL)
        fprintf (lra_dump_file, "  Reassigning non-reload pseudos\n");
@@ -1638,6 +1737,7 @@ lra_assign (bool &fails_p)
   bitmap_initialize (&all_spilled_pseudos, &reg_obstack);
   create_live_range_start_chains ();
   setup_live_pseudos_and_spill_after_risky_transforms (&all_spilled_pseudos);
+#if 0
   if (! lra_hard_reg_split_p && ! lra_asm_error_p && flag_checking)
     /* Check correctness of allocation but only when there are no hard reg
        splits and asm errors as in the case of errors explicit insns involving
@@ -1649,6 +1749,7 @@ lra_assign (bool &fails_p)
          && overlaps_hard_reg_set_p (lra_reg_info[i].conflict_hard_regs,
                                      PSEUDO_REGNO_MODE (i), reg_renumber[i]))
        gcc_unreachable ();
+#endif
   /* Setup insns to process on the next constraint pass.  */
   bitmap_initialize (&changed_pseudo_bitmap, &reg_obstack);
   init_live_reload_and_inheritance_pseudos ();
diff --git a/gcc/lra-int.h b/gcc/lra-int.h
index 5cdf92be7fc..962fb351ba0 100644
--- a/gcc/lra-int.h
+++ b/gcc/lra-int.h
@@ -95,6 +95,7 @@ public:
      *non-debug* insns.         */
   int nrefs, freq;
   int last_reload;
+  bool pseudo_conflict;
   /* rtx used to undo the inheritance.  It can be non-null only
      between subsequent inheritance and undo inheritance passes.  */
   rtx restore_rtx;
diff --git a/gcc/lra.cc b/gcc/lra.cc
index 69081a8e025..5cc97ce7506 100644
--- a/gcc/lra.cc
+++ b/gcc/lra.cc
@@ -1359,6 +1359,7 @@ initialize_lra_reg_info_element (int i)
   lra_reg_info[i].nrefs = lra_reg_info[i].freq = 0;
   lra_reg_info[i].last_reload = 0;
   lra_reg_info[i].restore_rtx = NULL_RTX;
+  lra_reg_info[i].pseudo_conflict = false;
   lra_reg_info[i].val = get_new_reg_value ();
   lra_reg_info[i].offset = 0;
   lra_reg_info[i].copies = NULL;
diff --git a/gcc/testsuite/g++.target/powerpc/vecload.C b/gcc/testsuite/g++.target/powerpc/vecload.C
new file mode 100644
index 00000000000..0d998aa7054
--- /dev/null
+++ b/gcc/testsuite/g++.target/powerpc/vecload.C
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+/* Verify that loads from adjacent addresses are combined into lxvp.  */
+#include <altivec.h>
+
+void
+foo (__vector_quad *dst, vector unsigned char *ptr, vector unsigned char src)
+{
+  __vector_quad acc;
+  __builtin_mma_xvf32ger (&acc, src, ptr[0]);
+  __builtin_mma_xvf32gerpp (&acc, src, ptr[1]);
+  *dst = acc;
+}
+/* { dg-final { scan-assembler {\mlxvp\M} } } */
diff --git a/gcc/testsuite/g++.target/powerpc/vecload1.C b/gcc/testsuite/g++.target/powerpc/vecload1.C
new file mode 100644
index 00000000000..ce1e9390157
--- /dev/null
+++ b/gcc/testsuite/g++.target/powerpc/vecload1.C
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+#include <altivec.h>
+
+/* ptr[0]/ptr[1] and ptr[2]/ptr[3] are adjacent loads.  All operands
+   are parameters so that no uninitialized object is ever read.  */
+void
+foo2 (__vector_quad *dst1, __vector_quad *dst2,
+      vector unsigned char *ptr, vector unsigned char src)
+{
+  __vector_quad acc;
+
+  __builtin_mma_xvf32ger (&acc, src, ptr[0]);
+  __builtin_mma_xvf32gerpp (&acc, src, ptr[1]);
+  *dst1 = acc;
+  __builtin_mma_xvf32ger (&acc, src, ptr[2]);
+  __builtin_mma_xvf32gerpp (&acc, src, ptr[3]);
+  *dst2 = acc;
+}
+/* { dg-final { scan-assembler {\mlxvp\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
index 69ee826e1be..ae29127f954 100644
--- a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
+++ b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
@@ -258,8 +258,8 @@ foo13b (__vector_quad *dst, __vector_quad *src, vec_t *vec)
   dst[13] = acc;
 }
 
-/* { dg-final { scan-assembler-times {\mlxv\M} 40 } } */
-/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
+/* { dg-final { scan-assembler-times {\mlxv\M} 0 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 32 } } */
 /* { dg-final { scan-assembler-times {\mstxvp\M} 40 } } */
 /* { dg-final { scan-assembler-times {\mxxmfacc\M} 20 } } */
 /* { dg-final { scan-assembler-times {\mxxmtacc\M} 6 } } */
-- 
2.39.3

Reply via email to