This is the final set of patches that I have available right now.  We will be
doing additional patches over the summer.

The primary thing in this patch is to add support for load fusion in the
power8.  Power8 has two types of fusion:

        li <a>,<const>
        lxvd2x <va>,<a>,<c>

and:

        addis <a>,<b>,<const-hi>
        ld <a>,<const-lo>(<a>)

These instructions must be adjacent to each other, and in the case of fusion in
loading GPRs, the register being loaded must also be the base register used by
the load.  In this patch, I added peepholes to cover this case.  In the future, I
plan on reworking the problem by being more liberal in what addresses are
allowed before reload/lra, and in lra, generate these forms.  However, these
peepholes do help find fusion cases.

I also added two switches (-mlra and -mconstrain-regs) that were used in
converting the powerpc port to use the LRA register allocator.  Note, at the
present time, Vlad and I are going back and forth on additional things needed
for LRA.

This patch bootstraps and has no regressions in the test suite.  Is it ok to
check in after the previous 7 patches have been applied?

FWIW, patches 1-2 that were approved have now been checked in.

2013-05-22  Michael Meissner  <meiss...@linux.vnet.ibm.com>

        * config/rs6000/predicates.md (fusion_gpr_addis): New predicates
        to support power8 load fusion.
        (fusion_gpr_mem_load): Likewise.

        * config/rs6000/rs6000-modes.def (PTImode): Update a comment.

        * config/rs6000/rs6000-protos.h (fusion_gpr_load_p): New
        declarations for power8 load fusion.
        (emit_fusion_gpr_load): Likewise.

        * config/rs6000/rs6000.opt (-mlra): New undocumented switch to
        turn on using the LRA register allocator.
        (-mconstrain-regs): New undocumented switch to constrain
        non-integer values from being loaded into the LR or CTR registers.

        * config/rs6000/rs6000.c (TARGET_LRA_P): If -mlra, turn on using
        the LRA register allocator.
        (rs6000_lra_p): Likewise.
        (rs6000_hard_regno_mode_ok): Allow DI/DD/SF/SD modes in altivec
        registers if power8.  If -mconstrain-regs, only allow int modes
        into LR, CTR, and special purpose registers.
        (rs6000_debug_reg_global): Print -mlra, -mconstrain-regs status if
        debugging.
        (rs6000_init_hard_regno_mode_ok): Mark that SFmode can use Altivec
        registers in the future.
        (rs6000_option_override_internal): If tuning for power8, turn on
        fusion mode by default.  Turn on sign extending fusion mode if
        normal fusion mode is on, and we are optimizing for speed at -O3.
        (rs6000_opt_masks): Add -mlra, -mconstrain-regs.
        (fusion_gpr_load_p): New function, return true if we can fuse an
        addis instruction with a dependent load to a GPR.
        (emit_fusion_gpr_load): Emit the instructions for power8 load
        fusion to GPRs.

        * config/rs6000/vsx.md (VSX load fusion peepholes): Add peepholes
        to fuse together an addi instruction with a VSX load instruction.

        * config/rs6000/rs6000.md (GPR load fusion peepholes): Add
        peepholes to fuse an addis instruction with a load to a GPR base
        register, if the addis instruction is dead after the load, by
        using the register to be loaded for the addis.  If we are
        supporting sign extending fusions, convert sign extending loads to
        zero extending loads and an explicit sign extension.

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460, USA
email: meiss...@linux.vnet.ibm.com, phone: +1 (978) 899-4797
Index: gcc/config/rs6000/predicates.md
===================================================================
--- gcc/config/rs6000/predicates.md     (revision 199168)
+++ gcc/config/rs6000/predicates.md     (working copy)
@@ -1654,3 +1654,99 @@ (define_predicate "small_toc_ref"
 
   return GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_TOCREL;
 })
+
+;; Match the first insn (addis) in fusing the combination of addis and loads to
+;; GPR registers on power8.  Power8 currently will only do the fusion if the
+;; top 11 bits of the addis value are all 1's or 0's.
+(define_predicate "fusion_gpr_addis"
+  (match_code "const_int,high,plus")
+{
+  HOST_WIDE_INT value;
+  rtx int_const;
+
+  /* 32-bit is not done yet.  */
+  if (TARGET_ELF && !TARGET_POWERPC64)
+    return 0;
+
+  if (GET_CODE (op) == HIGH)
+    return 1;
+
+  if (CONST_INT_P (op))
+    int_const = op;
+
+  else if (GET_CODE (op) == PLUS
+          && base_reg_operand (XEXP (op, 0), Pmode)
+          && CONST_INT_P (XEXP (op, 1)))
+    int_const = XEXP (op, 1);
+
+  else
+    return 0;
+
+  value = INTVAL (int_const);
+  if ((value & (HOST_WIDE_INT)0xffff) != 0)
+    return 0;
+
+  if ((value & (HOST_WIDE_INT)0xffff0000) == 0)
+    return 0;
+
+  return (IN_RANGE (value >> 16, -32, 31));
+})
+
+;; Match the second insn (lbz, lhz, lwz, ld) in fusing the combination of addis
+;; and loads to GPR registers on power8.
+(define_predicate "fusion_gpr_mem_load"
+  (match_code "mem")
+{
+  rtx addr;
+
+  if (!MEM_P (op))
+    return 0;
+
+  switch (mode)
+    {
+    case QImode:
+    case HImode:
+    case SImode:
+      break;
+
+    case DImode:
+      if (!TARGET_POWERPC64)
+       return 0;
+      break;
+
+    default:
+      return 0;
+    }
+
+  addr = XEXP (op, 0);
+  if (GET_CODE (addr) == PLUS)
+    {
+      rtx base = XEXP (addr, 0);
+      rtx offset = XEXP (addr, 1);
+
+      return (base_reg_operand (base, GET_MODE (base))
+             && satisfies_constraint_I (offset));
+    }
+
+  else if (GET_CODE (addr) == LO_SUM)
+    {
+      rtx base = XEXP (addr, 0);
+      rtx offset = XEXP (addr, 1);
+
+      /* 32-bit is not done yet.  */
+      if (TARGET_ELF && !TARGET_POWERPC64)
+      return 0;
+
+      if (!base_reg_operand (base, GET_MODE (base)))
+       return 0;
+
+      else if (TARGET_XCOFF || (TARGET_ELF && TARGET_POWERPC64))
+       return small_toc_ref (offset, GET_MODE (offset));
+/*
+      else if (TARGET_ELF && !TARGET_POWERPC64)
+       return CONSTANT_P (offset);
+*/
+    }
+
+  return 0;
+})
Index: gcc/config/rs6000/rs6000-modes.def
===================================================================
--- gcc/config/rs6000/rs6000-modes.def  (revision 199037)
+++ gcc/config/rs6000/rs6000-modes.def  (working copy)
@@ -42,5 +42,7 @@ VECTOR_MODES (FLOAT, 8);      /*        
 VECTOR_MODES (FLOAT, 16);     /*       V8HF  V4SF V2DF */
 VECTOR_MODES (FLOAT, 32);     /*       V16HF V8SF V4DF */
 
-/* Replacement for TImode that only is allowed in GPRs.  */
+/* Replacement for TImode that only is allowed in GPRs.  We also use PTImode
+   for quad memory atomic operations to force getting an even/odd register
+   combination.  */
 PARTIAL_INT_MODE (TI);
Index: gcc/config/rs6000/rs6000-protos.h
===================================================================
--- gcc/config/rs6000/rs6000-protos.h   (revision 199200)
+++ gcc/config/rs6000/rs6000-protos.h   (working copy)
@@ -73,6 +73,8 @@ extern int mems_ok_for_quad_peep (rtx, r
 extern bool gpr_or_gpr_p (rtx, rtx);
 extern bool direct_move_p (rtx, rtx);
 extern bool quad_load_store_p (rtx, rtx);
+extern bool fusion_gpr_load_p (rtx, rtx, rtx, rtx, rtx);
+extern const char *emit_fusion_gpr_load (rtx, rtx, rtx, rtx);
 extern enum reg_class (*rs6000_preferred_reload_class_ptr) (rtx,
                                                            enum reg_class);
 extern enum reg_class (*rs6000_secondary_reload_class_ptr) (enum reg_class,
Index: gcc/config/rs6000/rs6000.opt
===================================================================
--- gcc/config/rs6000/rs6000.opt        (revision 199122)
+++ gcc/config/rs6000/rs6000.opt        (working copy)
@@ -542,3 +542,11 @@ Use ISA 2.07 direct move between GPR & V
 mquad-memory
 Target Report Mask(QUAD_MEMORY) Var(rs6000_isa_flags)
 Generate the quad word memory instructions (lq/stq/lqarx/stqcx).
+
+mlra
+Target Report Mask(LRA) Var(rs6000_isa_flags)
+Enable the use of the LRA (local register allocator).
+
+mconstrain-regs
+Target Undocumented Mask(CONSTRAIN_REGS) Var(rs6000_isa_flags)
+; Only allow ints of certain modes to go in SPRs
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c  (revision 199210)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -1044,6 +1044,7 @@ static bool rs6000_debug_cannot_change_m
                                                   enum machine_mode,
                                                   enum reg_class);
 static bool rs6000_save_toc_in_prologue_p (void);
+static bool rs6000_lra_p (void);
 
 rtx (*rs6000_legitimize_reload_address_ptr) (rtx, enum machine_mode, int, int,
                                             int, int *)
@@ -1519,6 +1520,9 @@ static const struct attribute_spec rs600
 
 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK rs6000_vectorize_vec_perm_const_ok
+
+#undef TARGET_LRA_P
+#define TARGET_LRA_P rs6000_lra_p
 
 
 /* Processor table.  */
@@ -1631,6 +1635,18 @@ rs6000_hard_regno_mode_ok (int regno, en
   if (mode == TImode && TARGET_VSX_TIMODE && VSX_REGNO_P (regno))
     return 1;
 
+  /* Allow 64-bit and the 32-bit floating point types in Altivec registers
+     under Power8.  In theory, we would allow 32-bit integers as well.  We
+     allow SDmode, even though no decimal operation works on the Altivec
+     registers, but it is ok for moves.  */
+  if (TARGET_VSX && VSX_REGNO_P (regno) && TARGET_P8_VECTOR
+      && VECTOR_MEM_VSX_P (DFmode)
+      && (mode == DImode
+         || mode == DDmode
+         || mode == SFmode
+         || mode == SDmode))
+    return 1;
+
   /* The GPRs can hold any mode, but values bigger than one register
      cannot go past R31.  */
   if (INT_REGNO_P (regno))
@@ -1671,6 +1687,18 @@ rs6000_hard_regno_mode_ok (int regno, en
   if (SPE_SIMD_REGNO_P (regno) && TARGET_SPE && SPE_VECTOR_MODE (mode))
     return 1;
 
+  /* See if we need to be stricter about what goes into the special
+     registers (LR, CTR, VRSAVE, VSCR).  */
+  if (TARGET_CONSTRAIN_REGS)
+    {
+      if (regno == LR_REGNO || regno == CTR_REGNO)
+       return (GET_MODE_CLASS (mode) == MODE_INT
+               && rs6000_hard_regno_nregs[mode][regno] == 1);
+
+      if (regno == VRSAVE_REGNO || regno == VSCR_REGNO)
+       return (mode == SImode);
+    }
+
   /* We cannot put non-VSX TImode or PTImode anywhere except general register
      and it must be able to fit within the register set.  */
 
@@ -2138,6 +2166,9 @@ rs6000_debug_reg_global (void)
     fprintf (stderr, DEBUG_FMT_S, "p8 fusion",
             (TARGET_P8_FUSION_SIGN) ? "zero+sign" : "zero");
 
+  if (TARGET_CONSTRAIN_REGS)
+    fprintf (stderr, DEBUG_FMT_S, "constrain-regs", "true");
+
   fprintf (stderr, DEBUG_FMT_S, "plt-format",
           TARGET_SECURE_PLT ? "secure" : "bss");
   fprintf (stderr, DEBUG_FMT_S, "struct-return",
@@ -2321,6 +2352,15 @@ rs6000_init_hard_regno_mode_ok (bool glo
       rs6000_vector_align[TImode] = align64;
     }
 
+  /* SFmode, see if we want to use the VSX unit.  */
+  if (TARGET_P8_VECTOR)
+    {
+      rs6000_vector_unit[SFmode] = VECTOR_P8_VECTOR;
+      rs6000_vector_mem[SFmode]
+       = (TARGET_VSX_SCALAR_MEMORY ? VECTOR_P8_VECTOR : VECTOR_NONE);
+      rs6000_vector_align[SFmode] = align32;
+    }
+
   /* TODO add SPE and paired floating point vector support.  */
 
   /* Register class constraints for the constraints that depend on compile
@@ -3067,6 +3107,21 @@ rs6000_option_override_internal (bool gl
       rs6000_isa_flags &= ~OPTION_MASK_VSX_TIMODE;
     }
 
+  /* Enable power8 fusion if we are tuning for power8, even if we aren't
+     generating power8 instructions.  */
+  if (!(rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION))
+    rs6000_isa_flags |= (processor_target_table[tune_index].target_enable
+                        & OPTION_MASK_P8_FUSION);
+
+  /* Power8 does not fuse sign extended loads with the addis.  If we are
+     optimizing at high levels for speed, convert a sign extended load into a
+     zero extending load, and an explicit sign extension.  */
+  if (TARGET_P8_FUSION
+      && !(rs6000_isa_flags_explicit & OPTION_MASK_P8_FUSION_SIGN)
+      && optimize_function_for_speed_p (cfun)
+      && optimize >= 3)
+    rs6000_isa_flags |= OPTION_MASK_P8_FUSION_SIGN;
+
   if (TARGET_DEBUG_REG || TARGET_DEBUG_TARGET)
     rs6000_print_isa_options (stderr, 0, "after defaults", rs6000_isa_flags);
 
@@ -28674,12 +28729,14 @@ static struct rs6000_opt_mask const rs60
 {
   { "altivec",                 OPTION_MASK_ALTIVEC,            false, true  },
   { "cmpb",                    OPTION_MASK_CMPB,               false, true  },
+  { "constrain-regs",          OPTION_MASK_CONSTRAIN_REGS,     false, false },
   { "crypto",                  OPTION_MASK_CRYPTO,             false, true  },
   { "direct-move",             OPTION_MASK_DIRECT_MOVE,        false, true  },
   { "dlmzb",                   OPTION_MASK_DLMZB,              false, true  },
   { "fprnd",                   OPTION_MASK_FPRND,              false, true  },
   { "hard-dfp",                        OPTION_MASK_DFP,                false, true  },
   { "isel",                    OPTION_MASK_ISEL,               false, true  },
+  { "lra",                     OPTION_MASK_LRA,                false, false },
   { "mfcrf",                   OPTION_MASK_MFCRF,              false, true  },
   { "mfpgpr",                  OPTION_MASK_MFPGPR,             false, true  },
   { "mulhw",                   OPTION_MASK_MULHW,              false, true  },
@@ -29683,6 +29740,254 @@ rs6000_set_up_by_prologue (struct hard_r
     add_to_hard_reg_set (&set->set, Pmode, RS6000_PIC_OFFSET_TABLE_REGNUM);
 }
 
+
+/* Enable/disable the LRA (local register allocator).  */
+
+static bool
+rs6000_lra_p (void)
+{
+  return TARGET_LRA;
+}
+
+
+/* Return true if the peephole2 can combine a load involving a combination of
+   an addis instruction and a load with an offset that can be fused together on
+   a power8.  */
+
+bool
+fusion_gpr_load_p (rtx addis_reg,      /* reg. to hold high value.  */
+                  rtx addis_value,     /* high value loaded.  */
+                  rtx target,          /* reg. that is loaded.  */
+                  rtx mem,             /* memory to load.  */
+                  rtx insn)            /* insn for looking up reg notes or
+                                          NULL_RTX if this is a peephole2.  */
+{
+  rtx addr;
+  rtx base_reg;
+
+  /* Validate arguments.  */
+  if (!base_reg_operand (addis_reg, GET_MODE (addis_reg)))
+    return false;
+
+  if (!base_reg_operand (target, GET_MODE (target)))
+    return false;
+
+  if (!fusion_gpr_addis (addis_value, GET_MODE (addis_value)))
+    return false;
+
+  if (!fusion_gpr_mem_load (mem, GET_MODE (mem)))
+    return false;
+
+  /* Validate that the register used to load the high value is either the
+     register being loaded, or we can safely replace its use in a peephole.
+
+     If this is a peephole2, we assume that there are 2 instructions in the
+     peephole (addis and load), so we want to check if the target register was
+     not used and the register to hold the addis result is dead after the
+     peephole.  */
+  if (REGNO (addis_reg) != REGNO (target))
+    {
+      if (reg_mentioned_p (target, mem))
+       return false;
+
+      if (insn)
+       {
+         if (!find_reg_note (insn, REG_DEAD, addis_reg))
+           return false;
+       }
+      else
+       {
+         if (!peep2_reg_dead_p (2, addis_reg))
+           return false;
+       }
+    }
+
+  /* Validate that the value being loaded in the addis is used in the load.  */
+  addr = XEXP (mem, 0);                        /* either PLUS or LO_SUM.  */
+  if (GET_CODE (addr) != PLUS && GET_CODE (addr) != LO_SUM)
+    return false;
+
+  base_reg = XEXP (addr, 0);
+  return REGNO (addis_reg) == REGNO (base_reg);
+}
+
+/* Return a string to fuse an addis instruction with a gpr load to the same
+   register that we loaded up the addis instruction.  The code is complicated,
+   so we call output_asm_insn directly, and just return "".  */
+
+const char *
+emit_fusion_gpr_load (rtx addis_reg, rtx addis_value, rtx target, rtx mem)
+{
+  rtx fuse_ops[10];
+  rtx addr;
+  rtx load_offset;
+  const char *addis_str = NULL;
+  const char *load_str = NULL;
+  const char *mode_name = NULL;
+  char insn_template[80];
+  enum machine_mode mode = GET_MODE (mem);
+  const char *comment_str = ASM_COMMENT_START;
+
+  if (*comment_str == ' ')
+    comment_str++;
+
+  if (!MEM_P (mem))
+    gcc_unreachable ();
+
+  addr = XEXP (mem, 0);
+  if (GET_CODE (addr) != PLUS && GET_CODE (addr) != LO_SUM)
+    gcc_unreachable ();
+
+  load_offset = XEXP (addr, 1);
+
+  /* Now emit the load instruction to the same register.  */
+  switch (mode)
+    {
+    case QImode:
+      mode_name = "char";
+      load_str = "lbz";
+      break;
+
+    case HImode:
+      mode_name = "short";
+      load_str = "lhz";
+      break;
+
+    case SImode:
+      mode_name = "int";
+      load_str = "lwz";
+      break;
+
+    case DImode:
+      if (TARGET_POWERPC64)
+       {
+         mode_name = "long";
+         load_str = "ld";
+       }
+      break;
+
+    default:
+      break;
+    }
+
+  if (!load_str)
+    gcc_unreachable ();
+
+  /* Emit the addis instruction.  */
+  fuse_ops[0] = target;
+  fuse_ops[1] = addis_reg;
+  if (satisfies_constraint_L (addis_value))
+    {
+      fuse_ops[2] = addis_value;
+      addis_str = "lis %0,%v2";
+    }
+
+  else if (GET_CODE (addis_value) == PLUS)
+    {
+      rtx op0 = XEXP (addis_value, 0);
+      rtx op1 = XEXP (addis_value, 1);
+
+      if (REG_P (op0) && CONST_INT_P (op1)
+         && satisfies_constraint_L (op1))
+       {
+         fuse_ops[2] = op0;
+         fuse_ops[3] = op1;
+         addis_str = "addis %0,%2,%v3";
+       }
+    }
+
+  else if (GET_CODE (addis_value) == HIGH)
+    {
+      rtx value = XEXP (addis_value, 0);
+      if (GET_CODE (value) == UNSPEC && XINT (value, 1) == UNSPEC_TOCREL)
+       {
+         fuse_ops[2] = XVECEXP (value, 0, 0);          /* symbol ref.  */
+         fuse_ops[3] = XVECEXP (value, 0, 1);          /* TOC register.  */
+         if (TARGET_ELF)
+           addis_str = "addis %0,%3,%2@toc@ha";
+
+         else if (TARGET_XCOFF)
+           addis_str = "addis %0,%2@u(%3)";
+       }
+
+      else if (GET_CODE (value) == PLUS)
+       {
+         rtx op0 = XEXP (value, 0);
+         rtx op1 = XEXP (value, 1);
+
+         if (GET_CODE (op0) == UNSPEC
+             && XINT (op0, 1) == UNSPEC_TOCREL
+             && CONST_INT_P (op1))
+           {
+             fuse_ops[2] = XVECEXP (op0, 0, 0);        /* symbol ref.  */
+             fuse_ops[3] = XVECEXP (op0, 0, 1);        /* TOC register.  */
+             fuse_ops[4] = op1;
+             if (TARGET_ELF)
+               addis_str = "addis %0,%3,%2+%4@toc@ha";
+
+             else if (TARGET_XCOFF)
+               addis_str = "addis %0,%2+%4@u(%3)";
+           }
+       }
+    }
+
+  if (!addis_str)
+    gcc_unreachable ();
+
+  sprintf (insn_template, "%s\t\t%s gpr load fusion, type %s, addis reg %%1",
+          addis_str, comment_str, mode_name);
+  output_asm_insn (insn_template, fuse_ops);
+
+  if (CONST_INT_P (load_offset) && satisfies_constraint_I (load_offset))
+    {
+      sprintf (insn_template, "%s %%0,%%1(%%0)", load_str);
+      fuse_ops[1] = load_offset;
+      output_asm_insn (insn_template, fuse_ops);
+    }
+
+  else if (GET_CODE (load_offset) == UNSPEC
+          && XINT (load_offset, 1) == UNSPEC_TOCREL)
+    {
+      if (TARGET_ELF)
+       sprintf (insn_template, "%s %%0,%%1@toc@l(%%0)", load_str);
+
+      else if (TARGET_XCOFF)
+       sprintf (insn_template, "%s %%0,%%1@l(%%0)", load_str);
+
+      else
+       gcc_unreachable ();
+
+      fuse_ops[1] = XVECEXP (load_offset, 0, 0);
+      output_asm_insn (insn_template, fuse_ops);
+    }
+
+  else if (GET_CODE (load_offset) == PLUS
+          && GET_CODE (XEXP (load_offset, 0)) == UNSPEC
+          && XINT (XEXP (load_offset, 0), 1) == UNSPEC_TOCREL
+          && CONST_INT_P (XEXP (load_offset, 1)))
+    {
+      rtx tocrel_unspec = XEXP (load_offset, 0);
+      if (TARGET_ELF)
+       sprintf (insn_template, "%s %%0,%%1+%%2@toc@l(%%0)", load_str);
+
+      else if (TARGET_XCOFF)
+       sprintf (insn_template, "%s %%0,%%1+%%2@l(%%0)", load_str);
+
+      else
+       gcc_unreachable ();
+
+      fuse_ops[1] = XVECEXP (tocrel_unspec, 0, 0);
+      fuse_ops[2] = XEXP (load_offset, 1);
+      output_asm_insn (insn_template, fuse_ops);
+    }
+
+  else
+    gcc_unreachable ();
+
+  return "";
+}
+
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-rs6000.h"
Index: gcc/config/rs6000/vsx.md
===================================================================
--- gcc/config/rs6000/vsx.md    (revision 199200)
+++ gcc/config/rs6000/vsx.md    (working copy)
@@ -1847,3 +1847,28 @@ (define_insn_and_split "*vsx_reduc_<VEC_
 }"
   [(set_attr "length" "20")
    (set_attr "type" "veccomplex")])
+
+
+;; Power8 Vector fusion
+;; Note, the fused ops must be adjacent, so don't split these ops
+(define_peephole
+  [(set (match_operand:P 0 "base_reg_operand" "")
+       (match_operand:P 1 "short_cint_operand" ""))
+   (set (match_operand:VSX_M2 2 "vsx_register_operand" "")
+       (mem:VSX_M2 (plus:P (match_dup 0)
+                           (match_operand:P 3 "int_reg_operand" ""))))]
+  "TARGET_P8_FUSION"
+  "li %0,%1\t\t# vector load fusion\;lx<VSX_M2:VSm>x %x2,%0,%3"  
+  [(set_attr "length" "8")
+   (set_attr "type" "vecload")])
+
+(define_peephole
+  [(set (match_operand:P 0 "base_reg_operand" "")
+       (match_operand:P 1 "short_cint_operand" ""))
+   (set (match_operand:VSX_M2 2 "vsx_register_operand" "")
+       (mem:VSX_M2 (plus:P (match_operand:P 3 "int_reg_operand" "")
+                           (match_dup 0))))]
+  "TARGET_P8_FUSION"
+  "li %0,%1\t\t# vector load fusion\;lx<VSX_M2:VSm>x %x2,%0,%3"  
+  [(set_attr "length" "8")
+   (set_attr "type" "vecload")])
Index: gcc/config/rs6000/rs6000.md
===================================================================
--- gcc/config/rs6000/rs6000.md (revision 199210)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -15237,6 +15237,117 @@ (define_insn "rs6000_mftb_<mode>"
 })
 
 
+;; Power8 fusion support for fusing an addis instruction with a D-form load of
+;; a GPR.  The addis instruction must be adjacent to the load, and use the same
+;; register that is being loaded.
+
+;; Note, the fused ops must be adjacent, so don't split these ops.  Originally
+;; these were written as define_peephole2's, but moved to define_peephole's so
+;; that it doesn't conflict with cse after reload in some cases.
+
+;; GPR fusion for single word integer types
+
+(define_peephole
+  [(set (match_operand:P 0 "base_reg_operand" "")
+       (match_operand:P 1 "fusion_gpr_addis" ""))
+   (set (match_operand:INT1 2 "base_reg_operand" "")
+       (match_operand:INT1 3 "fusion_gpr_mem_load" ""))]
+  "TARGET_P8_FUSION
+   && fusion_gpr_load_p (operands[0], operands[1], operands[2], operands[3],
+                        insn)"
+{
+  return emit_fusion_gpr_load (operands[0], operands[1], operands[2],
+                              operands[3]);
+}
+  [(set_attr "type" "load")
+   (set_attr "length" "8")])
+
+(define_peephole
+  [(set (match_operand:DI 0 "base_reg_operand" "")
+       (match_operand:DI 1 "fusion_gpr_addis" ""))
+   (set (match_operand:DI 2 "base_reg_operand" "")
+       (zero_extend:DI (match_operand:QHSI 3 "fusion_gpr_mem_load" "")))]
+  "TARGET_P8_FUSION && TARGET_POWERPC64
+   && fusion_gpr_load_p (operands[0], operands[1], operands[2], operands[3],
+                        insn)"
+{
+  return emit_fusion_gpr_load (operands[0], operands[1], operands[2],
+                              operands[3]);
+}
+  [(set_attr "type" "load")
+   (set_attr "length" "8")])
+
+;; Power8 does not fuse a sign extending load, so convert the sign extending
+;; load into a zero extending load, and do an explicit sign extension.  Don't
+;; do this if we are trying to optimize for space.  Do this as a peephole2 to
+;; allow final rtl optimizations and scheduling to move the sign extend.
+(define_peephole2
+  [(set (match_operand:DI 0 "base_reg_operand" "")
+       (match_operand:DI 1 "fusion_gpr_addis" ""))
+   (set (match_operand:DI 2 "base_reg_operand" "")
+       (sign_extend:DI (match_operand:HSI 3 "fusion_gpr_mem_load" "")))]
+  "TARGET_P8_FUSION && TARGET_P8_FUSION_SIGN && TARGET_POWERPC64
+   && fusion_gpr_load_p (operands[0], operands[1], operands[2], operands[3],
+                        NULL_RTX)"
+  [(set (match_dup 0) (match_dup 1))
+   (set (match_dup 4) (match_dup 3))
+   (set (match_dup 2) (sign_extend:DI (match_dup 4)))]
+{
+  unsigned int offset
+    = (BYTES_BIG_ENDIAN ? 8 - GET_MODE_SIZE (<MODE>mode) : 0);
+
+  operands[4] = simplify_subreg (<MODE>mode, operands[2], DImode,
+                                offset);
+})
+
+(define_peephole
+  [(set (match_operand:P 0 "base_reg_operand" "")
+       (match_operand:P 1 "fusion_gpr_addis" ""))
+   (set (match_operand:SI 2 "base_reg_operand" "")
+       (zero_extend:SI (match_operand:QHI 3 "fusion_gpr_mem_load" "")))]
+  "TARGET_P8_FUSION
+   && fusion_gpr_load_p (operands[0], operands[1], operands[2], operands[3],
+                        insn)"
+{
+  return emit_fusion_gpr_load (operands[0], operands[1], operands[2],
+                              operands[3]);
+}
+  [(set_attr "type" "load")
+   (set_attr "length" "8")])
+
+(define_peephole2
+  [(set (match_operand:P 0 "base_reg_operand" "")
+       (match_operand:P 1 "fusion_gpr_addis" ""))
+   (set (match_operand:SI 2 "base_reg_operand" "")
+       (sign_extend:SI (match_operand:HI 3 "fusion_gpr_mem_load" "")))]
+  "TARGET_P8_FUSION && TARGET_P8_FUSION_SIGN
+   && fusion_gpr_load_p (operands[0], operands[1], operands[2], operands[3],
+                        NULL_RTX)"
+  [(set (match_dup 0) (match_dup 1))
+   (set (match_dup 4) (match_dup 3))
+   (set (match_dup 2) (sign_extend:SI (match_dup 4)))]
+{
+  unsigned int offset = (BYTES_BIG_ENDIAN ? 2 : 0);
+
+  operands[4] = simplify_subreg (HImode, operands[2], SImode, offset);
+})
+
+(define_peephole
+  [(set (match_operand:P 0 "base_reg_operand" "")
+       (match_operand:P 1 "fusion_gpr_addis" ""))
+   (set (match_operand:HI 2 "base_reg_operand" "")
+       (zero_extend:HI (match_operand:QI 3 "fusion_gpr_mem_load" "")))]
+  "TARGET_P8_FUSION
+   && fusion_gpr_load_p (operands[0], operands[1], operands[2], operands[3],
+                        insn)"
+{
+  return emit_fusion_gpr_load (operands[0], operands[1], operands[2],
+                              operands[3]);
+}
+  [(set_attr "type" "load")
+   (set_attr "length" "8")])
+
+
 
 (include "sync.md")
 (include "vector.md")

Reply via email to