On 20 May 23:27, Vladimir Makarov wrote:
> 
> 
> On 20/05/15 04:17 AM, Ilya Enkovich wrote:
> >On 19 May 11:22, Vladimir Makarov wrote:
> >>On 05/18/2015 08:13 AM, Ilya Enkovich wrote:
> >>>2015-05-06 17:18 GMT+03:00 Ilya Enkovich <enkovich....@gmail.com>:
> >>>Hi Vladimir,
> >>>
> >>>Could you please comment on this?
> >>>
> >>>
> >>Ilya, I think that the idea is worth trying, but the results might be
> >>mixed.  It is hard to say until you actually try it (as an example, Jan
> >>implemented -fpmath=both and it looked like a pretty good idea, at least
> >>to me, but when I checked SPEC2000 the results were not so good even
> >>with IRA/LRA).
> >>
> >>Long ago I did some experiments and found that spilling into SSE
> >>would be beneficial for Intel CPUs but not for AMD ones.  As I remember,
> >>I also found that storing several scalar values into one SSE reg and
> >>extracting them when you need to do some (fp) arithmetic would be
> >>beneficial for AMD but not for Intel CPUs.   In the literature the more
> >>general approach is called a bitwise register allocator.  Actually it
> >>would be a pretty big IRA/LRA project from which some targets might
> >>benefit.
> >I suspect such things are not trivially done in IRA/LRA and want to make it 
> >as an independent optimization because its application seems to be quite 
> >narrow.
> Yes, that is true.  The complications and implementation complexity
> will be probably very high in this project and the positive results
> are not sure.  So the project might have a small value.
> >>
> >>As for the wrong code, it is hard for me to say anything w/o RA
> >>dumps.  If you send me the dump (-fira-verbose=16), i might say more
> >>what is going on.
> >>
> >>
> >Here are some dumps from my reproducer.  The problematic register is r108.
> >
> Thanks.  To me it looks like an inheritance bug.  It is really hard
> to fix the bug w/o the source code.  Could you send me your patch so
> that I can debug RA with it and investigate further?
> 

Sure! Here is a patch and a testcase.  I applied patch to r222125.  Cmd to 
reproduce:

gcc -m32 -msse4.2 -O2 pr65105.c -S -march=slm -fPIE

Thanks,
Ilya
/* Sink for the loop value.  Only declared here; no definition is
   visible in this testcase.  */
void
counter (long long l);

/* Reproducer body: combine three 64-bit array elements with OR/AND,
   then keep passing the value to counter () and masking it with
   successive array elements until it becomes zero.  */
void
test (long long *arr)
{
  register unsigned long long tmp;

  /* `&` binds tighter than `|`: this is arr[0] | (arr[1] & arr[2]).  */
  tmp = arr[0] | arr[1] & arr[2];
  while (tmp)
    {
      counter (tmp);
      /* Mask TMP with the current element, then advance ARR.  */
      tmp = *(arr++) & tmp;
    }
}
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index a607ef4..a9dbfea 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2554,6 +2554,789 @@ rest_of_handle_insert_vzeroupper (void)
   return 0;
 }
 
+/* Return true if INSN defines a hard register other than through a
+   must-clobber, or uses a hard register outside of a memory address.  */
+static bool
+has_non_address_hard_reg (rtx_insn *insn)
+{
+  df_ref def;
+  df_ref use;
+
+  /* Definitions: a hard register counts unless it is only clobbered.  */
+  FOR_EACH_INSN_DEF (def, insn)
+    {
+      bool clobber_only = DF_REF_FLAGS_IS_SET (def, DF_REF_MUST_CLOBBER);
+      if (!clobber_only && HARD_REGISTER_P (DF_REF_REAL_REG (def)))
+       return true;
+    }
+
+  /* Uses: a hard register counts unless it appears inside an address.  */
+  FOR_EACH_INSN_USE (use, insn)
+    {
+      if (DF_REF_REG_MEM_P (use))
+       continue;
+      if (HARD_REGISTER_P (DF_REF_REAL_REG (use)))
+       return true;
+    }
+
+  return false;
+}
+
+/* Return true if INSN is a single-set DImode PLUS, MINUS, IOR, XOR or
+   AND whose two operands are each a register or memory, whose
+   destination is a register or memory, and for which
+   has_non_address_hard_reg is false.  */
+static bool
+scalar_to_vector_candidate_p (rtx_insn *insn)
+{
+  rtx set = single_set (insn);
+
+  if (!set || has_non_address_hard_reg (insn))
+    return false;
+
+  rtx expr = SET_SRC (set);
+  rtx dest = SET_DEST (set);
+
+  /* Only DImode operations are of interest.  */
+  if (GET_MODE (expr) != DImode || GET_MODE (dest) != DImode)
+    return false;
+
+  if (!(REG_P (dest) || MEM_P (dest)))
+    return false;
+
+  enum rtx_code code = GET_CODE (expr);
+  if (code != PLUS && code != MINUS
+      && code != IOR && code != XOR && code != AND)
+    return false;
+
+  /* Both operands must be DImode registers or memory references.  */
+  for (int i = 0; i < 2; i++)
+    {
+      rtx op = XEXP (expr, i);
+      if (!(REG_P (op) || MEM_P (op)))
+       return false;
+    }
+
+  for (int i = 0; i < 2; i++)
+    if (GET_MODE (XEXP (expr, i)) != DImode)
+      return false;
+
+  return true;
+}
+
+/* Remove regs having both convertible and
+   not convertible definitions.
+
+   INSNS is a bitmap of candidate insn uids and is updated in place:
+   every candidate that defines such a register is cleared from it.  */
+static void
+remove_non_convertible_regs (bitmap insns)
+{
+  bitmap_iterator bi;
+  unsigned id;
+  /* Register numbers found to have a definition outside INSNS.  */
+  bitmap regs = BITMAP_ALLOC (NULL);
+
+  /* Pass 1: collect registers set by a candidate that also have at
+     least one defining insn which is not a candidate.  */
+  EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
+    {
+      rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
+      rtx reg = SET_DEST (def_set);
+
+      /* Skip non-register destinations and registers already recorded.  */
+      if (!REG_P (reg) || bitmap_bit_p (regs, REGNO (reg)))
+       continue;
+
+      for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
+          def;
+          def = DF_REF_NEXT_REG (def))
+       {
+         if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
+           {
+             if (dump_file)
+               fprintf (dump_file,
+                        "r%d has non convertible definition in insn %d\n",
+                        REGNO (reg), DF_REF_INSN_UID (def));
+
+             bitmap_set_bit (regs, REGNO (reg));
+             break;
+           }
+       }
+    }
+
+  /* Pass 2: drop every candidate insn that defines one of the
+     registers collected above.  */
+  EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
+    {
+      for (df_ref def = DF_REG_DEF_CHAIN (id);
+          def;
+          def = DF_REF_NEXT_REG (def))
+       if (bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
+         {
+           if (dump_file)
+             fprintf (dump_file, "Removing insn %d from candidates list\n",
+                      DF_REF_INSN_UID (def));
+
+           bitmap_clear_bit (insns, DF_REF_INSN_UID (def));
+         }
+    }
+
+  BITMAP_FREE (regs);
+}
+
+/* Return true if INSN is a single-set DImode load, store or
+   register-to-register copy for which has_non_address_hard_reg is
+   false.  */
+static bool
+convertible_insn_p (rtx_insn *insn)
+{
+  rtx set = single_set (insn);
+
+  if (!set || has_non_address_hard_reg (insn))
+    return false;
+
+  rtx from = SET_SRC (set);
+  rtx to = SET_DEST (set);
+
+  if (GET_MODE (from) != DImode || GET_MODE (to) != DImode)
+    return false;
+
+  /* Accept reg -> mem, reg -> reg and mem -> reg; everything else
+     (including mem -> mem) is rejected.  */
+  if (REG_P (from))
+    return REG_P (to) || MEM_P (to);
+
+  return MEM_P (from) && REG_P (to);
+}
+
+/* A group of connected insns that is analysed and converted from
+   scalar to vector mode as a unit.  A chain is grown from one seed
+   insn by build (), costed by compute_convert_gain () and rewritten
+   by convert ().  */
+class scalar_chain
+{
+ public:
+  scalar_chain ();
+  ~scalar_chain ();
+
+  /* Last chain id handed out; incremented by the constructor.  */
+  static unsigned max_id;
+
+  /* Id of this chain, used in dump output.  */
+  unsigned int chain_id;
+  /* Worklist of insn uids still to be added; only allocated while
+     build () runs.  */
+  bitmap queue;
+  /* Uids of the insns that belong to the chain.  */
+  bitmap insns;
+  /* Numbers of registers set by chain insns whose single_set
+     destination is a register.  */
+  bitmap defs;
+  /* Numbers of registers marked by mark_dual_mode_def as requiring
+     both modes.  */
+  bitmap defs_conv;
+
+  void build (bitmap candidates, unsigned insn_uid);
+  int compute_convert_gain ();
+  void convert ();
+
+ private:
+  void add_insn (bitmap candidates, unsigned insn_uid);
+  void add_to_queue (unsigned insn_uid);
+  void mark_dual_mode_def (df_ref def);
+  void analyze_register_chain (bitmap candidates, df_ref ref);
+  void convert_insn (rtx_insn *insn);
+  void convert_op (rtx *op, rtx_insn *insn);
+  void convert_insn_defs (unsigned regno);
+  void make_scalar_copies (unsigned regno);
+  void make_vector_copies (unsigned regno);
+};
+
+/* Source of chain ids; bumped once for every chain constructed.  */
+unsigned scalar_chain::max_id = 0;
+
+/* Take the next chain id, report it in the dump file and allocate the
+   chain's bitmaps.  The work queue stays unallocated until build ().  */
+scalar_chain::scalar_chain ()
+  : chain_id (++max_id), queue (NULL)
+{
+  if (dump_file)
+    fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
+
+  /* The obstack must be initialized before any bitmap is allocated.  */
+  bitmap_obstack_initialize (NULL);
+  insns = BITMAP_ALLOC (NULL);
+  defs = BITMAP_ALLOC (NULL);
+  defs_conv = BITMAP_ALLOC (NULL);
+}
+
+/* Free the three bitmaps allocated by the constructor and release the
+   bitmap obstack the constructor initialized.  QUEUE is not touched
+   here; build () frees it itself.  */
+scalar_chain::~scalar_chain ()
+{
+  BITMAP_FREE (insns);
+  BITMAP_FREE (defs);
+  BITMAP_FREE (defs_conv);
+  bitmap_obstack_release (NULL);
+}
+
+/* Schedule insn INSN_UID for addition to the chain, unless it is
+   already part of the chain or already waiting in the queue.  */
+void
+scalar_chain::add_to_queue (unsigned insn_uid)
+{
+  if (!bitmap_bit_p (insns, insn_uid)
+      && !bitmap_bit_p (queue, insn_uid))
+    {
+      if (dump_file)
+       fprintf (dump_file, "  Adding insn %d into chain's #%d queue\n",
+                insn_uid, chain_id);
+
+      bitmap_set_bit (queue, insn_uid);
+    }
+}
+
+/* Record in DEFS_CONV that the register defined by DEF is required in
+   both modes.  DEF must be a register definition.  Marking the same
+   register again is a no-op, so the dump line appears once per
+   register.  */
+void
+scalar_chain::mark_dual_mode_def (df_ref def)
+{
+  gcc_assert (DF_REF_REG_DEF_P (def));
+
+  /* Already marked.  */
+  if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
+    return;
+
+  if (dump_file)
+    fprintf (dump_file,
+            "  Mark r%d def in insn %d as requiring both modes in chain #%d\n",
+            DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
+
+  bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
+}
+
+/* Walk the def-use / use-def links of REF, a reference belonging to an
+   insn of this chain.  For every linked reference that is not inside
+   a memory address:
+     - insns already in the chain are skipped;
+     - insns in CANDIDATES are queued;
+     - insns accepted by convertible_insn_p are added to CANDIDATES
+       and queued.
+   Any other linked reference cannot be converted, so the register
+   definition involved is marked as needed in both modes: the linked
+   reference itself when it is a definition, otherwise REF.  */
+void
+scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
+{
+  for (df_link *chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
+    {
+      df_ref linked = chain->ref;
+      unsigned uid = DF_REF_INSN_UID (linked);
+
+      if (!DF_REF_REG_MEM_P (linked))
+       {
+         if (bitmap_bit_p (insns, uid))
+           continue;
+
+         if (bitmap_bit_p (candidates, uid))
+           {
+             add_to_queue (uid);
+             continue;
+           }
+
+         /* The address-use case was excluded by the enclosing test,
+            so only convertibility needs checking here.  */
+         if (convertible_insn_p (DF_REF_INSN (linked)))
+           {
+             if (dump_file)
+               fprintf (dump_file, "  Mark insn %d as convertible\n", uid);
+             bitmap_set_bit (candidates, uid);
+             add_to_queue (uid);
+             continue;
+           }
+       }
+
+      if (DF_REF_REG_DEF_P (linked))
+       {
+         if (dump_file)
+           fprintf (dump_file, "  r%d def in insn %d isn't convertible\n",
+                    DF_REF_REGNO (linked), uid);
+         mark_dual_mode_def (linked);
+       }
+      else
+       {
+         if (dump_file)
+           fprintf (dump_file, "  r%d use in insn %d isn't convertible\n",
+                    DF_REF_REGNO (linked), uid);
+         mark_dual_mode_def (ref);
+       }
+    }
+}
+
+/* Add insn INSN_UID to the chain (no-op if it is already a member),
+   record the register it sets in DEFS, and analyse the register
+   chains of all its definitions and of its uses that are not part of
+   a memory address.  CANDIDATES is passed through to
+   analyze_register_chain, which may add to it.  */
+void
+scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
+{
+  if (bitmap_bit_p (insns, insn_uid))
+    return;
+
+  if (dump_file)
+    fprintf (dump_file, "  Adding insn %d to chain #%d\n", insn_uid, chain_id);
+
+  bitmap_set_bit (insns, insn_uid);
+
+  rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
+  rtx def_set = single_set (insn);
+  /* Only register destinations are tracked; stores to memory are not.  */
+  if (def_set && REG_P (SET_DEST (def_set)))
+    bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
+
+  df_ref ref;
+  for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
+    analyze_register_chain (candidates, ref);
+  /* Registers used only to form an address are not followed.  */
+  for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
+    if (!DF_REF_REG_MEM_P (ref))
+      analyze_register_chain (candidates, ref);
+}
+
+/* Build the chain starting from seed insn INSN_UID.  Insns are taken
+   from a work queue one at a time, cleared from CANDIDATES and added
+   to the chain; adding an insn may queue further insns.  When the
+   queue drains, the collected insns and the registers marked for
+   conversion are written to the dump file.  */
+void
+scalar_chain::build (bitmap candidates, unsigned insn_uid)
+{
+  /* The queue lives only for the duration of this call.  */
+  queue = BITMAP_ALLOC (NULL);
+  bitmap_set_bit (queue, insn_uid);
+
+  if (dump_file)
+    fprintf (dump_file, "Building chain #%d...\n", chain_id);
+
+  while (!bitmap_empty_p (queue))
+    {
+      insn_uid = bitmap_first_set_bit (queue);
+      bitmap_clear_bit (queue, insn_uid);
+      /* An insn placed in this chain is no longer a free candidate.  */
+      bitmap_clear_bit (candidates, insn_uid);
+      add_insn (candidates, insn_uid);
+    }
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Collected chain #%d...\n", chain_id);
+      fprintf (dump_file, "  insns: ");
+      dump_bitmap (dump_file, insns);
+      if (!bitmap_empty_p (defs_conv))
+       {
+         bitmap_iterator bi;
+         unsigned id;
+         /* Empty before the first entry, ", " before every later one.  */
+         const char *comma = "";
+         fprintf (dump_file, "  defs to convert: ");
+         EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
+           {
+             fprintf (dump_file, "%sr%d", comma, id);
+             comma = ", ";
+           }
+         fprintf (dump_file, "\n");
+       }
+    }
+
+  BITMAP_FREE (queue);
+}
+
+/* Estimate the benefit of converting this chain.  The per-insn gains
+   are summed from the ix86_cost tables, then a conversion cost is
+   subtracted for every definition of each register in DEFS_CONV.
+   Returns the resulting total; all three figures are dumped.  */
+int
+scalar_chain::compute_convert_gain ()
+{
+  bitmap_iterator bi;
+  unsigned uid;
+  unsigned regno;
+  int insn_gain = 0;
+  int conv_cost = 0;
+
+  if (dump_file)
+    fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
+
+  EXECUTE_IF_SET_IN_BITMAP (insns, 0, uid, bi)
+    {
+      rtx set = single_set (DF_INSN_UID_GET (uid)->insn);
+      rtx from = SET_SRC (set);
+      rtx to = SET_DEST (set);
+
+      if (REG_P (from))
+       {
+         /* Register copy or store.  */
+         if (REG_P (to))
+           insn_gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
+         else if (MEM_P (to))
+           insn_gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
+         else
+           gcc_unreachable ();
+       }
+      else if (MEM_P (from))
+       {
+         /* Load.  */
+         if (REG_P (to))
+           insn_gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
+         else
+           gcc_unreachable ();
+       }
+      else
+       switch (GET_CODE (from))
+         {
+         case PLUS:
+         case MINUS:
+         case IOR:
+         case XOR:
+         case AND:
+           insn_gain += ix86_cost->add;
+           break;
+
+         default:
+           gcc_unreachable ();
+         }
+    }
+
+  if (dump_file)
+    fprintf (dump_file, "  Instruction convertion gain: %d\n", insn_gain);
+
+  /* Each definition of a dual-mode register costs one transfer.  */
+  EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, regno, bi)
+    conv_cost += DF_REG_DEF_COUNT (regno) * ix86_cost->mmxsse_to_integer;
+
+  if (dump_file)
+    fprintf (dump_file, "  Registers convertion cost: %d\n", conv_cost);
+
+  int total = insn_gain - conv_cost;
+
+  if (dump_file)
+    fprintf (dump_file, "  Total gain: %d\n", total);
+
+  return total;
+}
+
+/* Make a vector copy of scalar register REGNO for use inside the
+   chain.  convert () calls this only for registers that are in
+   DEFS_CONV but not in DEFS, i.e. registers that no chain insn sets.
+
+   After every definition of REGNO that lies OUTSIDE the chain, emit a
+   sequence that assembles the two SImode halves of the register into
+   the copy; then replace REGNO with the copy in every chain insn that
+   uses it.
+
+   The definition loop previously tested for definitions INSIDE the
+   chain.  For the registers this function is called on there are
+   none, so no copy was ever emitted and the chain insns were
+   redirected to a copy that was never written.  */
+void
+scalar_chain::make_vector_copies (unsigned regno)
+{
+  rtx reg = regno_reg_rtx[regno];
+  rtx sireg = gen_reg_rtx (SImode);
+  rtx vcopy = gen_rtx_SUBREG (V2DImode, sireg, 0);
+  rtx vcopy1 = gen_rtx_SUBREG (V4SImode, sireg, 0);
+  df_ref ref;
+
+  for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+    if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
+      {
+       start_sequence ();
+
+       /* Both variants start by copying the low half.  */
+       emit_move_insn (sireg, gen_rtx_SUBREG (SImode, reg, 0));
+
+       if (TARGET_SSE4_1)
+         {
+           /* Insert the high half with pinsrd; no temporary pseudo is
+              needed on this path.  */
+           emit_insn (gen_sse4_1_pinsrd (vcopy1, vcopy1,
+                                         gen_rtx_SUBREG (SImode, reg, 4),
+                                         GEN_INT (2)));
+         }
+       else
+         {
+           /* Move the high half into a temporary, shift it left by
+              32 and OR it into the copy.  */
+           rtx tmp = gen_reg_rtx (SImode);
+           emit_move_insn (tmp, gen_rtx_SUBREG (SImode, reg, 4));
+           emit_move_insn (gen_rtx_SUBREG (V2DImode, tmp, 0),
+                           gen_rtx_ASHIFT (V2DImode,
+                                           gen_rtx_SUBREG (V2DImode, tmp, 0),
+                                           GEN_INT (32)));
+           emit_move_insn (vcopy,
+                           gen_rtx_IOR (V2DImode, vcopy,
+                                        gen_rtx_SUBREG (V2DImode, tmp, 0)));
+         }
+       emit_insn_after (get_insns (), DF_REF_INSN (ref));
+       end_sequence ();
+
+       if (dump_file)
+         fprintf (dump_file,
+                  "  Copied r%d to a vector register r%d for insn %d\n",
+                  regno, REGNO (sireg), DF_REF_INSN_UID (ref));
+      }
+
+  for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+    if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
+      {
+       replace_rtx (DF_REF_INSN (ref), reg, vcopy);
+
+       if (dump_file)
+         fprintf (dump_file, "  Replaced r%d with r%d in insn %d\n",
+                  regno, REGNO (sireg), DF_REF_INSN_UID (ref));
+      }
+}
+
+/* Rewrite the definitions and uses of register REGNO, which is set by
+   at least one chain insn, so that they refer to a V2DImode subreg of
+   the register.  If REGNO is also in DEFS_CONV, a scalar DImode copy
+   is emitted after each definition and uses outside the chain are
+   redirected to that copy.  */
+void
+scalar_chain::convert_insn_defs (unsigned regno)
+{
+  bool scalar_copy = bitmap_bit_p (defs_conv, regno);
+  rtx reg = regno_reg_rtx[regno];
+  rtx new_reg = NULL_RTX;
+  rtx scopy = NULL_RTX;
+  df_ref ref;
+  bitmap conv;
+
+  /* CONV starts as a copy of the chain's insns; bits are cleared as
+     the corresponding insns are handled below.  */
+  conv = BITMAP_ALLOC (NULL);
+  bitmap_copy (conv, insns);
+
+  /* Check we have load or store.  In this case we cannot
+     just convert register to V2DI mode and have to use subreg.  */
+  /* NOTE(review): this loop, the next one and the fallback after them
+     all build the same SUBREG, so the two searches currently have no
+     effect on the result.  */
+  for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+    {
+      rtx def_set = single_set (DF_REF_INSN (ref));
+      if (def_set && MEM_P (SET_SRC (def_set)))
+       {
+         new_reg = gen_rtx_SUBREG (V2DImode, reg, 0);
+         break;
+       }
+    }
+
+  if (!new_reg)
+    for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+      {
+       rtx def_set = single_set (DF_REF_INSN (ref));
+       if (def_set
+           && MEM_P (SET_DEST (def_set))
+           && REG_P (SET_SRC (def_set)))
+         {
+           new_reg = gen_rtx_SUBREG (V2DImode, reg, 0);
+           break;
+         }
+      }
+
+  if (!new_reg)
+    new_reg = gen_rtx_SUBREG (V2DImode, reg, 0);
+
+  if (scalar_copy)
+    scopy = gen_reg_rtx (DImode);
+
+  /* Definitions: substitute the subreg, except in loads from memory,
+     and emit the scalar copy when one is required.  */
+  for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+    {
+      /* NOTE(review): DEF_SET is dereferenced without a NULL check,
+         and the definition is not tested for chain membership.  */
+      rtx def_set = single_set (DF_REF_INSN (ref));
+      rtx src = SET_SRC (def_set);
+      /* NOTE(review): this REG shadows the function-level REG.  */
+      rtx reg = DF_REF_REG (ref);
+
+      if (!MEM_P (src))
+       {
+         replace_rtx (DF_REF_INSN (ref), reg, new_reg);
+         bitmap_clear_bit (conv, DF_REF_INSN_UID (ref));
+       }
+         
+      if (scalar_copy)
+       {
+         rtx vcopy = gen_reg_rtx (V2DImode);
+
+         /* Copy the vector value, store its low 32 bits into the low
+            half of SCOPY, shift right by 32 and store the low 32 bits
+            again into the high half.  */
+         start_sequence ();
+         emit_move_insn (vcopy, new_reg);
+         emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
+                         gen_rtx_SUBREG (SImode, vcopy, 0));
+         emit_move_insn (vcopy,
+                         gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
+         emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
+                         gen_rtx_SUBREG (SImode, vcopy, 0));
+         emit_insn_after (get_insns (), DF_REF_INSN (ref));
+         end_sequence ();
+
+         if (dump_file)
+           fprintf (dump_file,
+                    "  Copied r%d to a scalar register r%d for insn %d\n",
+                    regno, REGNO (scopy), DF_REF_INSN_UID (ref));
+       }
+    }
+
+  /* Uses: chain insns get the subreg (register-to-memory stores are
+     left alone); insns outside the chain get the scalar copy.  */
+  for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+    if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
+      {
+       if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
+         {
+           rtx def_set = single_set (DF_REF_INSN (ref));
+           if (!MEM_P (SET_DEST (def_set))
+               || !REG_P (SET_SRC (def_set)))
+             replace_rtx (DF_REF_INSN (ref), reg, new_reg);
+           bitmap_clear_bit (conv, DF_REF_INSN_UID (ref));
+         }
+      }
+    else
+      {
+       /* NOTE(review): SCOPY is NULL_RTX here when REGNO is not in
+          DEFS_CONV; presumably such uses cannot occur -- confirm.  */
+       replace_rtx (DF_REF_INSN (ref), reg, scopy);
+       df_insn_rescan (DF_REF_INSN (ref));
+      }
+
+  BITMAP_FREE (conv);
+}
+
+/* Create a scalar DImode copy of register REGNO.  After each
+   definition of REGNO inside the chain, emit a sequence that extracts
+   the value into the copy 32 bits at a time; then replace REGNO with
+   the copy in every use outside the chain and rescan those insns.
+
+   NOTE(review): no call to this method is visible in this patch.  */
+void
+scalar_chain::make_scalar_copies (unsigned regno)
+{
+  rtx scopy = gen_reg_rtx (DImode);
+  rtx vcopy = gen_reg_rtx (V2DImode);
+  df_ref ref;
+
+  for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+    if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
+      {
+       rtx reg = DF_REF_REG (ref);
+
+       /* Copy the register to VCOPY, store its low 32 bits into the
+          low half of SCOPY, shift right by 32 and store the low 32
+          bits again into the high half.  */
+       start_sequence ();
+       emit_move_insn (vcopy, reg);
+       emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
+                       gen_rtx_SUBREG (SImode, vcopy, 0));
+       emit_move_insn (vcopy,
+                       gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
+       emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
+                       gen_rtx_SUBREG (SImode, vcopy, 0));
+       emit_insn_after (get_insns (), DF_REF_INSN (ref));
+       end_sequence ();
+
+       if (dump_file)
+         fprintf (dump_file,
+                  "  Copied r%d to a scalar register r%d for insn %d\n",
+                  REGNO (reg), REGNO (scopy), DF_REF_INSN_UID (ref));
+      }
+
+  /* Uses outside the chain keep working on the scalar copy.  */
+  for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+    if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
+      {
+       replace_rtx (DF_REF_INSN (ref), DF_REF_REG (ref), scopy);
+       df_insn_rescan (DF_REF_INSN (ref));
+
+       if (dump_file)
+         fprintf (dump_file, "  Replaced r%d with r%d in insn %d\n",
+                  regno, REGNO (scopy), DF_REF_INSN_UID (ref));
+      }
+}
+
+/* Prepare operand *OP of chain insn INSN for vector mode.  A memory
+   operand is loaded into a fresh DImode pseudo ahead of INSN and
+   replaced by a V2DImode subreg of that pseudo.  A register operand
+   is only checked: its first listed definition must be in the chain
+   or the register must be in DEFS_CONV.  Anything else must already
+   be a V2DImode subreg.  */
+void
+scalar_chain::convert_op (rtx *op, rtx_insn *insn)
+{
+  *op = copy_rtx_if_shared (*op);
+
+  if (REG_P (*op))
+    {
+      gcc_assert (bitmap_bit_p
+                 (insns, DF_REF_INSN_UID (DF_REG_DEF_CHAIN (REGNO (*op))))
+                 || bitmap_bit_p (defs_conv, REGNO (*op)));
+      return;
+    }
+
+  if (!MEM_P (*op))
+    {
+      gcc_assert (SUBREG_P (*op));
+      gcc_assert (GET_MODE (*op) == V2DImode);
+      return;
+    }
+
+  /* Memory operand: preload it before INSN.  */
+  rtx preload = gen_reg_rtx (DImode);
+  emit_insn_before (gen_move_insn (preload, *op), insn);
+  *op = gen_rtx_SUBREG (V2DImode, preload, 0);
+
+  if (dump_file)
+    fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
+            INSN_UID (insn), REGNO (preload));
+}
+
+/* Convert chain insn INSN to vector mode.  The source operands are
+   converted with convert_op, an arithmetic source is switched to
+   V2DImode, the insn pattern is reduced to the single set, and the
+   insn is re-recognized and rescanned.
+
+   A disabled `#if 0' draft of register-destination handling used to
+   sit between the store handling and the switch; it was never
+   compiled and has been dropped.  */
+void
+scalar_chain::convert_insn (rtx_insn *insn)
+{
+  rtx def_set = single_set (insn);
+  rtx src = copy_rtx_if_shared (SET_SRC (def_set));
+  rtx dst = SET_DEST (def_set);
+
+  if (MEM_P (dst) && !REG_P (src))
+    {
+      /* There are no scalar integer instructions and therefore
+         temporary register usage is required: compute into a fresh
+         pseudo and store it to memory right after INSN.  */
+      rtx tmp = gen_reg_rtx (DImode);
+      emit_insn_after (gen_move_insn (dst, tmp), insn);
+      dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
+    }
+
+  switch (GET_CODE (src))
+    {
+    case PLUS:
+    case MINUS:
+    case IOR:
+    case XOR:
+    case AND:
+      convert_op (&XEXP (src, 0), insn);
+      convert_op (&XEXP (src, 1), insn);
+      PUT_MODE (src, V2DImode);
+      break;
+
+    case MEM:
+      /* A load into a register keeps its memory source as is.  */
+      if (!REG_P (dst))
+       convert_op (&src, insn);
+      break;
+
+    case REG:
+      break;
+
+    case SUBREG:
+      gcc_assert (GET_MODE (src) == V2DImode);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  SET_SRC (def_set) = src;
+  SET_DEST (def_set) = dst;
+
+  /* Drop possible dead definitions.  */
+  PATTERN (insn) = def_set;
+
+  /* Force re-recognition of the modified pattern.  */
+  INSN_CODE (insn) = -1;
+  recog_memoized (insn);
+  df_insn_rescan (insn);
+}
+
+/* Convert the whole chain to vector mode, in three steps: rewrite the
+   registers set inside the chain, convert each chain insn, and make
+   vector copies of the registers that are marked in DEFS_CONV but not
+   set inside the chain.  */
+void
+scalar_chain::convert ()
+{
+  bitmap_iterator bi;
+  unsigned id;
+
+  if (dump_file)
+    fprintf (dump_file, "Converting chain #%d...\n", chain_id);
+
+  /* ID is a register number here ...  */
+  EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
+    convert_insn_defs (id);
+
+  /* ... an insn uid here ...  */
+  EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
+    convert_insn (DF_INSN_UID_GET (id)->insn);
+
+  /* ... and a register number again (DEFS_CONV minus DEFS).  */
+  EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
+    make_vector_copies (id);
+}
+
+/* Body of the scalar-to-vector pass.  Collect candidate insns, drop
+   those whose destination register also has a non-candidate
+   definition, then repeatedly build a chain from the first remaining
+   candidate and convert it when the estimated gain is positive.
+   Always returns 0.  */
+static unsigned int
+convert_scalars_to_vector ()
+{
+  bitmap_obstack_initialize (NULL);
+  bitmap candidates = BITMAP_ALLOC (NULL);
+
+  calculate_dominance_info (CDI_DOMINATORS);
+  df_set_flags (DF_DEFER_INSN_RESCAN);
+  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+  df_md_add_problem ();
+  df_analyze ();
+
+  /* Step 1: find all instructions we want to convert into vector
+     mode.  */
+  if (dump_file)
+    fprintf (dump_file, "Searching for mode convertion candidates...\n");
+
+  basic_block bb;
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      rtx_insn *insn;
+      FOR_BB_INSNS (bb, insn)
+       {
+         if (!scalar_to_vector_candidate_p (insn))
+           continue;
+
+         if (dump_file)
+           fprintf (dump_file, "  insn %d is marked as a candidate\n",
+                    INSN_UID (insn));
+
+         bitmap_set_bit (candidates, INSN_UID (insn));
+       }
+    }
+
+  remove_non_convertible_regs (candidates);
+
+  if (dump_file && bitmap_empty_p (candidates))
+    fprintf (dump_file, "There are no candidates for optimization.\n");
+
+  /* Step 2: every iteration removes at least the seed insn from
+     CANDIDATES, so the loop terminates.  */
+  while (!bitmap_empty_p (candidates))
+    {
+      unsigned seed = bitmap_first_set_bit (candidates);
+      scalar_chain chain;
+
+      /* Collect the chain of instructions reachable from the seed,
+         checking all uses and definitions to estimate the required
+         conversions.  */
+      chain.build (candidates, seed);
+
+      int gain = chain.compute_convert_gain ();
+      if (gain > 0)
+       chain.convert ();
+      else if (dump_file)
+       fprintf (dump_file, "Chain #%d convertion is not profitable\n",
+                chain.chain_id);
+    }
+
+  BITMAP_FREE (candidates);
+  bitmap_obstack_release (NULL);
+  df_process_deferred_rescans ();
+  df_verify ();
+
+  return 0;
+}
+
 namespace {
 
 const pass_data pass_data_insert_vzeroupper =
@@ -2591,6 +3374,39 @@ public:
 
 }; // class pass_insert_vzeroupper
 
+/* Descriptor of the "stv" RTL pass.  It requires, provides and
+   destroys no properties; its only TODO flag is TODO_df_finish on
+   completion.  */
+const pass_data pass_data_stv =
+{
+  RTL_PASS, /* type */
+  "stv", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+/* The "stv" pass object: runs convert_scalars_to_vector when the gate
+   allows it.  */
+class pass_stv : public rtl_opt_pass
+{
+public:
+  pass_stv (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_stv, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  /* Enabled only for non-64-bit targets with SSE2, at -O2 and above
+     (optimize > 1).  */
+  virtual bool gate (function *)
+    {
+      return !TARGET_64BIT && TARGET_SSE2 && optimize > 1;
+    }
+
+  virtual unsigned int execute (function *)
+    {
+      return convert_scalars_to_vector ();
+    }
+
+}; // class pass_stv
+
 } // anon namespace
 
 rtl_opt_pass *
@@ -2599,6 +3415,12 @@ make_pass_insert_vzeroupper (gcc::context *ctxt)
   return new pass_insert_vzeroupper (ctxt);
 }
 
+/* Allocate a new instance of the "stv" pass for context CTXT.  The
+   object is created with `new' and returned to the caller.  */
+rtl_opt_pass *
+make_pass_stv (gcc::context *ctxt)
+{
+  return new pass_stv (ctxt);
+}
+
 /* Return true if a red-zone is in use.  */
 
 static inline bool
@@ -4363,12 +5185,18 @@ ix86_option_override (void)
     = { pass_insert_vzeroupper, "reload",
        1, PASS_POS_INSERT_AFTER
       };
+  opt_pass *pass_stv = make_pass_stv (g);
+  struct register_pass_info stv_info
+    = { pass_stv, "combine",
+       1, PASS_POS_INSERT_AFTER
+      };
 
   ix86_option_override_internal (true, &global_options, &global_options_set);
 
 
   /* This needs to be done at start up.  It's convenient to do it here.  */
   register_pass (&insert_vzeroupper_info);
+  register_pass (&stv_info);
 }
 
 /* Implement the TARGET_OFFLOAD_OPTIONS hook.  */
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 7195882..6aae22c 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -972,6 +972,11 @@
                               (HI "TARGET_HIMODE_MATH")
                               SI])
 
+;; Math-dependant integer modes with DImode.
+(define_mode_iterator SWIM1248x [(QI "TARGET_QIMODE_MATH")
+                              (HI "TARGET_HIMODE_MATH")
+                              SI DI])
+
 ;; Math-dependant single word integer modes without QImode.
 (define_mode_iterator SWIM248 [(HI "TARGET_HIMODE_MATH")
                               SI (DI "TARGET_64BIT")])
@@ -7731,9 +7736,9 @@
 ;; it should be done with splitters.
 
 (define_expand "and<mode>3"
-  [(set (match_operand:SWIM 0 "nonimmediate_operand")
-       (and:SWIM (match_operand:SWIM 1 "nonimmediate_operand")
-                 (match_operand:SWIM 2 "<general_szext_operand>")))]
+  [(set (match_operand:SWIM1248x 0 "nonimmediate_operand")
+       (and:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand")
+                     (match_operand:SWIM1248x 2 "<general_szext_operand>")))]
   ""
 {
   machine_mode mode = <MODE>mode;
@@ -7811,6 +7816,43 @@
        (const_string "*")))
    (set_attr "mode" "SI,DI,DI,SI,DI")])
 
+;; DImode AND for !TARGET_64BIT.  The insn has no assembler template of
+;; its own ("#"); after reload it is split into two SImode ANDs, each
+;; clobbering the flags register.  split_double_mode derives the SImode
+;; operands 0-2 and 3-5 from the three DImode operands.
+(define_insn_and_split "*anddi3_doubleword"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r")
+       (and:DI
+        (match_operand:DI 1 "nonimmediate_operand" "%0,0,0")
+        (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,rm")))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_64BIT && ix86_binary_operator_ok (AND, DImode, operands)"
+  "#"
+  "!TARGET_64BIT && reload_completed"
+  [(parallel [(set (match_dup 0)
+                  (and:SI (match_dup 1) (match_dup 2)))
+             (clobber (reg:CC FLAGS_REG))])
+   (parallel [(set (match_dup 3)
+                  (and:SI (match_dup 4) (match_dup 5)))
+             (clobber (reg:CC FLAGS_REG))])]
+  "split_double_mode (DImode, &operands[0], 3, &operands[0], &operands[3]);")
+
+;; Zero extension of an SWI24-mode operand to DImode for !TARGET_64BIT.
+;; Split after reload into a zero extension to SImode plus a store of
+;; constant 0; split_double_mode supplies the two SImode parts of the
+;; destination as operands 0 and 2.
+;; NOTE(review): if SWI24 includes SImode, the first split insn would
+;; be a zero_extend:SI of an SImode operand -- confirm this is intended.
+(define_insn_and_split "*zext<mode>_doubleword"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+       (zero_extend:DI (match_operand:SWI24 1 "nonimmediate_operand" "rm")))]
+  "!TARGET_64BIT"
+  "#"
+  "!TARGET_64BIT && reload_completed"
+  [(set (match_dup 0) (zero_extend:SI (match_dup 1)))
+   (set (match_dup 2) (const_int 0))]
+  "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);")
+
+;; Zero extension of a QImode operand to DImode for !TARGET_64BIT;
+;; same shape as the SWI24 variant but with the "qm" constraint on the
+;; source.  Split after reload into a zero extension to SImode plus a
+;; store of constant 0 into the other SImode part of the destination.
+(define_insn_and_split "*zextqi_doubleword"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+       (zero_extend:DI (match_operand:QI 1 "nonimmediate_operand" "qm")))]
+  "!TARGET_64BIT"
+  "#"
+  "!TARGET_64BIT && reload_completed"
+  [(set (match_dup 0) (zero_extend:SI (match_dup 1)))
+   (set (match_dup 2) (const_int 0))]
+  "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);")
+
 (define_insn "*andsi_1"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r,Ya,!k")
        (and:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,qm,k")
@@ -8298,9 +8340,9 @@
 ;; If this is considered useful, it should be done with splitters.
 
 (define_expand "<code><mode>3"
-  [(set (match_operand:SWIM 0 "nonimmediate_operand")
-       (any_or:SWIM (match_operand:SWIM 1 "nonimmediate_operand")
-                    (match_operand:SWIM 2 "<general_operand>")))]
+  [(set (match_operand:SWIM1248x 0 "nonimmediate_operand")
+       (any_or:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand")
+                            (match_operand:SWIM1248x 2 "<general_operand>")))]
   ""
   "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;")
 
@@ -8318,6 +8360,23 @@
   [(set_attr "type" "alu,alu,msklog")
    (set_attr "mode" "<MODE>")])
 
+;; DImode any_or operations for !TARGET_64BIT.  The insn has no
+;; assembler template of its own ("#"); after reload it is split into
+;; two SImode operations of the same code, each clobbering the flags
+;; register.  split_double_mode derives the SImode operands 0-2 and
+;; 3-5 from the three DImode operands.
+(define_insn_and_split "*<code>di3_doubleword"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r")
+       (any_or:DI
+        (match_operand:DI 1 "nonimmediate_operand" "%0,0,0")
+        (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,rm")))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_64BIT && ix86_binary_operator_ok (<CODE>, DImode, operands)"
+  "#"
+  "!TARGET_64BIT && reload_completed"
+  [(parallel [(set (match_dup 0)
+                  (any_or:SI (match_dup 1) (match_dup 2)))
+             (clobber (reg:CC FLAGS_REG))])
+   (parallel [(set (match_dup 3)
+                  (any_or:SI (match_dup 4) (match_dup 5)))
+             (clobber (reg:CC FLAGS_REG))])]
+  "split_double_mode (DImode, &operands[0], 3, &operands[0], &operands[3]);")
+
 (define_insn "*<code>hi_1"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=r,rm,!k")
        (any_or:HI

Reply via email to