Hi Jan,

Here is a patch we've talked about recently - it merges the expanders for
memset and memmove.  As a natural side effect, this also enables vector_loop
in memset expansion.
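
As a side note, the vector variant of memset currently triggers only for
zeroing; for other values the stored byte first has to be replicated across a
wide register, which is what promote_duplicated_reg does.  Here is a
standalone sketch of that replication for constant values (illustration only;
replicate_byte is a made-up name, but the shifts mirror the patch's constant
path):

    #include <stdint.h>
    #include <stdio.h>

    /* Replicate the low byte across a 64-bit word: 0xXY becomes
       0xXYXYXYXYXYXYXYXY, like promote_duplicated_reg does for
       CONST_INT values.  */
    static uint64_t
    replicate_byte (uint64_t v)
    {
      v &= 255;
      v |= v << 8;
      v |= v << 16;
      v |= (v << 16) << 16;
      return v;
    }

    int
    main (void)
    {
      printf ("%016llx\n", (unsigned long long) replicate_byte (0xAB));
      /* Prints abababababababab.  */
      return 0;
    }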

Though in some places merging movmem and setmem isn't entirely clean (the
original code of the two versions differed a lot), I think it's worth
combining them - in many cases it allows us to remove duplicated code.
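
To give an idea of the resulting shape: each merged routine takes both sets
of operands plus a bool selecting the variant, and simply ignores the
operands that don't apply.  A rough libc-level analogy (hypothetical names,
not the patch's actual code):

    #include <stdio.h>
    #include <string.h>

    /* One entry point for both operations, as in the new
       expand_movmem_or_setmem_* routines: ISSETMEM picks the variant,
       and the operands that don't apply (SRC for set, VAL for move)
       are ignored.  */
    static void *
    move_or_set (void *dst, const void *src, int val, size_t n,
                 int issetmem)
    {
      if (issetmem)
        return memset (dst, val, n);   /* SRC is ignored.  */
      return memmove (dst, src, n);    /* VAL is ignored.  */
    }

    int
    main (void)
    {
      char buf[8];
      move_or_set (buf, NULL, 'x', sizeof buf, 1);  /* memset path */
      move_or_set (buf + 1, buf, 0, 4, 0);          /* memmove path */
      printf ("%.8s\n", buf);
      return 0;
    }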

I tried to keep the resulting code as close to the original as possible
(except for enabling vector_loop in setmem).  Because of that, there are some
places that IMHO could be merged further, but are not merged in this patch.

The patch was bootstrapped and tested on i386/x86_64 (make check, plus
stability testing on Spec2k and Spec2k6).
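
To see the new memset vector_loop path in action, the included testcases can
be built along these lines (the flags are the dg-options from the tests; the
2048-byte zeroing should come out as a loop of 16-byte movdqa stores):

    /* memset-vec.c -- compile with:
       gcc -O2 -march=atom -minline-all-stringops \
           -mstringop-strategy=vector_loop -S memset-vec.c  */
    char a[2048];

    void t (void)
    {
      __builtin_memset (a, 0, 2048);
    }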

Is it ok?


Michael

---
 gcc/config/i386/i386.c                             | 1018 ++++++++------------
 .../gcc.target/i386/memset-vector_loop-1.c         |   11 +
 .../gcc.target/i386/memset-vector_loop-2.c         |   10 +
 3 files changed, 406 insertions(+), 633 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-vector_loop-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-vector_loop-2.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 21fc531..9d5654f 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -22219,13 +22219,16 @@ expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
   emit_label (out_label);
 }
 
-/* Output "rep; mov" instruction.
-   Arguments have same meaning as for previous function */
+/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
+   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
+   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
+   Other arguments have same meaning as for previous function.  */
+
 static void
-expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
-                          rtx destptr, rtx srcptr,
+expand_movmem_or_setmem_via_rep (rtx destmem, rtx srcmem,
+                          rtx destptr, rtx srcptr, rtx value, rtx orig_value,
                           rtx count,
-                          enum machine_mode mode)
+                          enum machine_mode mode, bool issetmem)
 {
   rtx destexp;
   rtx srcexp;
@@ -22233,82 +22236,65 @@ expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
   HOST_WIDE_INT rounded_count;
 
   /* If the size is known, it is shorter to use rep movs.  */
-  if (mode == QImode && CONST_INT_P (count)
+  if (!issetmem && mode == QImode && CONST_INT_P (count)
       && !(INTVAL (count) & 3))
     mode = SImode;
 
   if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
     destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
-  if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
-    srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
-  countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
+
+  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
+                                                      GET_MODE_SIZE (mode)));
   if (mode != QImode)
     {
       destexp = gen_rtx_ASHIFT (Pmode, countreg,
                                GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
       destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
-      srcexp = gen_rtx_ASHIFT (Pmode, countreg,
-                              GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
-      srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
     }
   else
-    {
-      destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
-      srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
-    }
-  if (CONST_INT_P (count))
+    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
+  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
     {
       rounded_count = (INTVAL (count)
                       & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
       destmem = shallow_copy_rtx (destmem);
-      srcmem = shallow_copy_rtx (srcmem);
       set_mem_size (destmem, rounded_count);
-      set_mem_size (srcmem, rounded_count);
-    }
-  else
-    {
-      if (MEM_SIZE_KNOWN_P (destmem))
-       clear_mem_size (destmem);
-      if (MEM_SIZE_KNOWN_P (srcmem))
-       clear_mem_size (srcmem);
     }
-  emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
-                         destexp, srcexp));
-}
-
-/* Output "rep; stos" instruction.
-   Arguments have same meaning as for previous function */
-static void
-expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
-                           rtx count, enum machine_mode mode,
-                           rtx orig_value)
-{
-  rtx destexp;
-  rtx countreg;
-  HOST_WIDE_INT rounded_count;
+  else if (MEM_SIZE_KNOWN_P (destmem))
+    clear_mem_size (destmem);
 
-  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
-    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
-  value = force_reg (mode, gen_lowpart (mode, value));
-  countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
-  if (mode != QImode)
+  if (issetmem)
     {
-      destexp = gen_rtx_ASHIFT (Pmode, countreg,
-                               GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
-      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
+      value = force_reg (mode, gen_lowpart (mode, value));
+      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
     }
   else
-    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
-  if (orig_value == const0_rtx && CONST_INT_P (count))
     {
-      rounded_count = (INTVAL (count)
-                      & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
-      destmem = shallow_copy_rtx (destmem);
-      set_mem_size (destmem, rounded_count);
+      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
+       srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
+      if (mode != QImode)
+       {
+         srcexp = gen_rtx_ASHIFT (Pmode, countreg,
+                                  GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
+         srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
+       }
+      else
+       srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
+      if (CONST_INT_P (count))
+       {
+         rounded_count = (INTVAL (count)
+                          & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
+         srcmem = shallow_copy_rtx (srcmem);
+         set_mem_size (srcmem, rounded_count);
+       }
+      else
+       {
+         if (MEM_SIZE_KNOWN_P (srcmem))
+           clear_mem_size (srcmem);
+       }
+      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
+                             destexp, srcexp));
     }
-  else if (MEM_SIZE_KNOWN_P (destmem))
-    clear_mem_size (destmem);
-  emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
 }
 
 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
@@ -22496,6 +22482,57 @@ expand_movmem_epilogue (rtx destmem, rtx srcmem,
     }
 }
 
+/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
+   with value PROMOTED_VAL.
+   DESTPTR is updated to point past the filled bytes.
+   Return value is updated DESTMEM.  */
+static rtx
+emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
+            HOST_WIDE_INT size_to_move)
+{
+  rtx dst = destmem, adjust;
+  enum insn_code code;
+  enum machine_mode move_mode;
+  int piece_size, i;
+
+  /* Find the widest mode in which we could perform moves.
+     Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
+     it until a move of such size is supported.  */
+  move_mode = GET_MODE (promoted_val);
+  if (size_to_move < GET_MODE_SIZE (move_mode))
+    {
+      move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
+      promoted_val = gen_lowpart (move_mode, promoted_val);
+    }
+  piece_size = GET_MODE_SIZE (move_mode);
+  code = optab_handler (mov_optab, move_mode);
+  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
+
+  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
+
+  /* Emit stores.  We'll need SIZE_TO_MOVE/PIECE_SIZE stores.  */
+  gcc_assert (size_to_move % piece_size == 0);
+  adjust = GEN_INT (piece_size);
+  for (i = 0; i < size_to_move; i += piece_size)
+    {
+      if (piece_size <= GET_MODE_SIZE (word_mode))
+       {
+         emit_insn (gen_strset (destptr, dst, promoted_val));
+         continue;
+       }
+
+      emit_insn (GEN_FCN (code) (dst, promoted_val));
+
+      emit_move_insn (destptr,
+                     gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
+
+      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
+                                         piece_size);
+    }
+
+  /* Update DST rtx.  */
+  return dst;
+}
 /* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
 static void
 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
@@ -22511,61 +22548,30 @@ expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
 
 /* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
 static void
-expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
+expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
+                       rtx count, int max_size)
 {
   rtx dest;
 
   if (CONST_INT_P (count))
     {
       HOST_WIDE_INT countval = INTVAL (count);
-      int offset = 0;
+      HOST_WIDE_INT epilogue_size = countval % max_size;
+      int i;
 
-      if ((countval & 0x10) && max_size > 16)
-       {
-         if (TARGET_64BIT)
-           {
-             dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
-             emit_insn (gen_strset (destptr, dest, value));
-             dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
-             emit_insn (gen_strset (destptr, dest, value));
-           }
-         else
-           gcc_unreachable ();
-         offset += 16;
-       }
-      if ((countval & 0x08) && max_size > 8)
+      /* For now MAX_SIZE should be a power of 2.  This assert could be
+        relaxed, but it'll require a bit more complicated epilogue
+        expanding.  */
+      gcc_assert ((max_size & (max_size - 1)) == 0);
+      for (i = max_size; i >= 1; i >>= 1)
        {
-         if (TARGET_64BIT)
-           {
-             dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
-             emit_insn (gen_strset (destptr, dest, value));
-           }
-         else
+         if (epilogue_size & i)
            {
-             dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
-             emit_insn (gen_strset (destptr, dest, value));
-             dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
-             emit_insn (gen_strset (destptr, dest, value));
+             if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
+               destmem = emit_memset (destmem, destptr, vec_value, i);
+             else
+               destmem = emit_memset (destmem, destptr, value, i);
            }
-         offset += 8;
-       }
-      if ((countval & 0x04) && max_size > 4)
-       {
-         dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
-         emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
-         offset += 4;
-       }
-      if ((countval & 0x02) && max_size > 2)
-       {
-         dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
-         emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
-         offset += 2;
-       }
-      if ((countval & 0x01) && max_size > 1)
-       {
-         dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
-         emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
-         offset += 1;
        }
       return;
     }
@@ -22637,13 +22643,16 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_
     }
 }
 
-/* Copy enough from DEST to SRC to align DEST known to by aligned by ALIGN to
-   DESIRED_ALIGNMENT.
+/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or store enough
+   bytes to DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is
+   ALIGN.  Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or
+   VALUE/VEC_VALUE are ignored.
    Return value is updated DESTMEM.  */
 static rtx
-expand_movmem_prologue (rtx destmem, rtx srcmem,
-                       rtx destptr, rtx srcptr, rtx count,
-                       int align, int desired_alignment)
+expand_movmem_or_setmem_prologue (rtx destmem, rtx srcmem,
+                                 rtx destptr, rtx srcptr, rtx value,
+                                 rtx vec_value, rtx count, int align,
+                                 int desired_alignment, bool issetmem)
 {
   int i;
   for (i = 1; i < desired_alignment; i <<= 1)
@@ -22651,7 +22660,15 @@ expand_movmem_prologue (rtx destmem, rtx srcmem,
       if (align <= i)
        {
          rtx label = ix86_expand_aligntest (destptr, i, false);
-         destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
+         if (issetmem)
+           {
+             if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
+               destmem = emit_memset (destmem, destptr, vec_value, i);
+             else
+               destmem = emit_memset (destmem, destptr, value, i);
+           }
+         else
+           destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
          ix86_adjust_counter (count, i);
          emit_label (label);
          LABEL_NUSES (label) = 1;
@@ -22661,22 +22678,28 @@ expand_movmem_prologue (rtx destmem, rtx srcmem,
   return destmem;
 }
 
-/* Copy enough from DST to SRC to align DST known to DESIRED_ALIGN.
-   ALIGN_BYTES is how many bytes need to be copied.
-   The function updates DST and SRC, namely, it sets proper alignment.
-   DST is returned via return value, SRC is updated via pointer SRCP.  */
+/* This function is like the previous one, except here we know how many bytes
+   need to be copied.  That allows us to update alignment not only of DST, which
+   is returned, but also of SRC, which is passed as a pointer for that
+   reason.  */
 static rtx
-expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
-                                int desired_align, int align_bytes)
+expand_constant_movmem_or_setmem_prologue (rtx dst, rtx *srcp, rtx destreg,
+                                          rtx srcreg, rtx value, rtx vec_value,
+                                          int desired_align, int align_bytes,
+                                          bool issetmem)
 {
-  rtx src = *srcp;
+  rtx src = NULL;
   rtx orig_dst = dst;
-  rtx orig_src = src;
+  rtx orig_src = NULL;
   int piece_size = 1;
   int copied_bytes = 0;
-  int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
-  if (src_align_bytes >= 0)
-    src_align_bytes = desired_align - src_align_bytes;
+
+  if (!issetmem)
+    {
+      gcc_assert (srcp != NULL);
+      src = *srcp;
+      orig_src = src;
+    }
 
   for (piece_size = 1;
        piece_size <= desired_align && copied_bytes < align_bytes;
@@ -22684,109 +22707,48 @@ expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
     {
       if (align_bytes & piece_size)
        {
-         dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
+         if (issetmem)
+           {
+             if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
+               dst = emit_memset (dst, destreg, vec_value, piece_size);
+             else
+               dst = emit_memset (dst, destreg, value, piece_size);
+           }
+         else
+           dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
          copied_bytes += piece_size;
        }
     }
-
   if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
     set_mem_align (dst, desired_align * BITS_PER_UNIT);
-  if (src_align_bytes >= 0)
-    {
-      unsigned int src_align;
-      for (src_align = desired_align; src_align >= 2; src_align >>= 1)
-       {
-         if ((src_align_bytes & (src_align - 1))
-              == (align_bytes & (src_align - 1)))
-           break;
-       }
-      if (src_align > (unsigned int) desired_align)
-       src_align = desired_align;
-      if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
-       set_mem_align (src, src_align * BITS_PER_UNIT);
-    }
   if (MEM_SIZE_KNOWN_P (orig_dst))
     set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
-  if (MEM_SIZE_KNOWN_P (orig_src))
-    set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
-  *srcp = src;
-  return dst;
-}
 
-/* Set enough from DEST to align DEST known to by aligned by ALIGN to
-   DESIRED_ALIGNMENT.  */
-static void
-expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
-                       int align, int desired_alignment)
-{
-  if (align <= 1 && desired_alignment > 1)
-    {
-      rtx label = ix86_expand_aligntest (destptr, 1, false);
-      destmem = change_address (destmem, QImode, destptr);
-      emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
-      ix86_adjust_counter (count, 1);
-      emit_label (label);
-      LABEL_NUSES (label) = 1;
-    }
-  if (align <= 2 && desired_alignment > 2)
+  if (!issetmem)
     {
-      rtx label = ix86_expand_aligntest (destptr, 2, false);
-      destmem = change_address (destmem, HImode, destptr);
-      emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
-      ix86_adjust_counter (count, 2);
-      emit_label (label);
-      LABEL_NUSES (label) = 1;
-    }
-  if (align <= 4 && desired_alignment > 4)
-    {
-      rtx label = ix86_expand_aligntest (destptr, 4, false);
-      destmem = change_address (destmem, SImode, destptr);
-      emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
-      ix86_adjust_counter (count, 4);
-      emit_label (label);
-      LABEL_NUSES (label) = 1;
+      int src_align_bytes = get_mem_align_offset (src, desired_align
+                                                      * BITS_PER_UNIT);
+      if (src_align_bytes >= 0)
+       src_align_bytes = desired_align - src_align_bytes;
+      if (src_align_bytes >= 0)
+       {
+         unsigned int src_align;
+         for (src_align = desired_align; src_align >= 2; src_align >>= 1)
+           {
+             if ((src_align_bytes & (src_align - 1))
+                  == (align_bytes & (src_align - 1)))
+               break;
+           }
+         if (src_align > (unsigned int) desired_align)
+           src_align = desired_align;
+         if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
+           set_mem_align (src, src_align * BITS_PER_UNIT);
+       }
+      if (MEM_SIZE_KNOWN_P (orig_src))
+       set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
+      *srcp = src;
     }
-  gcc_assert (desired_alignment <= 8);
-}
 
-/* Set enough from DST to align DST known to by aligned by ALIGN to
-   DESIRED_ALIGN.  ALIGN_BYTES is how many bytes need to be stored.  */
-static rtx
-expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
-                                int desired_align, int align_bytes)
-{
-  int off = 0;
-  rtx orig_dst = dst;
-  if (align_bytes & 1)
-    {
-      dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
-      off = 1;
-      emit_insn (gen_strset (destreg, dst,
-                            gen_lowpart (QImode, value)));
-    }
-  if (align_bytes & 2)
-    {
-      dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
-      if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
-       set_mem_align (dst, 2 * BITS_PER_UNIT);
-      off = 2;
-      emit_insn (gen_strset (destreg, dst,
-                            gen_lowpart (HImode, value)));
-    }
-  if (align_bytes & 4)
-    {
-      dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
-      if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
-       set_mem_align (dst, 4 * BITS_PER_UNIT);
-      off = 4;
-      emit_insn (gen_strset (destreg, dst,
-                            gen_lowpart (SImode, value)));
-    }
-  dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
-  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
-    set_mem_align (dst, desired_align * BITS_PER_UNIT);
-  if (MEM_SIZE_KNOWN_P (orig_dst))
-    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
   return dst;
 }
 
@@ -22961,10 +22923,104 @@ decide_alignment (int align,
   return desired_align;
 }
 
-/* Expand string move (memcpy) operation.  Use i386 string operations
-   when profitable.  expand_setmem contains similar code.  The code
-   depends upon architecture, block size and alignment, but always has
-   the same overall structure:
+
+/* Helper function for memcpy.  For QImode value 0xXY produce
+   0xXYXYXYXY of width specified by MODE.  This is essentially
+   a * 0x10101010, but we can do slightly better than
+   synth_mult by unwinding the sequence by hand on CPUs with
+   slow multiply.  */
+static rtx
+promote_duplicated_reg (enum machine_mode mode, rtx val)
+{
+  enum machine_mode valmode = GET_MODE (val);
+  rtx tmp;
+  int nops = mode == DImode ? 3 : 2;
+
+  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
+  if (val == const0_rtx)
+    return copy_to_mode_reg (mode, CONST0_RTX (mode));
+  if (CONST_INT_P (val))
+    {
+      HOST_WIDE_INT v = INTVAL (val) & 255;
+
+      v |= v << 8;
+      v |= v << 16;
+      if (mode == DImode)
+        v |= (v << 16) << 16;
+      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
+    }
+
+  if (valmode == VOIDmode)
+    valmode = QImode;
+  if (valmode != QImode)
+    val = gen_lowpart (QImode, val);
+  if (mode == QImode)
+    return val;
+  if (!TARGET_PARTIAL_REG_STALL)
+    nops--;
+  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
+      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
+      <= (ix86_cost->shift_const + ix86_cost->add) * nops
+          + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
+    {
+      rtx reg = convert_modes (mode, QImode, val, true);
+      tmp = promote_duplicated_reg (mode, const1_rtx);
+      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
+                                 OPTAB_DIRECT);
+    }
+  else
+    {
+      rtx reg = convert_modes (mode, QImode, val, true);
+
+      if (!TARGET_PARTIAL_REG_STALL)
+       if (mode == SImode)
+         emit_insn (gen_movsi_insv_1 (reg, reg));
+       else
+         emit_insn (gen_movdi_insv_1 (reg, reg));
+      else
+       {
+         tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
+                                    NULL, 1, OPTAB_DIRECT);
+         reg =
+           expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
+       }
+      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
+                                NULL, 1, OPTAB_DIRECT);
+      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
+      if (mode == SImode)
+       return reg;
+      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
+                                NULL, 1, OPTAB_DIRECT);
+      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
+      return reg;
+    }
+}
+
+/* Duplicate value VAL using promote_duplicated_reg into maximal size that will
+   be needed by main loop copying SIZE_NEEDED chunks and prologue getting
+   alignment from ALIGN to DESIRED_ALIGN.  */
+static rtx
+promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
+                               int align)
+{
+  rtx promoted_val;
+
+  if (TARGET_64BIT
+      && (size_needed > 4 || (desired_align > align && desired_align > 4)))
+    promoted_val = promote_duplicated_reg (DImode, val);
+  else if (size_needed > 2 || (desired_align > align && desired_align > 2))
+    promoted_val = promote_duplicated_reg (SImode, val);
+  else if (size_needed > 1 || (desired_align > align && desired_align > 1))
+    promoted_val = promote_duplicated_reg (HImode, val);
+  else
+    promoted_val = val;
+
+  return promoted_val;
+}
+
+/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
+   operations when profitable.  The code depends upon architecture, block size
+   and alignment, but always has the same overall structure:
 
    1) Prologue guard: Conditional that jumps up to epilogues for small
       blocks that can be handled by epilogue alone.  This is faster
@@ -22974,24 +23030,24 @@ decide_alignment (int align,
       Optional dynamic check for size and libcall for large
       blocks is emitted here too, with -minline-stringops-dynamically.
 
-   2) Prologue: copy first few bytes in order to get destination
+   2) Prologue: copy/set first few bytes in order to get destination
       aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
       than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
-      copied.  We emit either a jump tree on power of two sized
+      copied/set.  We emit either a jump tree on power of two sized
       blocks, or a byte loop.
 
-   3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
-      with specified algorithm.
+   3) Main body: the copying/storing loop itself, copying/storing in SIZE_NEEDED
+      chunks with specified algorithm.
 
-   4) Epilogue: code copying tail of the block that is too small to be
+   4) Epilogue: code copying/storing tail of the block that is too small to be
       handled by main body (or up to size guarded by prologue guard).  */
-
-bool
-ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
-                   rtx expected_align_exp, rtx expected_size_exp)
+static bool
+ix86_expand_movmem_or_setmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
+                             rtx align_exp, rtx expected_align_exp,
+                             rtx expected_size_exp, bool issetmem)
 {
   rtx destreg;
-  rtx srcreg;
+  rtx srcreg = NULL;
   rtx label = NULL;
   rtx tmp;
   rtx jump_around_label = NULL;
@@ -23001,6 +23057,9 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
   int size_needed = 0, epilogue_size_needed;
   int desired_align = 0, align_bytes = 0;
   enum stringop_alg alg;
+  rtx promoted_val = NULL;
+  rtx vec_promoted_val = NULL;
+  bool force_loopy_epilogue = false;
   int dynamic_check;
   bool need_zero_guard = false;
   bool noalign;
@@ -23015,7 +23074,8 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
     align = INTVAL (expected_align_exp);
   /* ALIGN is the minimum of destination and source alignment, but we care here
      just about destination alignment.  */
-  else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
+  else if (!issetmem
+          && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
     align = MEM_ALIGN (dst) / BITS_PER_UNIT;
 
   if (CONST_INT_P (count_exp))
@@ -23029,15 +23089,21 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
 
   /* Step 0: Decide on preferred algorithm, desired alignment and
      size of chunks to be copied by main loop.  */
-  alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
+  alg = decide_alg (count, expected_size, issetmem, &dynamic_check, &noalign);
   if (alg == libcall)
     return false;
   gcc_assert (alg != no_stringop);
 
+  /* For now the vector version of memset is generated only for memory
+     zeroing, as creating the promoted vector value is very cheap there.  */
+  if (issetmem && alg == vector_loop && val_exp != const0_rtx)
+    alg = unrolled_loop;
+
   if (!count)
     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
   destreg = copy_addr_to_reg (XEXP (dst, 0));
-  srcreg = copy_addr_to_reg (XEXP (src, 0));
+  if (!issetmem)
+    srcreg = copy_addr_to_reg (XEXP (src, 0));
 
   unroll_factor = 1;
   move_mode = word_mode;
@@ -23115,14 +23181,39 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
     }
   gcc_assert (desired_align >= 1 && align >= 1);
 
-  /* Ensure that alignment prologue won't copy past end of block.  */
-  if (size_needed > 1 || (desired_align > 1 && desired_align > align))
-    {
-      epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
+  /* Do the cheap promotion to allow better CSE across the
+     main loop and epilogue (i.e. one load of the big constant in
+     front of all the code).  */
+  if (issetmem && CONST_INT_P (val_exp))
+    {
+      if (alg == vector_loop)
+       {
+         gcc_assert (val_exp == const0_rtx);
+         vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
+         promoted_val = promote_duplicated_reg_to_size (val_exp,
+                                                        GET_MODE_SIZE (word_mode),
+                                                        desired_align, align);
+       }
+      else
+       {
+         promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
+                                                        desired_align, align);
+       }
+    }
+  /* Ensure that alignment prologue won't copy past end of block.  */
+  if (size_needed > 1 || (desired_align > 1 && desired_align > align))
+    {
+      epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
       /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
         Make sure it is power of 2.  */
       epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
 
+      /* To improve performance of small blocks, we jump around the VAL
+        promoting code.  This means that if the promoted VAL is not constant,
+        we might not use it in the epilogue and have to use the byte
+        loop variant.  */
+      if (issetmem && epilogue_size_needed > 2 && !promoted_val)
+       force_loopy_epilogue = true;
       if (count)
        {
          if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
@@ -23152,7 +23243,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
      used.  */
   if (dynamic_check != -1)
     {
-      if (CONST_INT_P (count_exp))
+      if (!issetmem && CONST_INT_P (count_exp))
        {
          if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
            {
@@ -23168,13 +23259,20 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
          emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
                                   LEU, 0, GET_MODE (count_exp), 1, hot_label);
          predict_jump (REG_BR_PROB_BASE * 90 / 100);
-         emit_block_move_via_libcall (dst, src, count_exp, false);
+         if (issetmem)
+           set_storage_via_libcall (dst, count_exp, val_exp, false);
+         else
+           emit_block_move_via_libcall (dst, src, count_exp, false);
          emit_jump (jump_around_label);
          emit_label (hot_label);
        }
     }
 
   /* Step 2: Alignment prologue.  */
+  /* Do the expensive promotion once we branched off the small blocks.  */
+  if (issetmem && !promoted_val)
+    promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
+                                                  desired_align, align);
 
   if (desired_align > align)
     {
@@ -23184,17 +23282,26 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
             constant offset in aliasing info.  It don't seems to worth
             the pain to maintain it for the first move, so throw away
             the info early.  */
-         src = change_address (src, BLKmode, srcreg);
          dst = change_address (dst, BLKmode, destreg);
-         dst = expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
-                                       desired_align);
+         if (!issetmem)
+           src = change_address (src, BLKmode, srcreg);
+         dst = expand_movmem_or_setmem_prologue (dst, src, destreg, srcreg,
+                                           promoted_val, vec_promoted_val,
+                                           count_exp, align, desired_align,
+                                           issetmem);
        }
       else
        {
          /* If we know how many bytes need to be stored before dst is
             sufficiently aligned, maintain aliasing info accurately.  */
-         dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
-                                                desired_align, align_bytes);
+         dst = expand_constant_movmem_or_setmem_prologue (dst, &src, destreg,
+                                                          srcreg,
+                                                          promoted_val,
+                                                          vec_promoted_val,
+                                                          desired_align,
+                                                          align_bytes,
+                                                          issetmem);
+
          count_exp = plus_constant (counter_mode (count_exp),
                                     count_exp, -align_bytes);
          count -= align_bytes;
@@ -23226,6 +23333,8 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
       LABEL_NUSES (label) = 1;
       label = NULL;
       epilogue_size_needed = 1;
+      if (issetmem)
+       promoted_val = val_exp;
     }
   else if (label == NULL_RTX)
     epilogue_size_needed = size_needed;
@@ -23241,29 +23350,35 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
     case loop_1_byte:
     case loop:
     case unrolled_loop:
-    case vector_loop:
-      expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
+      expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
                                     count_exp, move_mode, unroll_factor,
                                     expected_size);
       break;
+    case vector_loop:
+      expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
+                                    vec_promoted_val, count_exp, move_mode,
+                                    unroll_factor, expected_size);
+      break;
     case rep_prefix_8_byte:
     case rep_prefix_4_byte:
     case rep_prefix_1_byte:
-      expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
-                                move_mode);
+      expand_movmem_or_setmem_via_rep (dst, src, destreg, srcreg, promoted_val,
+                                      val_exp, count_exp, move_mode, issetmem);
       break;
     }
   /* Adjust properly the offset of src and dest memory for aliasing.  */
   if (CONST_INT_P (count_exp))
     {
-      src = adjust_automodify_address_nv (src, BLKmode, srcreg,
-                                         (count / size_needed) * size_needed);
+      if (!issetmem)
+       src = adjust_automodify_address_nv (src, BLKmode, srcreg,
+                                           (count / size_needed) * size_needed);
       dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
                                          (count / size_needed) * size_needed);
     }
   else
     {
-      src = change_address (src, BLKmode, srcreg);
+      if (!issetmem)
+       src = change_address (src, BLKmode, srcreg);
       dst = change_address (dst, BLKmode, destreg);
     }
 
@@ -23272,7 +23387,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
   if (label)
     {
       /* When the main loop is done, COUNT_EXP might hold original count,
-        while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
+        while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
         Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
         bytes. Compensate if needed.  */
 
@@ -23290,408 +23405,45 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
     }
 
   if (count_exp != const0_rtx && epilogue_size_needed > 1)
-    expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
-                           epilogue_size_needed);
-  if (jump_around_label)
-    emit_label (jump_around_label);
-  return true;
-}
-
-/* Helper function for memcpy.  For QImode value 0xXY produce
-   0xXYXYXYXY of wide specified by MODE.  This is essentially
-   a * 0x10101010, but we can do slightly better than
-   synth_mult by unwinding the sequence by hand on CPUs with
-   slow multiply.  */
-static rtx
-promote_duplicated_reg (enum machine_mode mode, rtx val)
-{
-  enum machine_mode valmode = GET_MODE (val);
-  rtx tmp;
-  int nops = mode == DImode ? 3 : 2;
-
-  gcc_assert (mode == SImode || mode == DImode);
-  if (val == const0_rtx)
-    return copy_to_mode_reg (mode, const0_rtx);
-  if (CONST_INT_P (val))
     {
-      HOST_WIDE_INT v = INTVAL (val) & 255;
-
-      v |= v << 8;
-      v |= v << 16;
-      if (mode == DImode)
-        v |= (v << 16) << 16;
-      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
-    }
-
-  if (valmode == VOIDmode)
-    valmode = QImode;
-  if (valmode != QImode)
-    val = gen_lowpart (QImode, val);
-  if (mode == QImode)
-    return val;
-  if (!TARGET_PARTIAL_REG_STALL)
-    nops--;
-  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
-      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
-      <= (ix86_cost->shift_const + ix86_cost->add) * nops
-          + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
-    {
-      rtx reg = convert_modes (mode, QImode, val, true);
-      tmp = promote_duplicated_reg (mode, const1_rtx);
-      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
-                                 OPTAB_DIRECT);
-    }
-  else
-    {
-      rtx reg = convert_modes (mode, QImode, val, true);
-
-      if (!TARGET_PARTIAL_REG_STALL)
-       if (mode == SImode)
-         emit_insn (gen_movsi_insv_1 (reg, reg));
-       else
-         emit_insn (gen_movdi_insv_1 (reg, reg));
+      if (force_loopy_epilogue)
+       expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
+                                        epilogue_size_needed);
       else
        {
-         tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
-                                    NULL, 1, OPTAB_DIRECT);
-         reg =
-           expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
+         if (issetmem)
+           expand_setmem_epilogue (dst, destreg, promoted_val,
+                                   vec_promoted_val, count_exp,
+                                   epilogue_size_needed);
+         else
+           expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
+                                   epilogue_size_needed);
        }
-      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
-                                NULL, 1, OPTAB_DIRECT);
-      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
-      if (mode == SImode)
-       return reg;
-      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
-                                NULL, 1, OPTAB_DIRECT);
-      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
-      return reg;
     }
+  if (jump_around_label)
+    emit_label (jump_around_label);
+  return true;
 }
 
-/* Duplicate value VAL using promote_duplicated_reg into maximal size that will
-   be needed by main loop copying SIZE_NEEDED chunks and prologue getting
-   alignment from ALIGN to DESIRED_ALIGN.  */
-static rtx
-promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
+/* Wrapper for ix86_expand_movmem_or_setmem for memcpy case.  */
+bool
+ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
+                   rtx expected_align_exp, rtx expected_size_exp)
 {
-  rtx promoted_val;
-
-  if (TARGET_64BIT
-      && (size_needed > 4 || (desired_align > align && desired_align > 4)))
-    promoted_val = promote_duplicated_reg (DImode, val);
-  else if (size_needed > 2 || (desired_align > align && desired_align > 2))
-    promoted_val = promote_duplicated_reg (SImode, val);
-  else if (size_needed > 1 || (desired_align > align && desired_align > 1))
-    promoted_val = promote_duplicated_reg (HImode, val);
-  else
-    promoted_val = val;
-
-  return promoted_val;
+  return ix86_expand_movmem_or_setmem (dst, src, count_exp, NULL, align_exp,
+                   expected_align_exp, expected_size_exp, false);
 }
 
-/* Expand string clear operation (bzero).  Use i386 string operations when
-   profitable.  See expand_movmem comment for explanation of individual
-   steps performed.  */
+/* Wrapper for ix86_expand_movmem_or_setmem for memset case.  */
 bool
 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
                    rtx expected_align_exp, rtx expected_size_exp)
 {
-  rtx destreg;
-  rtx label = NULL;
-  rtx tmp;
-  rtx jump_around_label = NULL;
-  HOST_WIDE_INT align = 1;
-  unsigned HOST_WIDE_INT count = 0;
-  HOST_WIDE_INT expected_size = -1;
-  int size_needed = 0, epilogue_size_needed;
-  int desired_align = 0, align_bytes = 0;
-  enum stringop_alg alg;
-  rtx promoted_val = NULL;
-  bool force_loopy_epilogue = false;
-  int dynamic_check;
-  bool need_zero_guard = false;
-  bool noalign;
-  enum machine_mode move_mode = VOIDmode;
-  int unroll_factor;
-
-  if (CONST_INT_P (align_exp))
-    align = INTVAL (align_exp);
-  /* i386 can do misaligned access on reasonably increased cost.  */
-  if (CONST_INT_P (expected_align_exp)
-      && INTVAL (expected_align_exp) > align)
-    align = INTVAL (expected_align_exp);
-  if (CONST_INT_P (count_exp))
-    count = expected_size = INTVAL (count_exp);
-  if (CONST_INT_P (expected_size_exp) && count == 0)
-    expected_size = INTVAL (expected_size_exp);
-
-  /* Make sure we don't need to care about overflow later on.  */
-  if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
-    return false;
-
-  /* Step 0: Decide on preferred algorithm, desired alignment and
-     size of chunks to be copied by main loop.  */
-
-  alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
-  if (alg == libcall)
-    return false;
-  gcc_assert (alg != no_stringop);
-
-  if (!count)
-    count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
-  destreg = copy_addr_to_reg (XEXP (dst, 0));
-
-  move_mode = word_mode;
-  unroll_factor = 1;
-  switch (alg)
-    {
-    case libcall:
-    case no_stringop:
-    case last_alg:
-      gcc_unreachable ();
-    case loop:
-      need_zero_guard = true;
-      break;
-    case vector_loop:
-    case unrolled_loop:
-      need_zero_guard = true;
-      unroll_factor = 4;
-      break;
-    case rep_prefix_8_byte:
-      move_mode = DImode;
-      break;
-    case rep_prefix_4_byte:
-      move_mode = SImode;
-      break;
-    case rep_prefix_1_byte:
-      move_mode = QImode;
-      break;
-    case loop_1_byte:
-      need_zero_guard = true;
-      move_mode = QImode;
-      break;
-    }
-  size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
-  epilogue_size_needed = size_needed;
-
-  desired_align = decide_alignment (align, alg, expected_size, move_mode);
-  if (!TARGET_ALIGN_STRINGOPS || noalign)
-    align = desired_align;
-
-  /* Step 1: Prologue guard.  */
-
-  /* Alignment code needs count to be in register.  */
-  if (CONST_INT_P (count_exp) && desired_align > align)
-    {
-      if (INTVAL (count_exp) > desired_align
-         && INTVAL (count_exp) > size_needed)
-       {
-         align_bytes
-           = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
-         if (align_bytes <= 0)
-           align_bytes = 0;
-         else
-           align_bytes = desired_align - align_bytes;
-       }
-      if (align_bytes == 0)
-       {
-         enum machine_mode mode = SImode;
-         if (TARGET_64BIT && (count & ~0xffffffff))
-           mode = DImode;
-         count_exp = force_reg (mode, count_exp);
-       }
-    }
-  /* Do the cheap promotion to allow better CSE across the
-     main loop and epilogue (ie one load of the big constant in the
-     front of all code.  */
-  if (CONST_INT_P (val_exp))
-    promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
-                                                  desired_align, align);
-  /* Ensure that alignment prologue won't copy past end of block.  */
-  if (size_needed > 1 || (desired_align > 1 && desired_align > align))
-    {
-      epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
-      /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
-        Make sure it is power of 2.  */
-      epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
-
-      /* To improve performance of small blocks, we jump around the VAL
-        promoting mode.  This mean that if the promoted VAL is not constant,
-        we might not use it in the epilogue and have to use byte
-        loop variant.  */
-      if (epilogue_size_needed > 2 && !promoted_val)
-        force_loopy_epilogue = true;
-      if (count)
-       {
-         if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
-           {
-             /* If main algorithm works on QImode, no epilogue is needed.
-                For small sizes just don't align anything.  */
-             if (size_needed == 1)
-               desired_align = align;
-             else
-               goto epilogue;
-           }
-       }
-      else
-       {
-         label = gen_label_rtx ();
-         emit_cmp_and_jump_insns (count_exp,
-                                  GEN_INT (epilogue_size_needed),
-                                  LTU, 0, counter_mode (count_exp), 1, label);
-         if (expected_size == -1 || expected_size <= epilogue_size_needed)
-           predict_jump (REG_BR_PROB_BASE * 60 / 100);
-         else
-           predict_jump (REG_BR_PROB_BASE * 20 / 100);
-       }
-    }
-  if (dynamic_check != -1)
-    {
-      rtx hot_label = gen_label_rtx ();
-      jump_around_label = gen_label_rtx ();
-      emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
-                              LEU, 0, counter_mode (count_exp), 1, hot_label);
-      predict_jump (REG_BR_PROB_BASE * 90 / 100);
-      set_storage_via_libcall (dst, count_exp, val_exp, false);
-      emit_jump (jump_around_label);
-      emit_label (hot_label);
-    }
-
-  /* Step 2: Alignment prologue.  */
-
-  /* Do the expensive promotion once we branched off the small blocks.  */
-  if (!promoted_val)
-    promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
-                                                  desired_align, align);
-  gcc_assert (desired_align >= 1 && align >= 1);
-
-  if (desired_align > align)
-    {
-      if (align_bytes == 0)
-       {
-         /* Except for the first move in epilogue, we no longer know
-            constant offset in aliasing info.  It don't seems to worth
-            the pain to maintain it for the first move, so throw away
-            the info early.  */
-         dst = change_address (dst, BLKmode, destreg);
-         expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
-                                 desired_align);
-       }
-      else
-       {
-         /* If we know how many bytes need to be stored before dst is
-            sufficiently aligned, maintain aliasing info accurately.  */
-         dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
-                                                desired_align, align_bytes);
-         count_exp = plus_constant (counter_mode (count_exp),
-                                    count_exp, -align_bytes);
-         count -= align_bytes;
-       }
-      if (need_zero_guard
-         && (count < (unsigned HOST_WIDE_INT) size_needed
-             || (align_bytes == 0
-                 && count < ((unsigned HOST_WIDE_INT) size_needed
-                             + desired_align - align))))
-       {
-         /* It is possible that we copied enough so the main loop will not
-            execute.  */
-         gcc_assert (size_needed > 1);
-         if (label == NULL_RTX)
-           label = gen_label_rtx ();
-         emit_cmp_and_jump_insns (count_exp,
-                                  GEN_INT (size_needed),
-                                  LTU, 0, counter_mode (count_exp), 1, label);
-         if (expected_size == -1
-             || expected_size < (desired_align - align) / 2 + size_needed)
-           predict_jump (REG_BR_PROB_BASE * 20 / 100);
-         else
-           predict_jump (REG_BR_PROB_BASE * 60 / 100);
-       }
-    }
-  if (label && size_needed == 1)
-    {
-      emit_label (label);
-      LABEL_NUSES (label) = 1;
-      label = NULL;
-      promoted_val = val_exp;
-      epilogue_size_needed = 1;
-    }
-  else if (label == NULL_RTX)
-    epilogue_size_needed = size_needed;
-
-  /* Step 3: Main loop.  */
-
-  switch (alg)
-    {
-    case libcall:
-    case no_stringop:
-    case last_alg:
-      gcc_unreachable ();
-    case loop_1_byte:
-    case loop:
-    case vector_loop:
-    case unrolled_loop:
-      expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
-                                    count_exp, move_mode, unroll_factor,
-                                    expected_size);
-      break;
-    case rep_prefix_8_byte:
-      expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
-                                 DImode, val_exp);
-      break;
-    case rep_prefix_4_byte:
-      expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
-                                 SImode, val_exp);
-      break;
-    case rep_prefix_1_byte:
-      expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
-                                 QImode, val_exp);
-      break;
-    }
-  /* Adjust properly the offset of src and dest memory for aliasing.  */
-  if (CONST_INT_P (count_exp))
-    dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
-                                       (count / size_needed) * size_needed);
-  else
-    dst = change_address (dst, BLKmode, destreg);
-
-  /* Step 4: Epilogue to copy the remaining bytes.  */
-
-  if (label)
-    {
-      /* When the main loop is done, COUNT_EXP might hold original count,
-        while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
-        Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
-        bytes. Compensate if needed.  */
-
-      if (size_needed < epilogue_size_needed)
-       {
-         tmp =
-           expand_simple_binop (counter_mode (count_exp), AND, count_exp,
-                                GEN_INT (size_needed - 1), count_exp, 1,
-                                OPTAB_DIRECT);
-         if (tmp != count_exp)
-           emit_move_insn (count_exp, tmp);
-       }
-      emit_label (label);
-      LABEL_NUSES (label) = 1;
-    }
- epilogue:
-  if (count_exp != const0_rtx && epilogue_size_needed > 1)
-    {
-      if (force_loopy_epilogue)
-       expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
-                                        epilogue_size_needed);
-      else
-       expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
-                               epilogue_size_needed);
-    }
-  if (jump_around_label)
-    emit_label (jump_around_label);
-  return true;
+  return ix86_expand_movmem_or_setmem (dst, NULL, count_exp, val_exp, align_exp,
+                     expected_align_exp, expected_size_exp, true);
 }
 
+
 /* Expand the appropriate insns for doing strlen if not just doing
    repnz; scasb
 
diff --git a/gcc/testsuite/gcc.target/i386/memset-vector_loop-1.c b/gcc/testsuite/gcc.target/i386/memset-vector_loop-1.c
new file mode 100644
index 0000000..ad0d130
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-vector_loop-1.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */
+/* { dg-final { scan-assembler-times "movdqa" 4 } } */
+
+char a[2048];
+void t (void)
+{
+  __builtin_memset (a, 0, 2048);
+}
+
+
diff --git a/gcc/testsuite/gcc.target/i386/memset-vector_loop-2.c b/gcc/testsuite/gcc.target/i386/memset-vector_loop-2.c
new file mode 100644
index 0000000..f2ceb44
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-vector_loop-2.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */
+/* { dg-final { scan-assembler-times "movdqa" 4 } } */
+
+char *a;
+void t (void)
+{
+  __builtin_memset (a, 0, 2048);
+}
+
-- 
1.7.11.7