On 01/18/2016 10:22 AM, Richard Biener wrote:
See also https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52171 - the inline
expansion for small sizes and equality compares should be done on GIMPLE.
Today the strlen pass might be an appropriate place to do this given its
superior knowledge about string lengths.

The idea of turning eq feeding memcmp into a special memcmp_eq is good but
you have to avoid doing that too early - otherwise you'd lose on

   res = memcmp (p, q, sz);
   if (memcmp (p, q, sz) == 0)
    ...

that is, you have to make sure CSE got the chance to common the two calls.
This is why I think this kind of transform needs to happen in specific places
(like during strlen opt) rather than in generic folding.

OK, here's an update. I kept pieces of your patch from that PR, but I also translate memcmps larger than a single operation into memcmp_eq, as in my previous patch.
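
To make the transform concrete, here's a sketch of what the forwprop change is intended to do (illustrative source-level equivalents, not verbatim GIMPLE). Given

   res = memcmp (p, q, 8);
   if (res != 0)
     ...

where res is only ever compared against zero, the call is rewritten to

   res = __builtin_memcmp_eq (p, q, 8);

and, when the length is a small power of two and both arguments are sufficiently aligned, it is instead inlined as a direct wide comparison, roughly

   res = *(const unsigned long *)p != *(const unsigned long *)q;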

Then, I added by_pieces infrastructure for memcmp expansion. To avoid any more code duplication in this area, I abstracted the existing code and converted it to C++ classes since that seemed to fit pretty well.
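
For illustration, here's roughly what compare_by_pieces produces for a 16-byte equality comparison on a 64-bit target with the default branch ratio of 2, written as pseudo-C over aligned words (names invented for the example):

   int memcmp_eq_16 (const unsigned long *x, const unsigned long *y)
   {
     unsigned long t0 = x[0] - y[0];          /* bytes 0..7 */
     unsigned long t1 = (x[1] - y[1]) | t0;   /* bytes 8..15, accumulated */
     if (t1 != 0)        /* one branch per batch of two pieces */
       return 1;         /* the fail label */
     return 0;           /* equal */
   }

With a branch ratio of 1, each word would instead get its own compare-and-jump to the fail label.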

There are a few possible ways I could go with this, which is why I'm posting it more as an RFD at this point.
 - should store_by_pieces be eliminated in favour of doing just
   move_by_pieces with constfns?
 - the C++ification could be extended, making move_by_pieces_d and
   compare_by_pieces_d classes that inherit from a common base (see
   the sketch after this list). This would get rid of the callbacks,
   replacing them with virtuals, and would also make some of the
   current struct members private.
 - should all of the by_pieces code be moved out into a separate file?
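
To make the second point concrete, the virtuals-based design would look roughly like this (a sketch only, not part of the patch):

   class op_by_pieces_d
   {
   protected:
     pieces_addr m_to, m_from;
     /* ...the other members currently in the struct...  */
     void run ();   /* the loop currently in iterate ()  */
     virtual void generate (rtx op0, rtx op1, machine_mode mode) = 0;
   };

   class move_by_pieces_d : public op_by_pieces_d
   {
     insn_gen_fn m_gen_fun;
     virtual void generate (rtx, rtx, machine_mode);
   };

   class compare_by_pieces_d : public op_by_pieces_d
   {
     rtx_code_label *m_fail_label;
     rtx m_accumulator;
     int m_count, m_batch;
     virtual void generate (rtx, rtx, machine_mode);
   };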

Later on we'll probably also want to extend this to allow vector mode operations, but that's a separate patch.

So, any opinions on what I should do with this patch? FWIW it bootstraps and tests OK on x86_64-linux.


Bernd
Index: gcc/builtins.c
===================================================================
--- gcc/builtins.c	(revision 235474)
+++ gcc/builtins.c	(working copy)
@@ -3671,53 +3671,24 @@ expand_cmpstr (insn_code icode, rtx targ
   return NULL_RTX;
 }
 
-/* Try to expand cmpstrn or cmpmem operation ICODE with the given operands.
-   ARG3_TYPE is the type of ARG3_RTX.  Return the result rtx on success,
-   otherwise return null.  */
-
-static rtx
-expand_cmpstrn_or_cmpmem (insn_code icode, rtx target, rtx arg1_rtx,
-			  rtx arg2_rtx, tree arg3_type, rtx arg3_rtx,
-			  HOST_WIDE_INT align)
-{
-  machine_mode insn_mode = insn_data[icode].operand[0].mode;
-
-  if (target && (!REG_P (target) || HARD_REGISTER_P (target)))
-    target = NULL_RTX;
-
-  struct expand_operand ops[5];
-  create_output_operand (&ops[0], target, insn_mode);
-  create_fixed_operand (&ops[1], arg1_rtx);
-  create_fixed_operand (&ops[2], arg2_rtx);
-  create_convert_operand_from (&ops[3], arg3_rtx, TYPE_MODE (arg3_type),
-			       TYPE_UNSIGNED (arg3_type));
-  create_integer_operand (&ops[4], align);
-  if (maybe_expand_insn (icode, 5, ops))
-    return ops[0].value;
-  return NULL_RTX;
-}
-
 /* Expand expression EXP, which is a call to the memcmp built-in function.
    Return NULL_RTX if we failed and the caller should emit a normal call,
-   otherwise try to get the result in TARGET, if convenient.  */
+   otherwise try to get the result in TARGET, if convenient.
+   RESULT_EQ is true if we can relax the returned value to be either zero
+   or nonzero, without caring about the sign.  */
 
 static rtx
-expand_builtin_memcmp (tree exp, rtx target)
+expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
 {
   if (!validate_arglist (exp,
  			 POINTER_TYPE, POINTER_TYPE, INTEGER_TYPE, VOID_TYPE))
     return NULL_RTX;
 
-  /* Note: The cmpstrnsi pattern, if it exists, is not suitable for
-     implementing memcmp because it will stop if it encounters two
-     zero bytes.  */
-  insn_code icode = direct_optab_handler (cmpmem_optab, SImode);
-  if (icode == CODE_FOR_nothing)
-    return NULL_RTX;
-
   tree arg1 = CALL_EXPR_ARG (exp, 0);
   tree arg2 = CALL_EXPR_ARG (exp, 1);
   tree len = CALL_EXPR_ARG (exp, 2);
+  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
+  location_t loc = EXPR_LOCATION (exp);
 
   unsigned int arg1_align = get_pointer_alignment (arg1) / BITS_PER_UNIT;
   unsigned int arg2_align = get_pointer_alignment (arg2) / BITS_PER_UNIT;
@@ -3726,22 +3697,39 @@ expand_builtin_memcmp (tree exp, rtx tar
   if (arg1_align == 0 || arg2_align == 0)
     return NULL_RTX;
 
-  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
-  location_t loc = EXPR_LOCATION (exp);
   rtx arg1_rtx = get_memory_rtx (arg1, len);
   rtx arg2_rtx = get_memory_rtx (arg2, len);
-  rtx arg3_rtx = expand_normal (fold_convert_loc (loc, sizetype, len));
+  rtx len_rtx = expand_normal (fold_convert_loc (loc, sizetype, len));
 
   /* Set MEM_SIZE as appropriate.  */
-  if (CONST_INT_P (arg3_rtx))
+  if (CONST_INT_P (len_rtx))
     {
-      set_mem_size (arg1_rtx, INTVAL (arg3_rtx));
-      set_mem_size (arg2_rtx, INTVAL (arg3_rtx));
+      set_mem_size (arg1_rtx, INTVAL (len_rtx));
+      set_mem_size (arg2_rtx, INTVAL (len_rtx));
     }
 
-  rtx result = expand_cmpstrn_or_cmpmem (icode, target, arg1_rtx, arg2_rtx,
-					 TREE_TYPE (len), arg3_rtx,
-					 MIN (arg1_align, arg2_align));
+  by_pieces_constfn constfn = NULL;
+
+  const char *src_str = c_getstr (arg1);
+  if (src_str == NULL)
+    src_str = c_getstr (arg2);
+  else
+    std::swap (arg1_rtx, arg2_rtx);
+
+  /* If SRC is a string constant and the block comparison would be
+     done by pieces, we can avoid loading the string from memory
+     and instead use the computed constants.  */
+  if (src_str
+      && CONST_INT_P (len_rtx)
+      && (unsigned HOST_WIDE_INT) INTVAL (len_rtx) <= strlen (src_str) + 1)
+    constfn = builtin_memcpy_read_str;
+
+  rtx result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
+				     TREE_TYPE (len), target,
+				     result_eq, result_eq, constfn,
+				     CONST_CAST (char *, src_str));
+
   if (result)
     {
       /* Return the value in the proper mode for this function.  */
@@ -3757,19 +3745,6 @@ expand_builtin_memcmp (tree exp, rtx tar
       return convert_to_mode (mode, result, 0);
     }
 
-  result = target;
-  if (! (result != 0
-	 && REG_P (result) && GET_MODE (result) == mode
-	 && REGNO (result) >= FIRST_PSEUDO_REGISTER))
-    result = gen_reg_rtx (mode);
-
-  emit_library_call_value (memcmp_libfunc, result, LCT_PURE,
-			   TYPE_MODE (integer_type_node), 3,
-			   XEXP (arg1_rtx, 0), Pmode,
-			   XEXP (arg2_rtx, 0), Pmode,
-			   convert_to_mode (TYPE_MODE (sizetype), arg3_rtx,
-					    TYPE_UNSIGNED (sizetype)),
-			   TYPE_MODE (sizetype));
   return result;
 }
 
@@ -5997,9 +5972,16 @@ expand_builtin (tree exp, rtx target, rt
 
     case BUILT_IN_BCMP:
     case BUILT_IN_MEMCMP:
-      target = expand_builtin_memcmp (exp, target);
+    case BUILT_IN_MEMCMP_EQ:
+      target = expand_builtin_memcmp (exp, target, fcode == BUILT_IN_MEMCMP_EQ);
       if (target)
 	return target;
+      if (fcode == BUILT_IN_MEMCMP_EQ)
+	{
+	  /* There is no library function for __builtin_memcmp_eq, so
+	     fall back to emitting a call to the normal memcmp.  */
+	  exp = copy_node (exp);
+	  tree newdecl = builtin_decl_explicit (BUILT_IN_MEMCMP);
+	  TREE_OPERAND (exp, 1) = build_fold_addr_expr (newdecl);
+	}
       break;
 
     case BUILT_IN_SETJMP:
Index: gcc/builtins.def
===================================================================
--- gcc/builtins.def	(revision 235474)
+++ gcc/builtins.def	(working copy)
@@ -864,6 +864,10 @@ DEF_BUILTIN_STUB (BUILT_IN_STACK_SAVE, "
 DEF_BUILTIN_STUB (BUILT_IN_STACK_RESTORE, "__builtin_stack_restore")
 DEF_BUILTIN_STUB (BUILT_IN_ALLOCA_WITH_ALIGN, "__builtin_alloca_with_align")
 
+/* An internal version of memcmp, used when the result is only tested for
+   equality with zero.  */
+DEF_BUILTIN_STUB (BUILT_IN_MEMCMP_EQ, "__builtin_memcmp_eq")
+
 /* Object size checking builtins.  */
 DEF_GCC_BUILTIN	       (BUILT_IN_OBJECT_SIZE, "object_size", BT_FN_SIZE_CONST_PTR_INT, ATTR_PURE_NOTHROW_LEAF_LIST)
 DEF_EXT_LIB_BUILTIN_CHKP (BUILT_IN_MEMCPY_CHK, "__memcpy_chk", BT_FN_PTR_PTR_CONST_PTR_SIZE_SIZE, ATTR_RET1_NOTHROW_NONNULL_LEAF)
Index: gcc/defaults.h
===================================================================
--- gcc/defaults.h	(revision 235474)
+++ gcc/defaults.h	(working copy)
@@ -1039,6 +1039,11 @@ see the files COPYING3 and COPYING.RUNTI
 #define STORE_MAX_PIECES  MIN (MOVE_MAX_PIECES, 2 * sizeof (HOST_WIDE_INT))
 #endif
 
+/* Likewise for block comparisons.  */
+#ifndef COMPARE_MAX_PIECES
+#define COMPARE_MAX_PIECES  MOVE_MAX_PIECES
+#endif
+
 #ifndef MAX_MOVE_MAX
 #define MAX_MOVE_MAX MOVE_MAX
 #endif
Index: gcc/doc/tm.texi
===================================================================
--- gcc/doc/tm.texi	(revision 235484)
+++ gcc/doc/tm.texi	(working copy)
@@ -6311,8 +6311,9 @@ Both @var{size} and @var{alignment} are
 units.
 
 The parameter @var{op} is one of: @code{CLEAR_BY_PIECES},
-@code{MOVE_BY_PIECES}, @code{SET_BY_PIECES}, @code{STORE_BY_PIECES}.
-These describe the type of memory operation under consideration.
+@code{MOVE_BY_PIECES}, @code{SET_BY_PIECES}, @code{STORE_BY_PIECES} or
+@code{COMPARE_BY_PIECES}.  These describe the type of memory operation
+under consideration.
 
 The parameter @var{speed_p} is true if the code is currently being
 optimized for speed rather than size.
@@ -6329,11 +6330,31 @@ in code size, for example where the numb
 move would be greater than that of a library call.
 @end deftypefn
 
+@deftypefn {Target Hook} int TARGET_COMPARE_BY_PIECES_BRANCH_RATIO (machine_mode @var{mode})
+When expanding a block comparison in @var{mode}, GCC can try to reduce the
+number of branches at the expense of more memory operations.  This hook
+allows the target to override the default choice.  It should return the
+factor by which branches should be reduced over the plain expansion with
+one comparison per @var{mode}-sized piece.
+@end deftypefn
+
 @defmac MOVE_MAX_PIECES
 A C expression used by @code{move_by_pieces} to determine the largest unit
 a load or store used to copy memory is.  Defaults to @code{MOVE_MAX}.
 @end defmac
 
+@defmac STORE_MAX_PIECES
+A C expression used by @code{store_by_pieces} to determine the largest unit
+a store used to write memory is.  Defaults to @code{MOVE_MAX_PIECES}, or two
+times the size of @code{HOST_WIDE_INT}, whichever is smaller.
+@end defmac
+
+@defmac COMPARE_MAX_PIECES
+A C expression used by @code{compare_by_pieces} to determine the largest unit
+a load or store used to compare memory is.  Defaults to
+@code{MOVE_MAX_PIECES}.
+@end defmac
+
 @defmac CLEAR_RATIO (@var{speed})
 The threshold of number of scalar move insns, @emph{below} which a sequence
 of insns should be generated to clear memory instead of a string clear insn
Index: gcc/doc/tm.texi.in
===================================================================
--- gcc/doc/tm.texi.in	(revision 235484)
+++ gcc/doc/tm.texi.in	(working copy)
@@ -4653,11 +4653,25 @@ If you don't define this, a reasonable d
 
 @hook TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
 
+@hook TARGET_COMPARE_BY_PIECES_BRANCH_RATIO
+
 @defmac MOVE_MAX_PIECES
 A C expression used by @code{move_by_pieces} to determine the largest unit
 a load or store used to copy memory is.  Defaults to @code{MOVE_MAX}.
 @end defmac
 
+@defmac STORE_MAX_PIECES
+A C expression used by @code{store_by_pieces} to determine the largest unit
+a store used to write memory is.  Defaults to @code{MOVE_MAX_PIECES}, or two
+times the size of @code{HOST_WIDE_INT}, whichever is smaller.
+@end defmac
+
+@defmac COMPARE_MAX_PIECES
+A C expression used by @code{compare_by_pieces} to determine the largest unit
+a load or store used to compare memory is.  Defaults to
+@code{MOVE_MAX_PIECES}.
+@end defmac
+
 @defmac CLEAR_RATIO (@var{speed})
 The threshold of number of scalar move insns, @emph{below} which a sequence
 of insns should be generated to clear memory instead of a string clear insn
Index: gcc/expr.c
===================================================================
--- gcc/expr.c	(revision 235474)
+++ gcc/expr.c	(working copy)
@@ -70,23 +70,6 @@ along with GCC; see the file COPYING3.
    the same indirect address eventually.  */
 int cse_not_expected;
 
-/* This structure is used by move_by_pieces to describe the move to
-   be performed.  */
-struct move_by_pieces_d
-{
-  rtx to;
-  rtx to_addr;
-  int autinc_to;
-  int explicit_inc_to;
-  rtx from;
-  rtx from_addr;
-  int autinc_from;
-  int explicit_inc_from;
-  unsigned HOST_WIDE_INT len;
-  HOST_WIDE_INT offset;
-  int reverse;
-};
-
 /* This structure is used by store_by_pieces to describe the clear to
    be performed.  */
 
@@ -103,8 +86,6 @@ struct store_by_pieces_d
   int reverse;
 };
 
-static void move_by_pieces_1 (insn_gen_fn, machine_mode,
-			      struct move_by_pieces_d *);
 static bool block_move_libcall_safe_for_call_parm (void);
 static bool emit_block_move_via_movmem (rtx, rtx, rtx, unsigned, unsigned, HOST_WIDE_INT,
 					unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT,
@@ -769,177 +750,342 @@ widest_int_mode_for_size (unsigned int s
   return mode;
 }
 
+/* Determine whether an operation OP on LEN bytes with alignment ALIGN can
+   and should be performed piecewise.  */
+
+static bool
+can_do_by_pieces (unsigned HOST_WIDE_INT len, unsigned int align,
+		  enum by_pieces_operation op)
+{
+  return targetm.use_by_pieces_infrastructure_p (len, align, op,
+						 optimize_insn_for_speed_p ());
+}
+
 /* Determine whether the LEN bytes can be moved by using several move
    instructions.  Return nonzero if a call to move_by_pieces should
    succeed.  */
 
-int
-can_move_by_pieces (unsigned HOST_WIDE_INT len,
-		    unsigned int align)
+bool
+can_move_by_pieces (unsigned HOST_WIDE_INT len, unsigned int align)
 {
-  return targetm.use_by_pieces_infrastructure_p (len, align, MOVE_BY_PIECES,
-						 optimize_insn_for_speed_p ());
+  return can_do_by_pieces (len, align, MOVE_BY_PIECES);
 }
 
-/* Generate several move instructions to copy LEN bytes from block FROM to
-   block TO.  (These are MEM rtx's with BLKmode).
+/* Used when performing piecewise block operations, holds information
+   about one of the memory objects involved.  The member functions
+   can be used to generate code for loading from the object and
+   updating the address when iterating.  */
 
-   If PUSH_ROUNDING is defined and TO is NULL, emit_single_push_insn is
-   used to push FROM to the stack.
+struct pieces_addr
+{
+  /* The object being referenced, a MEM.  Can be NULL_RTX to indicate
+     stack pushes.  */
+  rtx m_obj;
+  /* The address of the object.  Can differ from that seen in the
+     MEM rtx if we copied the address to a register.  */
+  rtx m_addr;
+  /* Nonzero if the address of the object already has an autoincrement;
+     the sign indicates whether it is an increment or a decrement.  */
+  signed char m_addr_inc;
+  /* Nonzero if we intend to use autoinc without the address already
+     having autoinc form.  We will insert add insns around each memory
+     reference, expecting later passes to form autoinc addressing modes.
+     The only supported options are predecrement and postincrement.  */
+  signed char m_explicit_inc;
+  /* True if autoincrement is in use, either because the address
+     already had an autoincrement form or because we decided to add
+     explicit increments around each access.  */
+  bool m_auto;
+  /* True if this is an address to be used for load operations rather
+     than stores.  */
+  bool m_is_load;
 
-   ALIGN is maximum stack alignment we can assume.
+  /* Optionally, a function to obtain constants for any given offset into
+     the objects, and data associated with it.  */
+  by_pieces_constfn m_constfn;
+  void *m_cfndata;
+public:
+  pieces_addr (rtx, bool, by_pieces_constfn, void *);
+  rtx adjust (machine_mode, HOST_WIDE_INT);
+  void increment_address (HOST_WIDE_INT);
+  void maybe_predec (HOST_WIDE_INT);
+  void maybe_postinc (HOST_WIDE_INT);
+  void decide_autoinc (machine_mode, bool, HOST_WIDE_INT);
+  int get_addr_inc ()
+  {
+    return m_addr_inc;
+  }
+};
 
-   If ENDP is 0 return to, if ENDP is 1 return memory at the end ala
-   mempcpy, and if ENDP is 2 return memory the end minus one byte ala
-   stpcpy.  */
+/* Initialize a pieces_addr structure from an object OBJ.  If this is
+   NULL, we assume that it is intended to describe a stack push.
+   IS_LOAD is true if the operation to be performed on this object is
+   a load rather than a store.  If it is, then the optional CONSTFN
+   and its associated CFNDATA can be used in place of the memory
+   load.  */
 
-rtx
-move_by_pieces (rtx to, rtx from, unsigned HOST_WIDE_INT len,
-		unsigned int align, int endp)
+pieces_addr::pieces_addr (rtx obj, bool is_load, by_pieces_constfn constfn,
+			  void *cfndata)
+  : m_obj (obj), m_is_load (is_load), m_constfn (constfn), m_cfndata (cfndata)
 {
-  struct move_by_pieces_d data;
-  machine_mode to_addr_mode;
-  machine_mode from_addr_mode = get_address_mode (from);
-  rtx to_addr, from_addr = XEXP (from, 0);
-  unsigned int max_size = MOVE_MAX_PIECES + 1;
-  enum insn_code icode;
-
-  align = MIN (to ? MEM_ALIGN (to) : align, MEM_ALIGN (from));
-
-  data.offset = 0;
-  data.from_addr = from_addr;
-  if (to)
+  if (obj)
     {
-      to_addr_mode = get_address_mode (to);
-      to_addr = XEXP (to, 0);
-      data.to = to;
-      data.autinc_to
-	= (GET_CODE (to_addr) == PRE_INC || GET_CODE (to_addr) == PRE_DEC
-	   || GET_CODE (to_addr) == POST_INC || GET_CODE (to_addr) == POST_DEC);
-      data.reverse
-	= (GET_CODE (to_addr) == PRE_DEC || GET_CODE (to_addr) == POST_DEC);
+      rtx addr = XEXP (obj, 0);
+      rtx_code code = GET_CODE (addr);
+      m_addr = addr;
+      bool dec = code == PRE_DEC || code == POST_DEC;
+      bool inc = code == PRE_INC || code == POST_INC;
+      m_auto = inc || dec;
+      if (m_auto)
+	m_addr_inc = dec ? -1 : 1;
+      else
+	m_addr_inc = 0;
+      /* While we have always looked for these codes here, the code
+	 implementing the memory operation has never handled them.
+	 Support could be added later if necessary or beneficial.  */
+      gcc_assert (code != PRE_INC && code != POST_DEC);
     }
   else
     {
-      to_addr_mode = VOIDmode;
-      to_addr = NULL_RTX;
-      data.to = NULL_RTX;
-      data.autinc_to = 1;
+      m_addr = NULL_RTX;
+      m_auto = true;
       if (STACK_GROWS_DOWNWARD)
-	data.reverse = 1;
+	m_addr_inc = -1;
       else
-	data.reverse = 0;
+	m_addr_inc = 1;
     }
-  data.to_addr = to_addr;
-  data.from = from;
-  data.autinc_from
-    = (GET_CODE (from_addr) == PRE_INC || GET_CODE (from_addr) == PRE_DEC
-       || GET_CODE (from_addr) == POST_INC
-       || GET_CODE (from_addr) == POST_DEC);
+  m_explicit_inc = 0;
+  if (constfn)
+    gcc_assert (is_load);
+}
 
-  data.explicit_inc_from = 0;
-  data.explicit_inc_to = 0;
-  if (data.reverse) data.offset = len;
-  data.len = len;
+/* Decide whether to use autoinc for an address involved in a memory op.
+   MODE is the mode of the accesses, REVERSE is true if we've decided to
+   perform the operation starting from the end, and LEN is the length of
+   the operation.  Don't override an earlier decision to set m_auto.  */
+
+void
+pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse,
+			     HOST_WIDE_INT len)
+{
+  if (m_auto)
+    return;
+
+  bool use_predec = (m_is_load
+		     ? USE_LOAD_PRE_DECREMENT (mode)
+		     : USE_STORE_PRE_DECREMENT (mode));
+  bool use_postinc = (m_is_load
+		      ? USE_LOAD_POST_INCREMENT (mode)
+		      : USE_STORE_POST_INCREMENT (mode));
+  machine_mode addr_mode = get_address_mode (m_obj);
+
+  if (use_predec && reverse)
+    {
+      m_addr = copy_to_mode_reg (addr_mode,
+				 plus_constant (addr_mode,
+						m_addr, len));
+      m_auto = true;
+      m_explicit_inc = -1;
+    }
+  else if (use_postinc && !reverse)
+    {
+      m_addr = copy_to_mode_reg (addr_mode, m_addr);
+      m_auto = true;
+      m_explicit_inc = 1;
+    }
+  else if (CONSTANT_P (m_addr))
+    m_addr = copy_to_mode_reg (addr_mode, m_addr);
+}
+
+/* Adjust the address to refer to the data at OFFSET in MODE.  If we
+   are using autoincrement for this address, we don't add the offset,
+   but we still modify the MEM's properties.  */
+
+rtx
+pieces_addr::adjust (machine_mode mode, HOST_WIDE_INT offset)
+{
+  if (m_obj == NULL_RTX)
+    return NULL_RTX;
+  if (m_constfn)
+    return m_constfn (m_cfndata, offset, mode);
+  if (m_auto)
+    return adjust_automodify_address (m_obj, mode, m_addr, offset);
+  else
+    return adjust_address (m_obj, mode, offset);
+}
+
+/* Emit an add instruction to increment the address by SIZE.  */
+
+void
+pieces_addr::increment_address (HOST_WIDE_INT size)
+{
+  rtx amount = gen_int_mode (size, GET_MODE (m_addr));
+  emit_insn (gen_add2_insn (m_addr, amount));
+}
+
+/* If we are supposed to decrement the address after each access, emit code
+   to do so now.  */
+
+void
+pieces_addr::maybe_predec (HOST_WIDE_INT size)
+{
+  if (m_explicit_inc >= 0)
+    return;
+  gcc_assert (HAVE_PRE_DECREMENT);
+  increment_address (size);
+}
+
+/* If we are supposed to increment the address after each access, emit code
+   to do so now.  */
+
+void
+pieces_addr::maybe_postinc (HOST_WIDE_INT size)
+{
+  if (m_explicit_inc <= 0)
+    return;
+  gcc_assert (HAVE_POST_INCREMENT);
+  increment_address (size);
+}
+
+/* This structure is used by move_by_pieces and compare_by_pieces to
+   describe the operation to be performed.  */
+
+struct op_by_pieces_d
+{
+  pieces_addr m_to, m_from;
+  unsigned HOST_WIDE_INT m_len;
+  HOST_WIDE_INT m_offset;
+  unsigned int m_align;
+  unsigned int m_max_size;
+  bool m_reverse;
+
+  op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
+		  unsigned HOST_WIDE_INT, unsigned int);
+  void iterate (void (*)(rtx, rtx, machine_mode, void *),
+		     machine_mode, void *);
+};
+
+/* The constructor for an op_by_pieces_d structure.  We require two
+   objects named TO and FROM, which are identified as loads or stores
+   by TO_LOAD and FROM_LOAD.  If FROM is a load, the optional FROM_CFN
+   and its associated FROM_CFN_DATA can be used to replace loads with
+   constant values.  LEN describes the length of the operation, and
+   ALIGN the known alignment.  */
+
+op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
+				rtx from, bool from_load,
+				by_pieces_constfn from_cfn,
+				void *from_cfn_data,
+				unsigned HOST_WIDE_INT len,
+				unsigned int align)
+  : m_to (to, to_load, NULL, NULL),
+    m_from (from, from_load, from_cfn, from_cfn_data),
+    m_len (len), m_max_size (MOVE_MAX_PIECES + 1)
+{
+  int toi = m_to.get_addr_inc ();
+  int fromi = m_from.get_addr_inc ();
+  if (toi >= 0 && fromi >= 0)
+    m_reverse = false;
+  else if (toi <= 0 && fromi <= 0)
+    m_reverse = true;
+  else
+    gcc_unreachable ();
+
+  m_offset = m_reverse ? len : 0;
+  align = MIN (to ? MEM_ALIGN (to) : align, MEM_ALIGN (from));
 
   /* If copying requires more than two move insns,
      copy addresses to registers (to make displacements shorter)
      and use post-increment if available.  */
-  if (!(data.autinc_from && data.autinc_to)
-      && move_by_pieces_ninsns (len, align, max_size) > 2)
+  if (move_by_pieces_ninsns (len, align, m_max_size) > 2)
     {
-      /* Find the mode of the largest move...
-	 MODE might not be used depending on the definitions of the
-	 USE_* macros below.  */
-      machine_mode mode ATTRIBUTE_UNUSED
-	= widest_int_mode_for_size (max_size);
+      /* Find the mode of the largest access.  */
+      machine_mode mode = widest_int_mode_for_size (m_max_size);
 
-      if (USE_LOAD_PRE_DECREMENT (mode) && data.reverse && ! data.autinc_from)
-	{
-	  data.from_addr = copy_to_mode_reg (from_addr_mode,
-					     plus_constant (from_addr_mode,
-							    from_addr, len));
-	  data.autinc_from = 1;
-	  data.explicit_inc_from = -1;
-	}
-      if (USE_LOAD_POST_INCREMENT (mode) && ! data.autinc_from)
-	{
-	  data.from_addr = copy_to_mode_reg (from_addr_mode, from_addr);
-	  data.autinc_from = 1;
-	  data.explicit_inc_from = 1;
-	}
-      if (!data.autinc_from && CONSTANT_P (from_addr))
-	data.from_addr = copy_to_mode_reg (from_addr_mode, from_addr);
-      if (USE_STORE_PRE_DECREMENT (mode) && data.reverse && ! data.autinc_to)
-	{
-	  data.to_addr = copy_to_mode_reg (to_addr_mode,
-					   plus_constant (to_addr_mode,
-							  to_addr, len));
-	  data.autinc_to = 1;
-	  data.explicit_inc_to = -1;
-	}
-      if (USE_STORE_POST_INCREMENT (mode) && ! data.reverse && ! data.autinc_to)
-	{
-	  data.to_addr = copy_to_mode_reg (to_addr_mode, to_addr);
-	  data.autinc_to = 1;
-	  data.explicit_inc_to = 1;
-	}
-      if (!data.autinc_to && CONSTANT_P (to_addr))
-	data.to_addr = copy_to_mode_reg (to_addr_mode, to_addr);
+      m_from.decide_autoinc (mode, m_reverse, len);
+      m_to.decide_autoinc (mode, m_reverse, len);
     }
 
   align = alignment_for_piecewise_move (MOVE_MAX_PIECES, align);
+  m_align = align;
+}
+
+/* A callback used when iterating for a move_by_pieces operation.
+   OP0 and OP1 are the destination and source of a move in MODE, as
+   adjusted by the iteration code.  If OP0 is NULL, this means we
+   should generate a push; otherwise we emit a move using the insn
+   gen function pointed to by EXTRA_DATA.  */
+
+static void
+move_by_pieces_genfn (rtx op0, rtx op1, machine_mode mode, void *extra_data)
+{
+  insn_gen_fn genfun = *(insn_gen_fn *)extra_data;
+#ifdef PUSH_ROUNDING
+  if (op0 == NULL_RTX)
+    {
+      emit_single_push_insn (mode, op1, NULL);
+      return;
+    }
+#endif
+  emit_insn (genfun (op0, op1));
+}
+
+/* Generate several move instructions to copy LEN bytes from block FROM to
+   block TO.  (These are MEM rtx's with BLKmode).
+
+   If PUSH_ROUNDING is defined and TO is NULL, emit_single_push_insn is
+   used to push FROM to the stack.
+
+   ALIGN is maximum stack alignment we can assume.
+
+   If ENDP is 0 return to, if ENDP is 1 return memory at the end ala
+   mempcpy, and if ENDP is 2 return memory the end minus one byte ala
+   stpcpy.  */
+
+rtx
+move_by_pieces (rtx to, rtx from, unsigned HOST_WIDE_INT len,
+		unsigned int align, int endp)
+{
+  enum insn_code icode;
+
+#ifndef PUSH_ROUNDING
+  if (to == NULL_RTX)
+    gcc_unreachable ();
+#endif
+
+  op_by_pieces_d data (to, false, from, true, NULL, NULL, len, align);
 
   /* First move what we can in the largest integer mode, then go to
      successively smaller modes.  */
 
-  while (max_size > 1 && data.len > 0)
+  while (data.m_max_size > 1 && data.m_len > 0)
     {
-      machine_mode mode = widest_int_mode_for_size (max_size);
+      machine_mode mode = widest_int_mode_for_size (data.m_max_size);
 
       if (mode == VOIDmode)
 	break;
 
       icode = optab_handler (mov_optab, mode);
-      if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode))
-	move_by_pieces_1 (GEN_FCN (icode), mode, &data);
+      insn_gen_fn fn = GEN_FCN (icode);
+      if (icode != CODE_FOR_nothing
+	  && data.m_align >= GET_MODE_ALIGNMENT (mode))
+	data.iterate (move_by_pieces_genfn, mode, (void *)&fn);
 
-      max_size = GET_MODE_SIZE (mode);
+      data.m_max_size = GET_MODE_SIZE (mode);
     }
 
   /* The code above should have handled everything.  */
-  gcc_assert (!data.len);
+  gcc_assert (!data.m_len);
 
   if (endp)
     {
-      rtx to1;
-
-      gcc_assert (!data.reverse);
-      if (data.autinc_to)
-	{
-	  if (endp == 2)
-	    {
-	      if (HAVE_POST_INCREMENT && data.explicit_inc_to > 0)
-		emit_insn (gen_add2_insn (data.to_addr, constm1_rtx));
-	      else
-		data.to_addr = copy_to_mode_reg (to_addr_mode,
-						 plus_constant (to_addr_mode,
-								data.to_addr,
-								-1));
-	    }
-	  to1 = adjust_automodify_address (data.to, QImode, data.to_addr,
-					   data.offset);
-	}
-      else
+      gcc_assert (!data.m_reverse);
+      if (endp == 2)
 	{
-	  if (endp == 2)
-	    --data.offset;
-	  to1 = adjust_address (data.to, QImode, data.offset);
+	  if (data.m_to.m_auto)
+	    data.m_to.increment_address (-1);
+	  --data.m_offset;
 	}
-      return to1;
+      return data.m_to.adjust (QImode, data.m_offset);
     }
   else
-    return data.to;
+    return to;
 }
 
 /* Return number of insns required to move L bytes by pieces.
@@ -974,71 +1120,148 @@ move_by_pieces_ninsns (unsigned HOST_WID
   return n_insns;
 }
 
-/* Subroutine of move_by_pieces.  Move as many bytes as appropriate
-   with move instructions for mode MODE.  GENFUN is the gen_... function
-   to make a move insn for that mode.  DATA has all the other info.  */
+/* Perform the steps of a piecewise memory operation that can be done
+   with accesses of MODE, stopping once the remaining length is smaller
+   than this mode.  For every access, call GENFUN with the two operands
+   and EXTRA_DATA.  */
 
-static void
-move_by_pieces_1 (insn_gen_fn genfun, machine_mode mode,
-		  struct move_by_pieces_d *data)
+void
+op_by_pieces_d::iterate (void (*genfun)(rtx, rtx, machine_mode, void *),
+			 machine_mode mode, void *extra_data)
 {
   unsigned int size = GET_MODE_SIZE (mode);
   rtx to1 = NULL_RTX, from1;
 
-  while (data->len >= size)
+  while (m_len >= size)
     {
-      if (data->reverse)
-	data->offset -= size;
+      if (m_reverse)
+	m_offset -= size;
 
-      if (data->to)
-	{
-	  if (data->autinc_to)
-	    to1 = adjust_automodify_address (data->to, mode, data->to_addr,
-					     data->offset);
-	  else
-	    to1 = adjust_address (data->to, mode, data->offset);
-	}
+      to1 = m_to.adjust (mode, m_offset);
+      from1 = m_from.adjust (mode, m_offset);
 
-      if (data->autinc_from)
-	from1 = adjust_automodify_address (data->from, mode, data->from_addr,
-					   data->offset);
-      else
-	from1 = adjust_address (data->from, mode, data->offset);
+      m_to.maybe_predec (-(HOST_WIDE_INT)size);
+      m_from.maybe_predec (-(HOST_WIDE_INT)size);
 
-      if (HAVE_PRE_DECREMENT && data->explicit_inc_to < 0)
-	emit_insn (gen_add2_insn (data->to_addr,
-				  gen_int_mode (-(HOST_WIDE_INT) size,
-						GET_MODE (data->to_addr))));
-      if (HAVE_PRE_DECREMENT && data->explicit_inc_from < 0)
-	emit_insn (gen_add2_insn (data->from_addr,
-				  gen_int_mode (-(HOST_WIDE_INT) size,
-						GET_MODE (data->from_addr))));
+      (*genfun) (to1, from1, mode, extra_data);
 
-      if (data->to)
-	emit_insn ((*genfun) (to1, from1));
-      else
-	{
-#ifdef PUSH_ROUNDING
-	  emit_single_push_insn (mode, from1, NULL);
-#else
-	  gcc_unreachable ();
-#endif
-	}
+      m_to.maybe_postinc (size);
+      m_from.maybe_postinc (size);
 
-      if (HAVE_POST_INCREMENT && data->explicit_inc_to > 0)
-	emit_insn (gen_add2_insn (data->to_addr,
-				  gen_int_mode (size,
-						GET_MODE (data->to_addr))));
-      if (HAVE_POST_INCREMENT && data->explicit_inc_from > 0)
-	emit_insn (gen_add2_insn (data->from_addr,
-				  gen_int_mode (size,
-						GET_MODE (data->from_addr))));
+      if (!m_reverse)
+	m_offset += size;
 
-      if (! data->reverse)
-	data->offset += size;
+      m_len -= size;
+    }
+}
 
-      data->len -= size;
+/* Context used by compare_by_pieces_genfn.  It stores the fail label
+   to jump to in case of miscomparison, and, for branch ratios greater
+   than 1, an accumulator plus the current count and the batch size
+   controlling how many pieces are accumulated before a branch.  */
+
+struct compare_by_pieces_data
+{
+  rtx_code_label *fail_label;
+  rtx accumulator;
+  int count, batch;
+};
+
+/* A callback used when iterating for a compare_by_pieces operation.
+   OP0 and OP1 are the values that have been loaded and should be
+   compared in MODE.  DATA holds a pointer to the compare_by_pieces_data
+   context structure.  */
+
+static void
+compare_by_pieces_genfn (rtx op0, rtx op1, machine_mode mode, void *data)
+{
+  compare_by_pieces_data *extra = (compare_by_pieces_data *)data;
+  if (extra->batch > 1)
+    {
+      rtx temp = expand_binop (mode, sub_optab, op0, op1, NULL_RTX,
+			       true, OPTAB_LIB_WIDEN);
+      if (extra->count != 0)
+	temp = expand_binop (mode, ior_optab, extra->accumulator, temp, temp,
+			     true, OPTAB_LIB_WIDEN);
+      extra->accumulator = temp;
+
+      if (++extra->count < extra->batch)
+	return;
+
+      extra->count = 0;
+      op0 = extra->accumulator;
+      op1 = const0_rtx;
+      extra->accumulator = NULL_RTX;
     }
+  do_compare_rtx_and_jump (op0, op1, NE, true, mode, NULL_RTX, NULL,
+			   extra->fail_label, -1);
+}
+
+/* Generate several insns to compare LEN bytes from blocks
+   ARG0 and ARG1.  (These are MEM rtx's with BLKmode).
+
+   ALIGN is the maximum alignment we can assume.
+
+   Optionally, the caller can pass a constfn and associated data in A1_CFN
+   and A1_CFN_DATA, describing that the second operand being compared is a
+   known constant and how to obtain its data.  */
+
+static rtx
+compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
+		   rtx target, unsigned int align,
+		   by_pieces_constfn a1_cfn, void *a1_cfn_data)
+{
+  enum insn_code icode;
+  rtx_code_label *fail_label = gen_label_rtx ();
+  rtx_code_label *end_label = gen_label_rtx ();
+
+  compare_by_pieces_data extra;
+  extra.fail_label = fail_label;
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  op_by_pieces_d data (arg0, true, arg1, true, a1_cfn, a1_cfn_data, len,
+		       align);
+
+  /* First move what we can in the largest integer mode, then go to
+     successively smaller modes.  */
+
+  while (data.m_max_size > 1 && data.m_len > 0)
+    {
+      machine_mode mode = widest_int_mode_for_size (data.m_max_size);
+
+      if (mode == VOIDmode)
+	break;
+
+      icode = optab_handler (mov_optab, mode);
+      if (icode != CODE_FOR_nothing
+	  && data.m_align >= GET_MODE_ALIGNMENT (mode))
+	{
+	  extra.batch = targetm.compare_by_pieces_branch_ratio (mode);
+	  extra.accumulator = NULL_RTX;
+	  extra.count = 0;
+	  data.iterate (compare_by_pieces_genfn, mode, (void *)&extra);
+	  if (extra.accumulator != NULL_RTX)
+	    do_compare_rtx_and_jump (extra.accumulator, const0_rtx, NE, true,
+				     mode, NULL_RTX, NULL, extra.fail_label,
+				     -1);
+	}
+
+      data.m_max_size = GET_MODE_SIZE (mode);
+    }
+  emit_move_insn (target, const0_rtx);
+  emit_jump (end_label);
+  emit_barrier ();
+  emit_label (fail_label);
+  emit_move_insn (target, const1_rtx);
+  emit_label (end_label);
+
+  /* The code above should have handled everything.  */
+  gcc_assert (!data.m_len);
+  return target;
 }
 
 /* Emit code to move a block Y to a block X.  This may be done with
@@ -1068,8 +1291,7 @@ emit_block_move_hints (rtx x, rtx y, rtx
   unsigned int align;
 
   gcc_assert (size);
-  if (CONST_INT_P (size)
-      && INTVAL (size) == 0)
+  if (CONST_INT_P (size) && INTVAL (size) == 0)
     return 0;
 
   switch (method)
@@ -1463,6 +1685,116 @@ emit_block_move_via_loop (rtx x, rtx y,
   emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
 			   true, top_label, REG_BR_PROB_BASE * 90 / 100);
 }
+
+/* Try to expand cmpstrn or cmpmem operation ICODE with the given operands.
+   ARG3_TYPE is the type of ARG3_RTX.  Return the result rtx on success,
+   otherwise return null.  */
+
+rtx
+expand_cmpstrn_or_cmpmem (insn_code icode, rtx target, rtx arg1_rtx,
+			  rtx arg2_rtx, tree arg3_type, rtx arg3_rtx,
+			  HOST_WIDE_INT align)
+{
+  machine_mode insn_mode = insn_data[icode].operand[0].mode;
+
+  if (target && (!REG_P (target) || HARD_REGISTER_P (target)))
+    target = NULL_RTX;
+
+  struct expand_operand ops[5];
+  create_output_operand (&ops[0], target, insn_mode);
+  create_fixed_operand (&ops[1], arg1_rtx);
+  create_fixed_operand (&ops[2], arg2_rtx);
+  create_convert_operand_from (&ops[3], arg3_rtx, TYPE_MODE (arg3_type),
+			       TYPE_UNSIGNED (arg3_type));
+  create_integer_operand (&ops[4], align);
+  if (maybe_expand_insn (icode, 5, ops))
+    return ops[0].value;
+  return NULL_RTX;
+}
+
+/* Try to expand a block compare of X and Y with length LEN using the
+   cmpmem optab, placing the result in TARGET.  LEN_TYPE is the type
+   of the expression that was used to calculate LEN.  ALIGN is the
+   known common alignment.  Return the result rtx on success,
+   otherwise return null.  */
+
+static rtx
+emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			   unsigned align)
+{
+  /* Note: The cmpstrnsi pattern, if it exists, is not suitable for
+     implementing memcmp because it will stop if it encounters two
+     zero bytes.  */
+  insn_code icode = direct_optab_handler (cmpmem_optab, SImode);
+
+  if (icode == CODE_FOR_nothing)
+    return NULL_RTX;
+
+  return expand_cmpstrn_or_cmpmem (icode, target, x, y,
+				   len_type, len, align);
+}
+
+/* Emit code to compare a block Y to a block X.  This may be done with
+   string-compare instructions, with multiple scalar instructions,
+   or with a library call.
+
+   Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
+   they are.  LEN_TYPE is the type of the expression that was used to
+   calculate it.
+
+   If EQUALITY_ONLY is true, it means we don't have to return the tri-state
+   value of a normal memcmp call, instead we can just compare for equality.
+   If FORCE_LIBCALL is true, we should emit a call to memcmp rather than
+   returning NULL_RTX.
+
+   Optionally, the caller can pass a constfn and associated data in Y_CFN
+   and Y_CFN_DATA, describing that the second operand being compared is a
+   known constant and how to obtain its data.
+   Return the result of the comparison, or NULL_RTX if we failed to
+   perform the operation.  */
+
+rtx
+emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
+		      bool equality_only, bool force_libcall,
+		      by_pieces_constfn y_cfn, void *y_cfndata)
+{
+  rtx result = 0;
+
+  if (CONST_INT_P (len) && INTVAL (len) == 0)
+    return const0_rtx;
+
+  gcc_assert (MEM_P (x) && MEM_P (y));
+  unsigned int align = MIN (MEM_ALIGN (x), MEM_ALIGN (y));
+  gcc_assert (align >= BITS_PER_UNIT);
+
+  x = adjust_address (x, BLKmode, 0);
+  y = adjust_address (y, BLKmode, 0);
+
+  if (equality_only
+      && CONST_INT_P (len)
+      && can_do_by_pieces (INTVAL (len), align, COMPARE_BY_PIECES))
+    result = compare_by_pieces (x, y, INTVAL (len), target, align,
+				y_cfn, y_cfndata);
+  else if ((result = emit_block_cmp_via_cmpmem (x, y, len, len_type,
+						target, align)) != NULL_RTX)
+    ;
+  else if (force_libcall)
+    {
+      gcc_assert (ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)));
+      gcc_assert (ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (y)));
+
+      result = target;
+      machine_mode result_mode = TYPE_MODE (integer_type_node);
+      if (result == NULL_RTX
+	  || !REG_P (result)
+	  || GET_MODE (result) != result_mode
+	  || REGNO (result) < FIRST_PSEUDO_REGISTER)
+	result = gen_reg_rtx (result_mode);
+
+      machine_mode size_mode = TYPE_MODE (sizetype);
+      len = convert_to_mode (size_mode, len, TYPE_UNSIGNED (sizetype));
+      emit_library_call_value (memcmp_libfunc, result, LCT_PURE,
+			       result_mode, 3,
+			       XEXP (x, 0), Pmode, XEXP (y, 0), Pmode,
+			       len, size_mode);
+    }
+
+  return result;
+}
 
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
Index: gcc/expr.h
===================================================================
--- gcc/expr.h	(revision 235474)
+++ gcc/expr.h	(working copy)
@@ -82,6 +82,8 @@ enum block_op_methods
   BLOCK_OP_TAILCALL
 };
 
+typedef rtx (*by_pieces_constfn) (void *, HOST_WIDE_INT, machine_mode);
+
 extern GTY(()) tree block_clear_fn;
 extern void init_block_move_fn (const char *);
 extern void init_block_clear_fn (const char *);
@@ -93,6 +95,8 @@ extern rtx emit_block_move_hints (rtx, r
 				  unsigned HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT);
+extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool, bool,
+				 by_pieces_constfn, void *);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
@@ -157,6 +161,11 @@ extern void use_regs (rtx *, int, int);
 /* Mark a PARALLEL as holding a parameter for the next CALL_INSN.  */
 extern void use_group_regs (rtx *, rtx);
 
+#ifdef GCC_INSN_CODES_H
+extern rtx expand_cmpstrn_or_cmpmem (insn_code, rtx, rtx, rtx, tree, rtx,
+				     HOST_WIDE_INT);
+#endif
+
 /* Write zeros through the storage of OBJECT.
    If OBJECT has BLKmode, SIZE is its length in bytes.  */
 extern rtx clear_storage (rtx, rtx, enum block_op_methods);
@@ -187,8 +196,7 @@ extern unsigned HOST_WIDE_INT move_by_pi
    MEMSETP is true if this is a real memset/bzero, not a copy
    of a const string.  */
 extern int can_store_by_pieces (unsigned HOST_WIDE_INT,
-				rtx (*) (void *, HOST_WIDE_INT,
-					 machine_mode),
+				by_pieces_constfn,
 				void *, unsigned int, bool);
 
 /* Generate several move instructions to store LEN bytes generated by
@@ -197,8 +205,7 @@ extern int can_store_by_pieces (unsigned
    ALIGN is maximum alignment we can assume.
    MEMSETP is true if this is a real memset/bzero, not a copy.
    Returns TO + LEN.  */
-extern rtx store_by_pieces (rtx, unsigned HOST_WIDE_INT,
-			    rtx (*) (void *, HOST_WIDE_INT, machine_mode),
+extern rtx store_by_pieces (rtx, unsigned HOST_WIDE_INT, by_pieces_constfn,
 			    void *, unsigned int, bool, int);
 
 /* Emit insns to set X from Y.  */
@@ -279,7 +286,7 @@ rtx get_personality_function (tree);
 /* Determine whether the LEN bytes can be moved by using several move
    instructions.  Return nonzero if a call to move_by_pieces should
    succeed.  */
-extern int can_move_by_pieces (unsigned HOST_WIDE_INT, unsigned int);
+extern bool can_move_by_pieces (unsigned HOST_WIDE_INT, unsigned int);
 
 extern unsigned HOST_WIDE_INT highest_pow2_factor (const_tree);
 
Index: gcc/target.def
===================================================================
--- gcc/target.def	(revision 235474)
+++ gcc/target.def	(working copy)
@@ -3397,8 +3397,9 @@ Both @var{size} and @var{alignment} are
 units.\n\
 \n\
 The parameter @var{op} is one of: @code{CLEAR_BY_PIECES},\n\
-@code{MOVE_BY_PIECES}, @code{SET_BY_PIECES}, @code{STORE_BY_PIECES}.\n\
-These describe the type of memory operation under consideration.\n\
+@code{MOVE_BY_PIECES}, @code{SET_BY_PIECES}, @code{STORE_BY_PIECES} or\n\
+@code{COMPARE_BY_PIECES}.  These describe the type of memory operation\n\
+under consideration.\n\
 \n\
 The parameter @var{speed_p} is true if the code is currently being\n\
 optimized for speed rather than size.\n\
@@ -3418,6 +3419,16 @@ move would be greater than that of a lib
  default_use_by_pieces_infrastructure_p)
 
 DEFHOOK
+(compare_by_pieces_branch_ratio,
+ "When expanding a block comparison in MODE, gcc can try to reduce the\n\
+number of branches at the expense of more memory operations.  This hook\n\
+allows the target to override the default choice.  It should return the\n\
+factor by which branches should be reduced over the plain expansion with\n\
+one comparison per @var{mode}-sized piece.",
+ int, (machine_mode mode),
+ default_compare_by_pieces_branch_ratio)
+
+DEFHOOK
 (optab_supported_p,
  "Return true if the optimizers should use optab @var{op} with\n\
 modes @var{mode1} and @var{mode2} for optimization type @var{opt_type}.\n\
Index: gcc/target.h
===================================================================
--- gcc/target.h	(revision 235474)
+++ gcc/target.h	(working copy)
@@ -79,14 +79,18 @@ enum print_switch_type
 };
 
 /* Types of memory operation understood by the "by_pieces" infrastructure.
-   Used by the TARGET_USE_BY_PIECES_INFRASTRUCTURE_P target hook.  */
+   Used by the TARGET_USE_BY_PIECES_INFRASTRUCTURE_P target hook and
+   internally by the functions in expr.c.  */
 
 enum by_pieces_operation
 {
   CLEAR_BY_PIECES,
   MOVE_BY_PIECES,
   SET_BY_PIECES,
-  STORE_BY_PIECES
+  STORE_BY_PIECES,
+  COMPARE_BY_PIECES,
+  /* Used internally only, never passed to the target hook.  */
+  PUSH_BY_PIECES
 };
 
 typedef int (* print_switch_fn_type) (print_switch_type, const char *);
Index: gcc/targhooks.c
===================================================================
--- gcc/targhooks.c	(revision 235474)
+++ gcc/targhooks.c	(working copy)
@@ -1482,27 +1482,43 @@ default_use_by_pieces_infrastructure_p (
 
   switch (op)
     {
-      case CLEAR_BY_PIECES:
-	max_size = STORE_MAX_PIECES;
-	ratio = CLEAR_RATIO (speed_p);
-	break;
-      case MOVE_BY_PIECES:
-	max_size = MOVE_MAX_PIECES;
-	ratio = get_move_ratio (speed_p);
-	break;
-      case SET_BY_PIECES:
-	max_size = STORE_MAX_PIECES;
-	ratio = SET_RATIO (speed_p);
-	break;
-      case STORE_BY_PIECES:
-	max_size = STORE_MAX_PIECES;
-	ratio = get_move_ratio (speed_p);
-	break;
+    case CLEAR_BY_PIECES:
+      max_size = STORE_MAX_PIECES;
+      ratio = CLEAR_RATIO (speed_p);
+      break;
+    case MOVE_BY_PIECES:
+      max_size = MOVE_MAX_PIECES;
+      ratio = get_move_ratio (speed_p);
+      break;
+    case SET_BY_PIECES:
+      max_size = STORE_MAX_PIECES;
+      ratio = SET_RATIO (speed_p);
+      break;
+    case STORE_BY_PIECES:
+      max_size = STORE_MAX_PIECES;
+      ratio = get_move_ratio (speed_p);
+      break;
+    case COMPARE_BY_PIECES:
+      max_size = COMPARE_MAX_PIECES;
+      ratio = get_move_ratio (speed_p);
+      break;
+    case PUSH_BY_PIECES:
+      gcc_unreachable ();
     }
 
   return move_by_pieces_ninsns (size, alignment, max_size + 1) < ratio;
 }
 
+/* This hook controls code generation for expanding a memcmp operation by
+   pieces.  Return 1 for the normal pattern of compare/jump after each pair
+   of loads, or a higher number to reduce the number of branches.  */
+
+int
+default_compare_by_pieces_branch_ratio (machine_mode)
+{
+  return 2;
+}
+
 bool
 default_profile_before_prologue (void)
 {
Index: gcc/targhooks.h
===================================================================
--- gcc/targhooks.h	(revision 235474)
+++ gcc/targhooks.h	(working copy)
@@ -199,6 +199,7 @@ extern bool default_use_by_pieces_infras
 						    unsigned int,
 						    enum by_pieces_operation,
 						    bool);
+extern int default_compare_by_pieces_branch_ratio (machine_mode);
 
 extern bool default_profile_before_prologue (void);
 extern reg_class_t default_preferred_reload_class (rtx, reg_class_t);
Index: gcc/testsuite/gcc.dg/pr43052-1.c
===================================================================
--- gcc/testsuite/gcc.dg/pr43052-1.c	(revision 0)
+++ gcc/testsuite/gcc.dg/pr43052-1.c	(working copy)
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+/* { dg-final { scan-assembler-not "memcmp" } } */
+#include <string.h>
+struct A { int x; } a, b;
+
+extern char s[], t[];
+
+int foo ()
+{
+  return memcmp (&a, &b, sizeof (struct A)) == 0;
+}
+
+int bar ()
+{
+  return strcmp (s, t);
+}
+
+int baz ()
+{
+  return strncmp (s, t, 32);
+}
Index: gcc/tree-ssa-forwprop.c
===================================================================
--- gcc/tree-ssa-forwprop.c	(revision 235474)
+++ gcc/tree-ssa-forwprop.c	(working copy)
@@ -1435,6 +1435,76 @@ simplify_builtin_call (gimple_stmt_itera
 	    }
 	}
       break;
+
+    case BUILT_IN_MEMCMP:
+      /* If the result of memcmp is only compared against zero, replace
+	 the call with __builtin_memcmp_eq, or inline small aligned
+	 power-of-two sizes as a direct wide comparison.  */
+      {
+	tree res = gimple_call_lhs (stmt2);
+	tree arg1 = gimple_call_arg (stmt2, 0);
+	tree arg2 = gimple_call_arg (stmt2, 1);
+	tree len = gimple_call_arg (stmt2, 2);
+	unsigned HOST_WIDE_INT leni;
+	unsigned align1, align2;
+	use_operand_p use_p;
+	imm_use_iterator iter;
+
+	if (!res)
+	  return false;
+
+	FOR_EACH_IMM_USE_FAST (use_p, iter, res)
+	  {
+	    gimple *ustmt = USE_STMT (use_p);
+
+	    if (gimple_code (ustmt) == GIMPLE_ASSIGN)
+	      {
+		gassign *asgn = as_a <gassign *> (ustmt);
+		tree_code code = gimple_assign_rhs_code (asgn);
+		if ((code != EQ_EXPR && code != NE_EXPR)
+		    || !integer_zerop (gimple_assign_rhs2 (asgn)))
+		  return false;
+	      }
+	    else if (gimple_code (ustmt) == GIMPLE_COND)
+	      {
+		tree_code code = gimple_cond_code (ustmt);
+		if ((code != EQ_EXPR && code != NE_EXPR)
+		    || !integer_zerop (gimple_cond_rhs (ustmt)))
+		  return false;
+	      }
+	    else
+	      return false;
+	  }
+
+	if (tree_fits_uhwi_p (len)
+	    && (leni = tree_to_uhwi (len)) <= GET_MODE_SIZE (word_mode)
+	    && exact_log2 (leni) != -1
+	    && (align1 = get_pointer_alignment (arg1)) >= leni * BITS_PER_UNIT
+	    && (align2 = get_pointer_alignment (arg2)) >= leni * BITS_PER_UNIT)
+	  {
+	    location_t loc = gimple_location (stmt2);
+	    tree type, off;
+	    type = build_nonstandard_integer_type (leni * BITS_PER_UNIT, 1);
+	    gcc_assert (GET_MODE_SIZE (TYPE_MODE (type)) == leni);
+	    off = build_int_cst
+	     (build_pointer_type_for_mode (char_type_node, ptr_mode, true), 0);
+	    arg1 = build2_loc (loc, MEM_REF, type, arg1, off);
+	    arg2 = build2_loc (loc, MEM_REF, type, arg2, off);
+	    res = fold_convert_loc (loc, TREE_TYPE (res),
+				    fold_build2_loc (loc, NE_EXPR,
+						     boolean_type_node,
+						     arg1, arg2));
+	    gimplify_and_update_call_from_tree (gsi_p, res);
+	    return true;
+	  }
+
+	tree newfn = builtin_decl_explicit (BUILT_IN_MEMCMP_EQ);
+	gcall *repl = gimple_build_call (newfn, 3, arg1, arg2, len);
+	gimple_call_set_lhs (repl, res);
+	gimple_set_location (repl, gimple_location (stmt2));
+	gimple_set_vuse (repl, gimple_vuse (stmt2));
+	gcc_assert (!gimple_vdef (stmt2));
+	gsi_replace (gsi_p, repl, false);
+	return true;
+      }
+
     default:
       break;
     }
Index: gcc/tree.c
===================================================================
--- gcc/tree.c	(revision 235474)
+++ gcc/tree.c	(working copy)
@@ -10576,6 +10576,13 @@ build_common_builtin_nodes (void)
 			BUILT_IN_STACK_RESTORE,
 			"__builtin_stack_restore", ECF_NOTHROW | ECF_LEAF);
 
+  ftype = build_function_type_list (integer_type_node, const_ptr_type_node,
+				    const_ptr_type_node, size_type_node,
+				    NULL_TREE);
+  local_define_builtin ("__builtin_memcmp_eq", ftype, BUILT_IN_MEMCMP_EQ,
+			"__builtin_memcmp_eq",
+			ECF_PURE | ECF_NOTHROW | ECF_LEAF);
+
   /* If there's a possibility that we might use the ARM EABI, build the
     alternate __cxa_end_cleanup node used to resume from C++ and Java.  */
   if (targetm.arm_eabi_unwinder)
