Re: Thoughts on memcmp expansion (PR43052)

2016-05-31 Thread Bernd Schmidt

On 05/13/2016 10:40 PM, Joseph Myers wrote:

On Fri, 13 May 2016, Bernd Schmidt wrote:


Thanks. So, this would seem to suggest that BITS_PER_UNIT in memcmp/memcpy
expansion is more accurate than a plain 8, although pedantically we might want
to use CHAR_TYPE_SIZE? Should I adjust my patch to use the latter or leave
these parts as-is?


I'd say use CHAR_TYPE_SIZE.


Here's the current patch, retested as before. Since I had to make some 
adjustments since the approval to cope with changes such as the removal 
of memcmp_libfunc, I'll leave it for a few days for comments before 
checking it in.



Bernd

Index: gcc/builtins.c
===
--- gcc/builtins.c	(revision 236909)
+++ gcc/builtins.c	(working copy)
@@ -3671,53 +3671,24 @@ expand_cmpstr (insn_code icode, rtx targ
   return NULL_RTX;
 }
 
-/* Try to expand cmpstrn or cmpmem operation ICODE with the given operands.
-   ARG3_TYPE is the type of ARG3_RTX.  Return the result rtx on success,
-   otherwise return null.  */
-
-static rtx
-expand_cmpstrn_or_cmpmem (insn_code icode, rtx target, rtx arg1_rtx,
-			  rtx arg2_rtx, tree arg3_type, rtx arg3_rtx,
-			  HOST_WIDE_INT align)
-{
-  machine_mode insn_mode = insn_data[icode].operand[0].mode;
-
-  if (target && (!REG_P (target) || HARD_REGISTER_P (target)))
-target = NULL_RTX;
-
-  struct expand_operand ops[5];
-  create_output_operand (&ops[0], target, insn_mode);
-  create_fixed_operand (&ops[1], arg1_rtx);
-  create_fixed_operand (&ops[2], arg2_rtx);
-  create_convert_operand_from (&ops[3], arg3_rtx, TYPE_MODE (arg3_type),
-			       TYPE_UNSIGNED (arg3_type));
-  create_integer_operand (&ops[4], align);
-  if (maybe_expand_insn (icode, 5, ops))
-    return ops[0].value;
-  return NULL_RTX;
-}
-
 /* Expand expression EXP, which is a call to the memcmp built-in function.
Return NULL_RTX if we failed and the caller should emit a normal call,
-   otherwise try to get the result in TARGET, if convenient.  */
+   otherwise try to get the result in TARGET, if convenient.
+   RESULT_EQ is true if we can relax the returned value to be either zero
+   or nonzero, without caring about the sign.  */
 
 static rtx
-expand_builtin_memcmp (tree exp, rtx target)
+expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
 {
   if (!validate_arglist (exp,
  			 POINTER_TYPE, POINTER_TYPE, INTEGER_TYPE, VOID_TYPE))
 return NULL_RTX;
 
-  /* Note: The cmpstrnsi pattern, if it exists, is not suitable for
- implementing memcmp because it will stop if it encounters two
- zero bytes.  */
-  insn_code icode = direct_optab_handler (cmpmem_optab, SImode);
-  if (icode == CODE_FOR_nothing)
-return NULL_RTX;
-
   tree arg1 = CALL_EXPR_ARG (exp, 0);
   tree arg2 = CALL_EXPR_ARG (exp, 1);
   tree len = CALL_EXPR_ARG (exp, 2);
+  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
+  location_t loc = EXPR_LOCATION (exp);
 
   unsigned int arg1_align = get_pointer_alignment (arg1) / BITS_PER_UNIT;
   unsigned int arg2_align = get_pointer_alignment (arg2) / BITS_PER_UNIT;
@@ -3726,22 +3697,38 @@ expand_builtin_memcmp (tree exp, rtx tar
   if (arg1_align == 0 || arg2_align == 0)
 return NULL_RTX;
 
-  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
-  location_t loc = EXPR_LOCATION (exp);
   rtx arg1_rtx = get_memory_rtx (arg1, len);
   rtx arg2_rtx = get_memory_rtx (arg2, len);
-  rtx arg3_rtx = expand_normal (fold_convert_loc (loc, sizetype, len));
+  rtx len_rtx = expand_normal (fold_convert_loc (loc, sizetype, len));
 
   /* Set MEM_SIZE as appropriate.  */
-  if (CONST_INT_P (arg3_rtx))
+  if (CONST_INT_P (len_rtx))
 {
-  set_mem_size (arg1_rtx, INTVAL (arg3_rtx));
-  set_mem_size (arg2_rtx, INTVAL (arg3_rtx));
+  set_mem_size (arg1_rtx, INTVAL (len_rtx));
+  set_mem_size (arg2_rtx, INTVAL (len_rtx));
 }
 
-  rtx result = expand_cmpstrn_or_cmpmem (icode, target, arg1_rtx, arg2_rtx,
-	 TREE_TYPE (len), arg3_rtx,
-	 MIN (arg1_align, arg2_align));
+  by_pieces_constfn constfn = NULL;
+
+  const char *src_str = c_getstr (arg1);
+  if (src_str == NULL)
+src_str = c_getstr (arg2);
+  else
+std::swap (arg1_rtx, arg2_rtx);
+
+  /* If SRC is a string constant and block move would be done
+ by pieces, we can avoid loading the string from memory
+ and only stored the computed constants.  */
+  if (src_str
+  && CONST_INT_P (len_rtx)
+  && (unsigned HOST_WIDE_INT) INTVAL (len_rtx) <= strlen (src_str) + 1)
+constfn = builtin_memcpy_read_str;
+
+  rtx result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
+ TREE_TYPE (len), target,
+ result_eq, constfn,
+ CONST_CAST (char *, src_str));
+
   if (result)
 {
   /* Return the value in the proper mode for this function.  */
@@ -6073,9 +6060,15 @@ expand_builtin (tree exp, rtx target, rt
 
 case BUILT_IN_BCMP:
 case BUILT_IN_MEMCMP:
-  target = expand_builtin_memcmp (exp, target);
+case BUILT_IN_MEMCMP_EQ:
+  target = 

Re: Thoughts on memcmp expansion (PR43052)

2016-05-31 Thread Bernd Schmidt

On 05/30/2016 11:37 AM, Florian Weimer wrote:


 * Expand __memcmp_eq for small constant sizes with loads and
   comparison, fall back to a memcmp call.


Should we export such a function from glibc?  I expect it's fairly
common.  Computing the tail difference costs a few cycles.


At the moment the patch ensures that no actual memcmp_eq function is 
called; it's either expanded inline or we fall back to memcmp. But 
there's no reason why it couldn't be added under some appropriate name.



Bernd


Re: Thoughts on memcmp expansion (PR43052)

2016-05-30 Thread Florian Weimer

On 01/15/2016 05:58 PM, Bernd Schmidt wrote:


One question Richard posed in the comments: why aren't we optimizing
small constant size memcmps other than size 1 to *s == *q? The reason is
the return value of memcmp, which implies byte-sized operation
(incidentally, the use of SImode in the cmpmem/cmpstr patterns is really
odd). It's possible to work around this, but expansion becomes a little
more tricky (subtract after bswap, maybe).


When I did this (big-endian conversion, wide subtract, sign) to the 
tail difference check in glibc's x86_64 memcmp, it was actually a bit 
faster than isolating the differing byte and returning its difference, 
even for non-random data such as encountered during qsort:



 * Expand __memcmp_eq for small constant sizes with loads and
   comparison, fall back to a memcmp call.


Should we export such a function from glibc?  I expect it's fairly 
common.  Computing the tail difference costs a few cycles.


It may also make sense to call a streamlined implementation if you have 
interesting alignment information (for x86_64, that would be at least 16 
on one or both inputs, so it's perhaps not easy to come by).


Florian



Re: Thoughts on memcmp expansion (PR43052)

2016-05-13 Thread Joseph Myers
On Fri, 13 May 2016, Bernd Schmidt wrote:

> Thanks. So, this would seem to suggest that BITS_PER_UNIT in memcmp/memcpy
> expansion is more accurate than a plain 8, although pedantically we might want
> to use CHAR_TYPE_SIZE? Should I adjust my patch to use the latter or leave
> these parts as-is?

I'd say use CHAR_TYPE_SIZE.

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: Thoughts on memcmp expansion (PR43052)

2016-05-13 Thread Joseph Myers
On Fri, 13 May 2016, Bernd Schmidt wrote:

> On 05/13/2016 03:07 PM, Richard Biener wrote:
> > On Fri, May 13, 2016 at 3:05 PM, Bernd Schmidt  wrote:
> > > Huh? Can you elaborate?
> > 
> > When you have a builtin taking a size in bytes then a byte is 8 bits,
> > not BITS_PER_UNIT bits.
> 
> That makes no sense to me. I think the definition of a byte depends on the
> machine (hence the term "octet" was coined to be unambiguous). Also, such a
> definition would seem to imply that machines with 10-bit bytes cannot
> implement memcpy or memcmp.
> 
> Joseph, can you clarify the standard's meaning here?

* In C: a byte is the minimal addressable unit; an N-byte object is made 
up of N "unsigned char" objects, with successive addresses in terms of 
incrementing an "unsigned char *" pointer.  A byte is at least 8 bits.

* In GCC, at the level of GNU C APIs on the target, which generally 
includes built-in functions: a byte (on the target) is made of 
CHAR_TYPE_SIZE bits.  In theory this could be more than BITS_PER_UNIT, or 
that could be more than 8, though support for either of those cases would 
be very bit-rotten (and I'm not sure there ever have been targets with 
CHAR_TYPE_SIZE > BITS_PER_UNIT).  Sizes passed to memcpy and memcmp are 
sizes in units of CHAR_TYPE_SIZE bits.

* In GCC, at the RTL level: a byte (on the target) is a QImode object, 
which is made of BITS_PER_UNIT bits.  (HImode is always two bytes, SImode 
four, etc., if those modes exist.)  Support for BITS_PER_UNIT being more 
than 8 is very bit-rotten.

* In GCC, on the host: GCC only supports hosts (and $build) where bytes 
are 8-bit (though writing it as CHAR_BIT makes it clear that this 8 means 
the number of bits in a host byte).

Internal interfaces e.g. representing the contents of strings or other 
memory on the target may not currently be well-defined except when 
BITS_PER_UNIT is 8.  Cf. e.g. 
.  But the above should 
at least give guidance as to whether BITS_PER_UNIT, CHAR_TYPE_SIZE (or 
TYPE_PRECISION (char_type_node), preferred where possible to minimize 
usage of target macros) or CHAR_BIT is logically right in a particular 
place.

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: Thoughts on memcmp expansion (PR43052)

2016-05-13 Thread Bernd Schmidt

On 05/13/2016 03:53 PM, Joseph Myers wrote:

* In C: a byte is the minimal addressable unit; an N-byte object is made
up of N "unsigned char" objects, with successive addresses in terms of
incrementing an "unsigned char *" pointer.  A byte is at least 8 bits.

* In GCC, at the level of GNU C APIs on the target, which generally
includes built-in functions: a byte (on the target) is made of
CHAR_TYPE_SIZE bits.  In theory this could be more than BITS_PER_UNIT, or
that could be more than 8, though support for either of those cases would
be very bit-rotten (and I'm not sure there ever have been targets with
CHAR_TYPE_SIZE > BITS_PER_UNIT).  Sizes passed to memcpy and memcmp are
sizes in units of CHAR_TYPE_SIZE bits.

* In GCC, at the RTL level: a byte (on the target) is a QImode object,
which is made of BITS_PER_UNIT bits.  (HImode is always two bytes, SImode
four, etc., if those modes exist.)  Support for BITS_PER_UNIT being more
than 8 is very bit-rotten.

* In GCC, on the host: GCC only supports hosts (and $build) where bytes
are 8-bit (though writing it as CHAR_BIT makes it clear that this 8 means
the number of bits in a host byte).

Internal interfaces e.g. representing the contents of strings or other
memory on the target may not currently be well-defined except when
BITS_PER_UNIT is 8.  Cf. e.g.
.  But the above should
at least give guidance as to whether BITS_PER_UNIT, CHAR_TYPE_SIZE (or
TYPE_PRECISION (char_type_node), preferred where possible to minimize
usage of target macros) or CHAR_BIT is logically right in a particular
place.


Thanks. So, this would seem to suggest that BITS_PER_UNIT in 
memcmp/memcpy expansion is more accurate than a plain 8, although 
pedantically we might want to use CHAR_TYPE_SIZE? Should I adjust my 
patch to use the latter or leave these parts as-is?



Bernd



Re: Thoughts on memcmp expansion (PR43052)

2016-05-13 Thread Bernd Schmidt

On 05/13/2016 03:07 PM, Richard Biener wrote:

On Fri, May 13, 2016 at 3:05 PM, Bernd Schmidt  wrote:

Huh? Can you elaborate?


When you have a builtin taking a size in bytes then a byte is 8 bits,
not BITS_PER_UNIT bits.


That makes no sense to me. I think the definition of a byte depends on 
the machine (hence the term "octet" was coined to be unambiguous). Also, 
such a definition would seem to imply that machines with 10-bit bytes 
cannot implement memcpy or memcmp.


Joseph, can you clarify the standard's meaning here?


Bernd


Re: Thoughts on memcmp expansion (PR43052)

2016-05-13 Thread Richard Biener
On Fri, May 13, 2016 at 3:05 PM, Bernd Schmidt  wrote:
> On 05/13/2016 12:20 PM, Richard Biener wrote:
>>
>> I'm not much of a fan of C++-ification (in this case it makes review
>> harder) but well ...
>
>
> I felt it was a pretty natural way to structure the code to avoid
> duplicating the same logic across more functions, and we might as well use
> the language for such purposes given that we've bothered to switch.
>
>> +  if (tree_fits_uhwi_p (len)
>> +  && (leni = tree_to_uhwi (len)) <= GET_MODE_SIZE (word_mode)
>> +  && exact_log2 (leni) != -1
>> +  && (align1 = get_pointer_alignment (arg1)) >= leni * BITS_PER_UNIT
>> +  && (align2 = get_pointer_alignment (arg2)) >= leni * BITS_PER_UNIT)
>>
>> I think * BITS_PER_UNIT has to be * 8 here as the C standard defines
>> it that way.
>
>
> Huh? Can you elaborate?

When you have a builtin taking a size in bytes then a byte is 8 bits,
not BITS_PER_UNIT bits.

Richard.

> [...]
>>
>> Ok with those changes.
>
>
> Thanks. I won't be reading email for the next two weeks, so I'll be checking
> it in afterwards.
>
>
> Bernd


Re: Thoughts on memcmp expansion (PR43052)

2016-05-13 Thread Bernd Schmidt

On 05/13/2016 12:20 PM, Richard Biener wrote:

I'm not much of a fan of C++-ification (in this case it makes review
harder) but well ...


I felt it was a pretty natural way to structure the code to avoid 
duplicating the same logic across more functions, and we might as well 
use the language for such purposes given that we've bothered to switch.



+  if (tree_fits_uhwi_p (len)
+  && (leni = tree_to_uhwi (len)) <= GET_MODE_SIZE (word_mode)
+  && exact_log2 (leni) != -1
+  && (align1 = get_pointer_alignment (arg1)) >= leni * BITS_PER_UNIT
+  && (align2 = get_pointer_alignment (arg2)) >= leni * BITS_PER_UNIT)

I think * BITS_PER_UNIT has to be * 8 here as the C standard defines
it that way.


Huh? Can you elaborate?

[...]

Ok with those changes.


Thanks. I won't be reading email for the next two weeks, so I'll be 
checking it in afterwards.



Bernd


Re: Thoughts on memcmp expansion (PR43052)

2016-05-13 Thread Richard Biener
On Thu, May 12, 2016 at 7:14 PM, Bernd Schmidt  wrote:
> On 05/02/2016 03:14 PM, Richard Biener wrote:
>
>> I think it fits best in tree-ssa-strlen.c:strlen_optimize_stmt for the
>> moment.
>
>
> I've done this in this version. Note that this apparently means it won't be
> run at -O unlike the previous version.
>
> The code moved to tree-ssa-strlen.c is nearly identical to the previous
> version, with the exception of a new line that copies the contents of
> gimple_call_use_set to the newly constructed call. Without this, I found a
> testcase where we can miss a DSE opportunity since we think memcmp_eq can
> access anything.
>
> In the by_pieces_code, I've gone ahead and progressed the C++ conversion a
> little further by making subclasses of op_by_pieces_d. Now the
> {move,store,compare}_by_pieces functions generally just make an object of
> the appropriate type and call its run method. I've also converted
> store_by_pieces to this infrastructure to eliminate more duplication.
>
> I've verified that if you disable the strlenopt code, you get identical code
> generation before and after the patch on x86_64-linux and arm-eabi. With the
> strlenopt enabled, it finds memcmp optimization opportunities on both
> targets (more on x86 due to !SLOW_UNALIGNED_ACCESS). On arm, there is a
> question mark over whether the autoinc support in the by_pieces code is
> really a good idea, but that's unchanged from before and can be addressed
> later.
> Microbenchmarks on x86 suggest that the new by-pieces expansion is a whole
> lot faster than calling memcmp (about a factor of 3 in my tests).
>
> Bootstrapped and tested on x86_64-linux. The testcase failed at -O (now
> changed to -O2), and there was a failure in vrp47.c which also seems to
> appear in gcc-testresults, so I think it's unrelated. Still, I'll make
> another run against a new baseline. Ok for trunk if that all comes back
> clean?

@@ -6081,9 +6056,16 @@ expand_builtin (tree exp, rtx target, rt

 case BUILT_IN_BCMP:
 case BUILT_IN_MEMCMP:
-  target = expand_builtin_memcmp (exp, target);
+case BUILT_IN_MEMCMP_EQ:
+  target = expand_builtin_memcmp (exp, target, fcode ==
BUILT_IN_MEMCMP_EQ);
   if (target)
return target;
+  if (fcode == BUILT_IN_MEMCMP_EQ)
+   {
+ exp = copy_node (exp);
+ tree newdecl = builtin_decl_explicit (BUILT_IN_MEMCMP);
+ TREE_OPERAND (exp, 1) = build_fold_addr_expr (newdecl);
+   }

you can modify 'exp' in-place (yeah, we still build a GENERIC call from GIMPLE
just because nobody "fixed" downstream routines to accept a GIMPLE call).

I'm not much of a fan of C++-ification (in this case it makes review
harder) but well ...

+  if (tree_fits_uhwi_p (len)
+  && (leni = tree_to_uhwi (len)) <= GET_MODE_SIZE (word_mode)
+  && exact_log2 (leni) != -1
+  && (align1 = get_pointer_alignment (arg1)) >= leni * BITS_PER_UNIT
+  && (align2 = get_pointer_alignment (arg2)) >= leni * BITS_PER_UNIT)

I think * BITS_PER_UNIT has to be * 8 here as the C standard defines
it that way.

I think to increase coverage you want || ! SLOW_UNALIGNED_ACCESS
(TYPE_MODE (), align1)
here and build properly mis-aligned access types for the MEM_REF like

  if (src_align < GET_MODE_ALIGNMENT (TYPE_MODE (type)))
srctype = build_aligned_type (type, src_align);

(copied from gimple_fold_builtin_memory_op).

+  type = build_nonstandard_integer_type (leni * BITS_PER_UNIT, 1);
+  gcc_assert (GET_MODE_SIZE (TYPE_MODE (type)) == leni);

Likewise leni * 8 and the assert would have to assert
GET_MODE_SIZE () * BITS_PER_UNIT == leni * 8 (or using GET_MODE_BITSIZE).

+  arg1 = build2_loc (loc, MEM_REF, type, arg1, off);
+  arg2 = build2_loc (loc, MEM_REF, type, arg2, off);

you want to add

tree tem = fold_const_aggregate_ref (arg1);
if (tem)
  arg1 = tem;

and same for arg2.  That avoids leaving unfolded memory refs in the IL
after the transform.

+  tree newfn = builtin_decl_explicit (BUILT_IN_MEMCMP_EQ);
+  gcall *repl = gimple_build_call (newfn, 3, arg1, arg2, len);
+  gimple_call_set_lhs (repl, res);
+  gimple_set_location (repl, gimple_location (stmt2));
+  gimple_set_vuse (repl, gimple_vuse (stmt2));
+  *gimple_call_use_set (repl) = *gimple_call_use_set (stmt2);
+  gcc_assert (!gimple_vdef (stmt2));
+  gsi_replace (gsi, repl, false);
+  return false;

I think you can avoid all this by just doing

   gimple_call_set_fndecl (stmt2, builtin_decl_explicit (BUILT_IN_MEMCMP_EQ));

as MEMCMP and MEMCMP_EQ have equal signatures.

Ok with those changes.

Thanks,
Richard.


>
> Bernd


Re: Thoughts on memcmp expansion (PR43052)

2016-05-12 Thread Bernd Schmidt

On 05/02/2016 03:14 PM, Richard Biener wrote:


I think it fits best in tree-ssa-strlen.c:strlen_optimize_stmt for the moment.


I've done this in this version. Note that this apparently means it won't 
be run at -O unlike the previous version.


The code moved to tree-ssa-strlen.c is nearly identical to the previous 
version, with the exception of a new line that copies the contents of 
gimple_call_use_set to the newly constructed call. Without this, I found 
a testcase where we can miss a DSE opportunity since we think memcmp_eq 
can access anything.


In the by_pieces_code, I've gone ahead and progressed the C++ conversion 
a little further by making subclasses of op_by_pieces_d. Now the 
{move,store,compare}_by_pieces functions generally just make an object 
of the appropriate type and call its run method. I've also converted 
store_by_pieces to this infrastructure to eliminate more duplication.
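
For anyone who hasn't read the patch itself, the shape of that 
refactoring is the usual callbacks-to-virtuals move. A minimal 
standalone sketch of the pattern (generic types and sizes here, not 
the real rtx/machine_mode interface from the patch) is:

  #include <cstddef>
  #include <cstdio>

  // Shared driver walks the block in power-of-two pieces; subclasses
  // decide what to emit for each piece.
  class op_by_pieces_d
  {
  protected:
    std::size_t m_len;
    virtual void generate (std::size_t offset, std::size_t piece) = 0;
  public:
    explicit op_by_pieces_d (std::size_t len) : m_len (len) {}
    virtual ~op_by_pieces_d () {}
    void run ()
    {
      std::size_t offset = 0;
      for (std::size_t piece = 8; piece > 0; piece /= 2)
        while (m_len - offset >= piece)
          {
            generate (offset, piece);
            offset += piece;
          }
    }
  };

  class compare_by_pieces_d : public op_by_pieces_d
  {
  public:
    using op_by_pieces_d::op_by_pieces_d;
    void generate (std::size_t offset, std::size_t piece) override
    {
      // The real subclass emits a load/compare/branch sequence here.
      std::printf ("compare %zu bytes at offset %zu\n", piece, offset);
    }
  };

  int main ()
  {
    compare_by_pieces_d (14).run ();  // compares pieces of 8, 4 and 2 bytes
    return 0;
  }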


I've verified that if you disable the strlenopt code, you get identical 
code generation before and after the patch on x86_64-linux and arm-eabi. 
With the strlenopt enabled, it finds memcmp optimization opportunities 
on both targets (more on x86 due to !SLOW_UNALIGNED_ACCESS). On arm, 
there is a question mark over whether the autoinc support in the 
by_pieces code is really a good idea, but that's unchanged from before 
and can be addressed later.
Microbenchmarks on x86 suggest that the new by-pieces expansion is a 
whole lot faster than calling memcmp (about a factor of 3 in my tests).


Bootstrapped and tested on x86_64-linux. The testcase failed at -O (now 
changed to -O2), and there was a failure in vrp47.c which also seems to 
appear in gcc-testresults, so I think it's unrelated. Still, I'll make 
another run against a new baseline. Ok for trunk if that all comes back 
clean?



Bernd
	PR tree-optimization/52171
	* builtins.c (expand_cmpstrn_or_cmpmem): Delete, moved elsewhere.
	(expand_builtin_memcmp): New arg RESULT_EQ.  All callers changed.
	Look for constant strings.  Move some code to emit_block_cmp_hints
	and use it.
	* builtins.def (BUILT_IN_MEMCMP_EQ): New.
	* defaults.h (COMPARE_MAX_PIECES): New macro.
	* expr.c (move_by_pieces_d, store_by_pieces_d): Remove old structs.
	(move_by_pieces_1, store_by_pieces_1, store_by_pieces_2): Remove.
	(clear_by_pieces_1): Don't declare.  Move definition before use.
	(can_do_by_pieces): New static function.
	(can_move_by_pieces): Use it.  Return bool.
	(by_pieces_ninsns): Renamed from move_by_pieces_ninsns.  New arg
	OP.  All callers changed.  Handle COMPARE_BY_PIECES.
	(class pieces_addr): New.
	(pieces_addr::pieces_addr, pieces_addr::decide_autoinc,
	pieces_addr::adjust, pieces_addr::increment_address,
	pieces_addr::maybe_predec, pieces_addr::maybe_postinc): New member
	functions for it.
	(class op_by_pieces_d): New.
	(op_by_pieces_d::op_by_pieces_d, op_by_pieces_d::run): New member
	functions for it.
	(class move_by_pieces_d, class compare_by_pieces_d,
	class store_by_pieces_d): New subclasses of op_by_pieces_d.
	(move_by_pieces_d::prepare_mode, move_by_pieces_d::generate,
	move_by_pieces_d::finish_endp, store_by_pieces_d::prepare_mode,
	store_by_pieces_d::generate, store_by_pieces_d::finish_endp,
	compare_by_pieces_d::generate, compare_by_pieces_d::prepare_mode,
	compare_by_pieces_d::finish_mode): New member functions.
	(compare_by_pieces, emit_block_cmp_via_cmpmem): New static
	functions.
	(expand_cmpstrn_or_cmpmem): Moved here from builtins.c.
	(emit_block_cmp_hints): New function.
	(move_by_pieces, store_by_pieces, clear_by_pieces): Rewrite to just
	use the newly defined classes.
	* expr.h (by_pieces_constfn): New typedef.
	(can_store_by_pieces, store_by_pieces): Use it in arg declarations.
	(emit_block_cmp_hints, expand_cmpstrn_or_cmpmem): Declare.
	(move_by_pieces_ninsns): Don't declare.
	(can_move_by_pieces): Change return value to bool.
	* target.def (TARGET_USE_BY_PIECES_INFRASTRUCTURE_P): Update docs.
	(compare_by_pieces_branch_ratio): New hook.
	* target.h (enum by_pieces_operation): Add COMPARE_BY_PIECES.
	(by_pieces_ninsns): Declare.
	* targhooks.c (default_use_by_pieces_infrastructure_p): Handle
	COMPARE_BY_PIECES.
	(default_compare_by_pieces_branch_ratio): New function.
	* targhooks.h (default_compare_by_pieces_branch_ratio): Declare.
	* doc/tm.texi.in (STORE_MAX_PIECES, COMPARE_MAX_PIECES): Document.
	* doc/tm.texi: Regenerate.
	* tree-ssa-strlen.c: Include "builtins.h".
	(handle_builtin_memcmp): New static function.
	(strlen_optimize_stmt): Call it for BUILT_IN_MEMCMP.
	* tree.c (build_common_builtin_nodes): Create __builtin_memcmp_eq.

testsuite/
	PR tree-optimization/52171
	* gcc.dg/pr52171.c: New test.
	* gcc.target/i386/pr52171.c: New test.

Index: gcc/builtins.c
===
--- gcc/builtins.c	(revision 236113)
+++ gcc/builtins.c	(working copy)
@@ -3671,53 +3671,24 @@ expand_cmpstr (insn_code icode, rtx targ
   return NULL_RTX;
 }
 
-/* Try to expand cmpstrn or cmpmem 

Re: Thoughts on memcmp expansion (PR43052)

2016-05-02 Thread Richard Biener
On Mon, May 2, 2016 at 2:57 PM, Bernd Schmidt  wrote:
> On 05/02/2016 02:52 PM, Richard Biener wrote:
>>
>> +struct pieces_addr
>> +{
>> ...
>> +  void *m_cfndata;
>> +public:
>> +  pieces_addr (rtx, bool, by_pieces_constfn, void *);
>>
>> unless you stick private: somewhere the public: is redundant
>
>
> Yeah, ideally I want to turn these into classes rather than structs. Maybe
> that particular one can already be done, but I'm kind of wondering how far
> to take the C++ification of the other one.
>
>> Index: gcc/tree-ssa-forwprop.c
>> ===
>> --- gcc/tree-ssa-forwprop.c (revision 235474)
>> +++ gcc/tree-ssa-forwprop.c (working copy)
>> @@ -1435,6 +1435,76 @@ simplify_builtin_call (gimple_stmt_itera
>>  }
>>  }
>> break;
>> +
>> +case BUILT_IN_MEMCMP:
>> +  {
>>
>> I think this doesn't belong in forwprop.  If we want to stick it into
>> a pass rather than
>> folding it should be in tree-ssa-strlen.c.
>
>
> This part (and the other one you quoted) was essentially your prototype
> patch from PR52171. I can put it wherever you like, really.

I think it fits best in tree-ssa-strlen.c:strlen_optimize_stmt for the moment.

>> Note that we can handle size-1 memcmp even for ordered compares.
>
>
> One would hope this doesn't occur very often...

C++ templates  but yes.

Richard.

>
>> Jakub, where do you think this fits best?  Note that gimple-fold.c may
>> not use immediate uses but would have to match this from the
>> comparison (I still have to find a way to handle this in match.pd where
>> the result expression contains virtual operands in the not toplevel stmt).
>
>
>
> Bernd


Re: Thoughts on memcmp expansion (PR43052)

2016-05-02 Thread Bernd Schmidt

On 05/02/2016 02:52 PM, Richard Biener wrote:

+struct pieces_addr
+{
...
+  void *m_cfndata;
+public:
+  pieces_addr (rtx, bool, by_pieces_constfn, void *);

unless you stick private: somewhere the public: is redundant


Yeah, ideally I want to turn these into classes rather than structs. 
Maybe that particular one can already be done, but I'm kind of wondering 
how far to take the C++ification of the other one.



Index: gcc/tree-ssa-forwprop.c
===
--- gcc/tree-ssa-forwprop.c (revision 235474)
+++ gcc/tree-ssa-forwprop.c (working copy)
@@ -1435,6 +1435,76 @@ simplify_builtin_call (gimple_stmt_itera
 }
 }
break;
+
+case BUILT_IN_MEMCMP:
+  {

I think this doesn't belong in forwprop.  If we want to stick it into
a pass rather than
folding it should be in tree-ssa-strlen.c.


This part (and the other one you quoted) was essentially your prototype 
patch from PR52171. I can put it wherever you like, really.



Note that we can handle size-1 memcmp even for ordered compares.


One would hope this doesn't occur very often...


Jakub, where do you think this fits best?  Note that gimple-fold.c may
not use immediate uses but would have to match this from the
comparison (I still have to find a way to handle this in match.pd where
the result expression contains virtual operands in the not toplevel stmt).



Bernd


Re: Thoughts on memcmp expansion (PR43052)

2016-05-02 Thread Richard Biener
On Thu, Apr 28, 2016 at 8:36 PM, Bernd Schmidt  wrote:
> On 01/18/2016 10:22 AM, Richard Biener wrote:
>>
>> See also https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52171 - the
>> inline expansion
>> for small sizes and equality compares should be done on GIMPLE.  Today the
>> strlen pass might be an appropriate place to do this given its
>> superior knowledge
>> about string lengths.
>>
>> The idea of turning eq feeding memcmp into a special memcmp_eq is good but
>> you have to avoid doing that too early - otherwise you'd lose on
>>
>>res = memcmp (p, q, sz);
>>if (memcmp (p, q, sz) == 0)
>> ...
>>
>> that is, you have to make sure CSE got the chance to common the two calls.
>> This is why I think this kind of transform needs to happen in specific
>> places
>> (like during strlen opt) rather than in generic folding.
>
>
> Ok, here's an update. I kept pieces of your patch from that PR, but also
> translating memcmps larger than a single operation into memcmp_eq as in my
> previous patch.
>
> Then, I added by_pieces infrastructure for memcmp expansion. To avoid any
> more code duplication in this area, I abstracted the existing code and
> converted it to C++ classes since that seemed to fit pretty well.
>
> There are a few possible ways I could go with this, which is why I'm posting
> it more as a RFD at this point.
>  - should store_by_pieces be eliminated in favour of doing just
>move_by_pieces with constfns?
>  - the C++ification could be extended, making move_by_pieces_d and
>compare_by_pieces_d classes inheriting from a common base. This
>would get rid of the callbacks, replacing them with virtuals,
>and also make some of the current struct members private.
>  - could move all of the by_pieces stuff out into a separate file?
>
> Later, I think we'll also want to extend this to allow vector mode
> operations, but I think that's a separate patch.
>
> So, opinions what I should be doing with this patch? FWIW it bootstraps and
> tests OK on x86_64-linux.

+struct pieces_addr
+{
...
+  void *m_cfndata;
+public:
+  pieces_addr (rtx, bool, by_pieces_constfn, void *);

unless you stick private: somewhere the public: is redundant

Index: gcc/tree-ssa-forwprop.c
===
--- gcc/tree-ssa-forwprop.c (revision 235474)
+++ gcc/tree-ssa-forwprop.c (working copy)
@@ -1435,6 +1435,76 @@ simplify_builtin_call (gimple_stmt_itera
}
}
   break;
+
+case BUILT_IN_MEMCMP:
+  {

I think this doesn't belong in forwprop.  If we want to stick it into
a pass rather than
folding it should be in tree-ssa-strlen.c.

+   if (tree_fits_uhwi_p (len)
+   && (leni = tree_to_uhwi (len)) <= GET_MODE_SIZE (word_mode)
+   && exact_log2 (leni) != -1
+   && (align1 = get_pointer_alignment (arg1)) >= leni * BITS_PER_UNIT
+   && (align2 = get_pointer_alignment (arg2)) >= leni * BITS_PER_UNIT)
+ {
+   location_t loc = gimple_location (stmt2);
+   tree type, off;
+   type = build_nonstandard_integer_type (leni * BITS_PER_UNIT, 1);
+   gcc_assert (GET_MODE_SIZE (TYPE_MODE (type)) == leni);
+   off = build_int_cst
+(build_pointer_type_for_mode (char_type_node, ptr_mode, true), 0);
+   arg1 = build2_loc (loc, MEM_REF, type, arg1, off);
+   arg2 = build2_loc (loc, MEM_REF, type, arg2, off);
+   res = fold_convert_loc (loc, TREE_TYPE (res),
+   fold_build2_loc (loc, NE_EXPR,
+boolean_type_node,
+arg1, arg2));
+   gimplify_and_update_call_from_tree (gsi_p, res);
+   return true;
+ }

for this part see gimple_fold_builtin_memory_op handling of

  /* If we can perform the copy efficiently with first doing all loads
 and then all stores inline it that way.  Currently efficiently
 means that we can load all the memory into a single integer
 register which is what MOVE_MAX gives us.  */

we might want to share a helper yielding the type of the load/store
or NULL if not possible.
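
For concreteness, such a helper could look roughly like the following, 
which is just the condition from the hunk above pulled out (the name 
and the way the alignments are passed in are made up for illustration):

  static tree
  single_load_store_type (tree len, unsigned int align1, unsigned int align2)
  {
    unsigned HOST_WIDE_INT leni;
    if (!tree_fits_uhwi_p (len)
        || (leni = tree_to_uhwi (len)) > GET_MODE_SIZE (word_mode)
        || exact_log2 (leni) == -1
        || align1 < leni * BITS_PER_UNIT
        || align2 < leni * BITS_PER_UNIT)
      return NULL_TREE;
    tree type = build_nonstandard_integer_type (leni * BITS_PER_UNIT, 1);
    if (GET_MODE_SIZE (TYPE_MODE (type)) != leni)
      return NULL_TREE;
    return type;
  }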

Note that we can handle size-1 memcmp even for ordered compares.
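
(In plain C the size-1 ordered result is just the difference of the two 
bytes read as unsigned char, i.e. something like

  static int
  memcmp1 (const void *p, const void *q)
  {
    return (int) *(const unsigned char *) p - (int) *(const unsigned char *) q;
  }

so that case needs no byte swap or sign fixup.)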

Jakub, where do you think this fits best?  Note that gimple-fold.c may
not use immediate uses but would have to match this from the
comparison (I still have to find a way to handle this in match.pd where
the result expression contains virtual operands in the not toplevel stmt).

Richard.

>
> Bernd


Re: Thoughts on memcmp expansion (PR43052)

2016-04-28 Thread Bernd Schmidt

On 01/18/2016 10:22 AM, Richard Biener wrote:

See also https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52171 - the
inline expansion
for small sizes and equality compares should be done on GIMPLE.  Today the
strlen pass might be an appropriate place to do this given its
superior knowledge
about string lengths.

The idea of turning eq feeding memcmp into a special memcmp_eq is good but
you have to avoid doing that too early - otherwise you'd lose on

   res = memcmp (p, q, sz);
   if (memcmp (p, q, sz) == 0)
...

that is, you have to make sure CSE got the chance to common the two calls.
This is why I think this kind of transform needs to happen in specific places
(like during strlen opt) rather than in generic folding.


Ok, here's an update. I kept pieces of your patch from that PR, but also 
translating memcmps larger than a single operation into memcmp_eq as in 
my previous patch.


Then, I added by_pieces infrastructure for memcmp expansion. To avoid 
any more code duplication in this area, I abstracted the existing code 
and converted it to C++ classes since that seemed to fit pretty well.


There are a few possible ways I could go with this, which is why I'm 
posting it more as a RFD at this point.

 - should store_by_pieces be eliminated in favour of doing just
   move_by_pieces with constfns?
 - the C++ification could be extended, making move_by_pieces_d and
   compare_by_pieces_d classes inheriting from a common base. This
   would get rid of the callbacks, replacing them with virtuals,
   and also make some of the current struct members private.
 - could move all of the by_pieces stuff out into a separate file?

Later, I think we'll also want to extend this to allow vector mode 
operations, but I think that's a separate patch.


So, opinions what I should be doing with this patch? FWIW it bootstraps 
and tests OK on x86_64-linux.



Bernd
Index: gcc/builtins.c
===
--- gcc/builtins.c	(revision 235474)
+++ gcc/builtins.c	(working copy)
@@ -3671,53 +3671,24 @@ expand_cmpstr (insn_code icode, rtx targ
   return NULL_RTX;
 }
 
-/* Try to expand cmpstrn or cmpmem operation ICODE with the given operands.
-   ARG3_TYPE is the type of ARG3_RTX.  Return the result rtx on success,
-   otherwise return null.  */
-
-static rtx
-expand_cmpstrn_or_cmpmem (insn_code icode, rtx target, rtx arg1_rtx,
-			  rtx arg2_rtx, tree arg3_type, rtx arg3_rtx,
-			  HOST_WIDE_INT align)
-{
-  machine_mode insn_mode = insn_data[icode].operand[0].mode;
-
-  if (target && (!REG_P (target) || HARD_REGISTER_P (target)))
-target = NULL_RTX;
-
-  struct expand_operand ops[5];
-  create_output_operand (&ops[0], target, insn_mode);
-  create_fixed_operand (&ops[1], arg1_rtx);
-  create_fixed_operand (&ops[2], arg2_rtx);
-  create_convert_operand_from (&ops[3], arg3_rtx, TYPE_MODE (arg3_type),
-			       TYPE_UNSIGNED (arg3_type));
-  create_integer_operand (&ops[4], align);
-  if (maybe_expand_insn (icode, 5, ops))
-    return ops[0].value;
-  return NULL_RTX;
-}
-
 /* Expand expression EXP, which is a call to the memcmp built-in function.
Return NULL_RTX if we failed and the caller should emit a normal call,
-   otherwise try to get the result in TARGET, if convenient.  */
+   otherwise try to get the result in TARGET, if convenient.
+   RESULT_EQ is true if we can relax the returned value to be either zero
+   or nonzero, without caring about the sign.  */
 
 static rtx
-expand_builtin_memcmp (tree exp, rtx target)
+expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
 {
   if (!validate_arglist (exp,
  			 POINTER_TYPE, POINTER_TYPE, INTEGER_TYPE, VOID_TYPE))
 return NULL_RTX;
 
-  /* Note: The cmpstrnsi pattern, if it exists, is not suitable for
- implementing memcmp because it will stop if it encounters two
- zero bytes.  */
-  insn_code icode = direct_optab_handler (cmpmem_optab, SImode);
-  if (icode == CODE_FOR_nothing)
-return NULL_RTX;
-
   tree arg1 = CALL_EXPR_ARG (exp, 0);
   tree arg2 = CALL_EXPR_ARG (exp, 1);
   tree len = CALL_EXPR_ARG (exp, 2);
+  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
+  location_t loc = EXPR_LOCATION (exp);
 
   unsigned int arg1_align = get_pointer_alignment (arg1) / BITS_PER_UNIT;
   unsigned int arg2_align = get_pointer_alignment (arg2) / BITS_PER_UNIT;
@@ -3726,22 +3697,39 @@ expand_builtin_memcmp (tree exp, rtx tar
   if (arg1_align == 0 || arg2_align == 0)
 return NULL_RTX;
 
-  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
-  location_t loc = EXPR_LOCATION (exp);
   rtx arg1_rtx = get_memory_rtx (arg1, len);
   rtx arg2_rtx = get_memory_rtx (arg2, len);
-  rtx arg3_rtx = expand_normal (fold_convert_loc (loc, sizetype, len));
+  rtx len_rtx = expand_normal (fold_convert_loc (loc, sizetype, len));
 
   /* Set MEM_SIZE as appropriate.  */
-  if (CONST_INT_P (arg3_rtx))
+  if (CONST_INT_P (len_rtx))
 {
-  set_mem_size (arg1_rtx, INTVAL (arg3_rtx));
-  set_mem_size (arg2_rtx, INTVAL 

Re: Thoughts on memcmp expansion (PR43052)

2016-01-19 Thread Jeff Law

On 01/15/2016 09:58 AM, Bernd Schmidt wrote:

PR43052 is a PR complaining about how the rep cmpsb expansion that gcc
uses for memcmp is slower than the library function. As is so often the
case, if you investigate a bit, you can find a lot of issues with the
current situation in the compiler.

This PR was accidentally fixed by a patch by Nick which disabled the use
of cmpstrnsi for memcmp expansion, on the grounds that cmpstrnsi could
stop looking after seeing a null byte, which would be invalid for
memcmp, so only cmpmemsi should be used. This fix was for an out-of-tree
target.

I believe the rep cmpsb sequence used by i386 would actually be valid,
so we could duplicate the cmpstrn pattern to also match cmpmem and be
done - but that would then again cause the performance problem described
in the PR, so it's probably not a good idea.

One question Richard posed in the comments: why aren't we optimizing
small constant size memcmps other than size 1 to *s == *q? The reason is
the return value of memcmp, which implies byte-sized operation
(incidentally, the use of SImode in the cmpmem/cmpstr patterns is really
odd). It's possible to work around this, but expansion becomes a little
more tricky (subtract after bswap, maybe). Still, the current code
generation is lame.

So, for gcc-6, I think we shouldn't do anything. The PR is fixed, and
there's no easy bug-fix that can be done to improve matters. Not sure
whether to keep the PR open or create a new one for the remaining
issues. For the next stage1, I'm attaching a proof-of-concept patch that
does the following:
  * notice if memcmp results are only used for equality comparison
against zero
  * if so, replace with a different builtin __memcmp_eq
  * Expand __memcmp_eq for small constant sizes with loads and
comparison, fall back to a memcmp call.

The whole thing could be extended to work for sizes larger than an int,
along the lines of memcpy expansion controlled by move ratio etc. Thoughts?
Seems generally reasonable to me.  I've looked at that BZ more than once 
and thought "ewww, I don't want to touch this mess", so I'm more than 
happy to have you taking the lead on it.


Jeff


Re: Thoughts on memcmp expansion (PR43052)

2016-01-18 Thread Richard Biener
On Fri, Jan 15, 2016 at 5:58 PM, Bernd Schmidt  wrote:
> PR43052 is a PR complaining about how the rep cmpsb expansion that gcc uses
> for memcmp is slower than the library function. As is so often the case, if
> you investigate a bit, you can find a lot of issues with the current
> situation in the compiler.
>
> This PR was accidentally fixed by a patch by Nick which disabled the use of
> cmpstrnsi for memcmp expansion, on the grounds that cmpstrnsi could stop
> looking after seeing a null byte, which would be invalid for memcmp, so only
> cmpmemsi should be used. This fix was for an out-of-tree target.
>
> I believe the rep cmpsb sequence used by i386 would actually be valid, so we
> could duplicate the cmpstrn pattern to also match cmpmem and be done - but
> that would then again cause the performance problem described in the PR, so
> it's probably not a good idea.
>
> One question Richard posed in the comments: why aren't we optimizing small
> constant size memcmps other than size 1 to *s == *q? The reason is the
> return value of memcmp, which implies byte-sized operation (incidentally,
> the use of SImode in the cmpmem/cmpstr patterns is really odd). It's
> possible to work around this, but expansion becomes a little more tricky
> (subtract after bswap, maybe). Still, the current code generation is lame.
>
> So, for gcc-6, I think we shouldn't do anything. The PR is fixed, and
> there's no easy bug-fix that can be done to improve matters. Not sure
> whether to keep the PR open or create a new one for the remaining issues.
> For the next stage1, I'm attaching a proof-of-concept patch that does the
> following:
>  * notice if memcmp results are only used for equality comparison
>against zero
>  * if so, replace with a different builtin __memcmp_eq
>  * Expand __memcmp_eq for small constant sizes with loads and
>comparison, fall back to a memcmp call.
>
> The whole thing could be extended to work for sizes larger than an int,
> along the lines of memcpy expansion controlled by move ratio etc. Thoughts?

See also https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52171 - the
inline expansion
for small sizes and equality compares should be done on GIMPLE.  Today the
strlen pass might be an appropriate place to do this given its
superior knowledge
about string lengths.

The idea of turning eq feeding memcmp into a special memcmp_eq is good but
you have to avoid doing that too early - otherwise you'd lose on

  res = memcmp (p, q, sz);
  if (memcmp (p, q, sz) == 0)
   ...

that is, you have to make sure CSE got the chance to common the two calls.
This is why I think this kind of transform needs to happen in specific places
(like during strlen opt) rather than in generic folding.

Richard.

>
> Bernd


Re: Thoughts on memcmp expansion (PR43052)

2016-01-18 Thread Nick Clifton

Hi Bernd,

+  rtx op0 = force_reg (direct_mode, arg1_rtx);
+  rtx op1 = force_reg (direct_mode, arg2_rtx);
+  rtx tem = emit_store_flag (target, NE, op0, op1,
+direct_mode, true, false);

This is me being ignorant here... wouldn't it be easier to have a new 
cmpmem_eq pattern (and resulting optab) than to generate this code 
sequence directly ?  That way backends can choose to support this 
optimization, and if they do, they can also choose to support longer 
lengths of comparison.



 DEF_LIB_BUILTIN(BUILT_IN_MEMCMP, "memcmp", 
BT_FN_INT_CONST_PTR_CONST_PTR_SIZE, ATTR_PURE_NOTHROW_NONNULL_LEAF)
+DEF_GCC_BUILTIN(BUILT_IN_MEMCMP_EQ, "__memcmp_eq", 
BT_FN_INT_CONST_PTR_CONST_PTR_SIZE, ATTR_PURE_NOTHROW_NONNULL_LEAF)
 DEF_LIB_BUILTIN_CHKP   (BUILT_IN_MEMCPY, "memcpy", 
BT_FN_PTR_PTR_CONST_PTR_SIZE, ATTR_RET1_NOTHROW_NONNULL_LEAF)


Presumably you would also document this new builtin in doc/extend.texi ? 
 Plus maybe add a testcase for it as well ?



+  /* If the return value is used, don't do the transformation.  */

This comment struck me as wrong.  Surely if the return value is not used 
then the entire memcmp can be transformed into nothing.  Plus if the 
return value is used, but only for an equality comparison with zero then 
the transformation can take place.



Cheers
  Nick


Thoughts on memcmp expansion (PR43052)

2016-01-15 Thread Bernd Schmidt
PR43052 is a PR complaining about how the rep cmpsb expansion that gcc 
uses for memcmp is slower than the library function. As is so often the 
case, if you investigate a bit, you can find a lot of issues with the 
current situation in the compiler.


This PR was accidentally fixed by a patch by Nick which disabled the use 
of cmpstrnsi for memcmp expansion, on the grounds that cmpstrnsi could 
stop looking after seeing a null byte, which would be invalid for 
memcmp, so only cmpmemsi should be used. This fix was for an out-of-tree 
target.


I believe the rep cmpsb sequence used by i386 would actually be valid, 
so we could duplicate the cmpstrn pattern to also match cmpmem and be 
done - but that would then again cause the performance problem described 
in the PR, so it's probably not a good idea.


One question Richard posed in the comments: why aren't we optimizing 
small constant size memcmps other than size 1 to *s == *q? The reason is 
the return value of memcmp, which implies byte-sized operation 
(incidentally, the use of SImode in the cmpmem/cmpstr patterns is really 
odd). It's possible to work around this, but expansion becomes a little 
more tricky (subtract after bswap, maybe). Still, the current code 
generation is lame.
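
To make that concrete, for a 4-byte block on a little-endian target the 
workaround amounts to computing something like the following (a plain C 
sketch of what the expansion would produce, not the RTL itself):

  #include <stdint.h>
  #include <string.h>

  /* Ordered 4-byte memcmp: convert both words to big-endian so integer
     order matches lexicographic byte order, then take the sign of a
     wider subtraction.  Assumes a little-endian target.  */
  static int
  memcmp4_ordered (const void *p, const void *q)
  {
    uint32_t a, b;
    memcpy (&a, p, 4);
    memcpy (&b, q, 4);
    a = __builtin_bswap32 (a);
    b = __builtin_bswap32 (b);
    int64_t d = (int64_t) a - (int64_t) b;
    return (d > 0) - (d < 0);   /* only the sign is specified anyway */
  }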


So, for gcc-6, I think we shouldn't do anything. The PR is fixed, and 
there's no easy bug-fix that can be done to improve matters. Not sure 
whether to keep the PR open or create a new one for the remaining 
issues. For the next stage1, I'm attaching a proof-of-concept patch that 
does the following:

 * notice if memcmp results are only used for equality comparison
   against zero
 * if so, replace with a different builtin __memcmp_eq
 * Expand __memcmp_eq for small constant sizes with loads and
   comparison, fall back to a memcmp call.

The whole thing could be extended to work for sizes larger than an int, 
along the lines of memcpy expansion controlled by move ratio etc. Thoughts?
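
For the equality-only case the intent is that a small constant size 
collapses to a single wide load of each argument and one comparison, 
roughly as follows (again a plain C sketch, assuming an 8-byte size and 
a target with cheap unaligned loads):

  #include <stdint.h>
  #include <string.h>

  static int
  memcmp_eq8 (const void *p, const void *q)
  {
    uint64_t a, b;
    memcpy (&a, p, 8);          /* becomes a single 8-byte load */
    memcpy (&b, q, 8);
    return a != b;              /* caller only tests == 0 / != 0 */
  }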



Bernd
Index: gcc/builtins.c
===
--- gcc/builtins.c	(revision 232359)
+++ gcc/builtins.c	(working copy)
@@ -3699,25 +3699,22 @@ expand_cmpstrn_or_cmpmem (insn_code icod
 
 /* Expand expression EXP, which is a call to the memcmp built-in function.
Return NULL_RTX if we failed and the caller should emit a normal call,
-   otherwise try to get the result in TARGET, if convenient.  */
+   otherwise try to get the result in TARGET, if convenient.
+   RESULT_EQ is true if we can relax the returned value to be either zero
+   or nonzero, without caring about the sign.  */
 
 static rtx
-expand_builtin_memcmp (tree exp, rtx target)
+expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
 {
   if (!validate_arglist (exp,
  			 POINTER_TYPE, POINTER_TYPE, INTEGER_TYPE, VOID_TYPE))
 return NULL_RTX;
 
-  /* Note: The cmpstrnsi pattern, if it exists, is not suitable for
- implementing memcmp because it will stop if it encounters two
- zero bytes.  */
-  insn_code icode = direct_optab_handler (cmpmem_optab, SImode);
-  if (icode == CODE_FOR_nothing)
-return NULL_RTX;
-
   tree arg1 = CALL_EXPR_ARG (exp, 0);
   tree arg2 = CALL_EXPR_ARG (exp, 1);
   tree len = CALL_EXPR_ARG (exp, 2);
+  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
+  location_t loc = EXPR_LOCATION (exp);
 
   unsigned int arg1_align = get_pointer_alignment (arg1) / BITS_PER_UNIT;
   unsigned int arg2_align = get_pointer_alignment (arg2) / BITS_PER_UNIT;
@@ -3725,12 +3722,27 @@ expand_builtin_memcmp (tree exp, rtx tar
   /* If we don't have POINTER_TYPE, call the function.  */
   if (arg1_align == 0 || arg2_align == 0)
 return NULL_RTX;
+  unsigned int min_align = MIN (arg1_align, arg2_align);
+
+  /* Note: The cmpstrnsi pattern, if it exists, is not suitable for
+ implementing memcmp because it will stop if it encounters two
+ zero bytes.  */
+  insn_code icode = direct_optab_handler (cmpmem_optab, SImode);
+
+  rtx arg3_rtx = expand_normal (fold_convert_loc (loc, sizetype, len));
+  machine_mode direct_mode = VOIDmode;
+  if (CONST_INT_P (arg3_rtx))
+direct_mode = mode_for_size (INTVAL (arg3_rtx) * BITS_PER_UNIT,
+ MODE_INT, 1);
+  if (icode == CODE_FOR_nothing
+  && (!result_eq
+	  || direct_mode == VOIDmode
+	  || direct_mode == BLKmode
+	  || GET_MODE_ALIGNMENT (direct_mode) / BITS_PER_UNIT > min_align))
+return NULL_RTX;
 
-  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
-  location_t loc = EXPR_LOCATION (exp);
   rtx arg1_rtx = get_memory_rtx (arg1, len);
   rtx arg2_rtx = get_memory_rtx (arg2, len);
-  rtx arg3_rtx = expand_normal (fold_convert_loc (loc, sizetype, len));
 
   /* Set MEM_SIZE as appropriate.  */
   if (CONST_INT_P (arg3_rtx))
@@ -3739,6 +3751,27 @@ expand_builtin_memcmp (tree exp, rtx tar
   set_mem_size (arg2_rtx, INTVAL (arg3_rtx));
 }
 
+  if (icode == CODE_FOR_nothing)
+{
+  arg1_rtx = change_address (arg1_rtx, direct_mode, XEXP (arg1_rtx, 0));
+  arg2_rtx =