PR43052 complains that the rep cmpsb expansion gcc uses for memcmp is
slower than the library function. As is so often the case, if you
investigate a bit, you can find a lot of issues with the current
situation in the compiler.
This PR was accidentally fixed by a patch from Nick which disabled the
use of cmpstrnsi for memcmp expansion, on the grounds that cmpstrnsi is
allowed to stop looking after seeing a null byte — which would be
invalid for memcmp — so only cmpmemsi should be used. That fix was
motivated by an out-of-tree target.
I believe the rep cmpsb sequence used by i386 would actually be valid,
so we could duplicate the cmpstrn pattern to also match cmpmem and be
done - but that would then again cause the performance problem described
in the PR, so it's probably not a good idea.
One question Richard posed in the comments: why aren't we optimizing
small constant size memcmps other than size 1 to *s == *q? The reason is
the return value of memcmp, which is defined in terms of the first
differing byte and therefore implies byte-sized operation
(incidentally, the use of SImode in the cmpmem/cmpstr patterns is really
odd). It's possible to work around this, but expansion becomes a little
more tricky (a subtract after a bswap, maybe). Still, the current code
generation is lame.
So, for gcc-6, I think we shouldn't do anything. The PR is fixed, and
there's no easy bug-fix that can be done to improve matters. Not sure
whether to keep the PR open or create a new one for the remaining
issues. For the next stage1, I'm attaching a proof-of-concept patch that
does the following:
* notice if memcmp results are only used for equality comparison
against zero
* if so, replace with a different builtin __memcmp_eq
* Expand __memcmp_eq for small constant sizes with loads and
comparison, fall back to a memcmp call.
The whole thing could be extended to work for sizes larger than an int,
along the lines of memcpy expansion controlled by move ratio etc. Thoughts?
Bernd
Index: gcc/builtins.c
===================================================================
--- gcc/builtins.c (revision 232359)
+++ gcc/builtins.c (working copy)
@@ -3699,25 +3699,22 @@ expand_cmpstrn_or_cmpmem (insn_code icod
/* Expand expression EXP, which is a call to the memcmp built-in function.
Return NULL_RTX if we failed and the caller should emit a normal call,
- otherwise try to get the result in TARGET, if convenient. */
+ otherwise try to get the result in TARGET, if convenient.
+ RESULT_EQ is true if we can relax the returned value to be either zero
+ or nonzero, without caring about the sign. */
static rtx
-expand_builtin_memcmp (tree exp, rtx target)
+expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
{
if (!validate_arglist (exp,
POINTER_TYPE, POINTER_TYPE, INTEGER_TYPE, VOID_TYPE))
return NULL_RTX;
- /* Note: The cmpstrnsi pattern, if it exists, is not suitable for
- implementing memcmp because it will stop if it encounters two
- zero bytes. */
- insn_code icode = direct_optab_handler (cmpmem_optab, SImode);
- if (icode == CODE_FOR_nothing)
- return NULL_RTX;
-
tree arg1 = CALL_EXPR_ARG (exp, 0);
tree arg2 = CALL_EXPR_ARG (exp, 1);
tree len = CALL_EXPR_ARG (exp, 2);
+ machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
+ location_t loc = EXPR_LOCATION (exp);
unsigned int arg1_align = get_pointer_alignment (arg1) / BITS_PER_UNIT;
unsigned int arg2_align = get_pointer_alignment (arg2) / BITS_PER_UNIT;
@@ -3725,12 +3722,27 @@ expand_builtin_memcmp (tree exp, rtx tar
/* If we don't have POINTER_TYPE, call the function. */
if (arg1_align == 0 || arg2_align == 0)
return NULL_RTX;
+ unsigned int min_align = MIN (arg1_align, arg2_align);
+
+ /* Note: The cmpstrnsi pattern, if it exists, is not suitable for
+ implementing memcmp because it will stop if it encounters two
+ zero bytes. */
+ insn_code icode = direct_optab_handler (cmpmem_optab, SImode);
+
+ rtx arg3_rtx = expand_normal (fold_convert_loc (loc, sizetype, len));
+ machine_mode direct_mode = VOIDmode;
+ if (CONST_INT_P (arg3_rtx))
+ direct_mode = mode_for_size (INTVAL (arg3_rtx) * BITS_PER_UNIT,
+ MODE_INT, 1);
+ if (icode == CODE_FOR_nothing
+ && (!result_eq
+ || direct_mode == VOIDmode
+ || direct_mode == BLKmode
+ || GET_MODE_ALIGNMENT (direct_mode) / BITS_PER_UNIT > min_align))
+ return NULL_RTX;
- machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
- location_t loc = EXPR_LOCATION (exp);
rtx arg1_rtx = get_memory_rtx (arg1, len);
rtx arg2_rtx = get_memory_rtx (arg2, len);
- rtx arg3_rtx = expand_normal (fold_convert_loc (loc, sizetype, len));
/* Set MEM_SIZE as appropriate. */
if (CONST_INT_P (arg3_rtx))
@@ -3739,6 +3751,27 @@ expand_builtin_memcmp (tree exp, rtx tar
set_mem_size (arg2_rtx, INTVAL (arg3_rtx));
}
+ if (icode == CODE_FOR_nothing)
+ {
+ arg1_rtx = change_address (arg1_rtx, direct_mode, XEXP (arg1_rtx, 0));
+ arg2_rtx = change_address (arg2_rtx, direct_mode, XEXP (arg2_rtx, 0));
+ start_sequence ();
+ rtx op0 = force_reg (direct_mode, arg1_rtx);
+ rtx op1 = force_reg (direct_mode, arg2_rtx);
+ rtx tem = emit_store_flag (target, NE, op0, op1,
+ direct_mode, true, false);
+ rtx seq = get_insns ();
+ end_sequence ();
+ if (tem != 0)
+ {
+ emit_insn (seq);
+ return target;
+ }
+ }
+
+ if (icode == CODE_FOR_nothing)
+ return NULL_RTX;
+
rtx result = expand_cmpstrn_or_cmpmem (icode, target, arg1_rtx, arg2_rtx,
TREE_TYPE (len), arg3_rtx,
MIN (arg1_align, arg2_align));
@@ -5997,9 +6030,16 @@ expand_builtin (tree exp, rtx target, rt
case BUILT_IN_BCMP:
case BUILT_IN_MEMCMP:
- target = expand_builtin_memcmp (exp, target);
+ case BUILT_IN_MEMCMP_EQ:
+ target = expand_builtin_memcmp (exp, target, fcode == BUILT_IN_MEMCMP_EQ);
if (target)
return target;
+ if (fcode == BUILT_IN_MEMCMP_EQ)
+ {
+ exp = copy_node (exp);
+ tree newdecl = builtin_decl_explicit (BUILT_IN_MEMCMP);
+ TREE_OPERAND (exp, 1) = build_fold_addr_expr (newdecl);
+ }
break;
case BUILT_IN_SETJMP:
Index: gcc/builtins.def
===================================================================
--- gcc/builtins.def (revision 232359)
+++ gcc/builtins.def (working copy)
@@ -617,6 +617,7 @@ DEF_EXT_LIB_BUILTIN (BUILT_IN_BZERO,
DEF_EXT_LIB_BUILTIN (BUILT_IN_INDEX, "index", BT_FN_STRING_CONST_STRING_INT, ATTR_PURE_NOTHROW_NONNULL_LEAF)
DEF_LIB_BUILTIN (BUILT_IN_MEMCHR, "memchr", BT_FN_PTR_CONST_PTR_INT_SIZE, ATTR_PURE_NOTHROW_NONNULL_LEAF)
DEF_LIB_BUILTIN (BUILT_IN_MEMCMP, "memcmp", BT_FN_INT_CONST_PTR_CONST_PTR_SIZE, ATTR_PURE_NOTHROW_NONNULL_LEAF)
+DEF_GCC_BUILTIN (BUILT_IN_MEMCMP_EQ, "__memcmp_eq", BT_FN_INT_CONST_PTR_CONST_PTR_SIZE, ATTR_PURE_NOTHROW_NONNULL_LEAF)
DEF_LIB_BUILTIN_CHKP (BUILT_IN_MEMCPY, "memcpy", BT_FN_PTR_PTR_CONST_PTR_SIZE, ATTR_RET1_NOTHROW_NONNULL_LEAF)
DEF_LIB_BUILTIN_CHKP (BUILT_IN_MEMMOVE, "memmove", BT_FN_PTR_PTR_CONST_PTR_SIZE, ATTR_RET1_NOTHROW_NONNULL_LEAF)
DEF_EXT_LIB_BUILTIN_CHKP (BUILT_IN_MEMPCPY, "mempcpy", BT_FN_PTR_PTR_CONST_PTR_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
Index: gcc/gimple-fold.c
===================================================================
--- gcc/gimple-fold.c (revision 232359)
+++ gcc/gimple-fold.c (working copy)
@@ -2539,13 +2539,52 @@ gimple_fold_builtin_fprintf (gimple_stmt
return false;
}
+/* Fold a call to the memcmp builtin. ARG0, ARG1 and ARG2 are the
+ arguments to the call.
+
+ Return false if no simplification was possible, otherwise return
+ true and replace the call with a simplified form. */
+
+static bool
+gimple_fold_builtin_memcmp (gimple_stmt_iterator *gsi, tree arg0, tree arg1, tree arg2)
+{
+ gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
+
+ /* If the return value is used, don't do the transformation. */
+ tree res = gimple_call_lhs (stmt);
+ if (res == NULL_TREE || TREE_CODE (res) != SSA_NAME)
+ return false;
+
+ imm_use_iterator iter;
+ use_operand_p use_p;
+ FOR_EACH_IMM_USE_FAST (use_p, iter, res)
+ {
+ gimple *ustmt = USE_STMT (use_p);
+
+ if (gimple_code (ustmt) != GIMPLE_ASSIGN)
+ return false;
+ gassign *asgn = as_a <gassign *> (ustmt);
+ tree_code code = gimple_assign_rhs_code (asgn);
+ if ((code != EQ_EXPR && code != NE_EXPR)
+ || !integer_zerop (gimple_assign_rhs2 (asgn)))
+ return false;
+ }
+
+ tree newfn = builtin_decl_explicit (BUILT_IN_MEMCMP_EQ);
+ gcall *repl = gimple_build_call (newfn, 3, arg0, arg1, arg2);
+ gimple_call_set_lhs (repl, res);
+ gimple_set_vuse (repl, gimple_vuse (stmt));
+ replace_call_with_call_and_fold (gsi, repl);
+ return true;
+}
+
/* Fold a call to the {,v}printf{,_unlocked} and __{,v}printf_chk builtins.
FMT and ARG are the arguments to the call; we don't fold cases with
more than 2 arguments, and ARG may be null if this is a 1-argument case.
+ FCODE is the BUILT_IN_* code of the function to be simplified.
- Return NULL_TREE if no simplification was possible, otherwise return the
- simplified form of the call as a tree. FCODE is the BUILT_IN_*
- code of the function to be simplified. */
+ Return false if no simplification was possible, otherwise return
+ true and replace the call with a simplified form. */
static bool
gimple_fold_builtin_printf (gimple_stmt_iterator *gsi, tree fmt,
@@ -2796,6 +2835,10 @@ gimple_fold_builtin (gimple_stmt_iterato
case BUILT_IN_MEMMOVE:
return gimple_fold_builtin_memory_op (gsi, gimple_call_arg (stmt, 0),
gimple_call_arg (stmt, 1), 3);
+ case BUILT_IN_MEMCMP:
+ return gimple_fold_builtin_memcmp (gsi, gimple_call_arg (stmt, 0),
+ gimple_call_arg (stmt, 1),
+ gimple_call_arg (stmt, 2));
case BUILT_IN_SPRINTF_CHK:
case BUILT_IN_VSPRINTF_CHK:
return gimple_fold_builtin_sprintf_chk (gsi, fcode);