PR43052 complains that the rep cmpsb expansion gcc uses for memcmp is slower than the library function. As is so often the case, investigating a bit turns up a number of other issues with the current situation in the compiler.

This PR was accidentally fixed by a patch from Nick that disabled the use of cmpstrnsi for memcmp expansion, on the grounds that cmpstrnsi could stop looking after seeing a null byte, which would be invalid for memcmp, so only cmpmemsi should be used. That fix was made for an out-of-tree target.

I believe the rep cmpsb sequence used by i386 would actually be valid, so we could duplicate the cmpstrn pattern to also match cmpmem and be done - but that would then again cause the performance problem described in the PR, so it's probably not a good idea.

One question Richard posed in the comments: why aren't we optimizing small constant-size memcmps other than size 1 to *s == *q? The reason is the return value of memcmp, which implies a byte-sized comparison (incidentally, the use of SImode in the cmpmem/cmpstr patterns is really odd). It's possible to work around this, but expansion becomes a little more tricky (subtract after bswap, maybe). Still, the current code generation is lame.
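
To illustrate the ordering problem (a hedged sketch, not part of the patch: it assumes a little-endian target and uses hypothetical helper names), a single wide compare preserves equality but not ordering, which is why a bswap is needed before any subtract-style result:

  #include <stdint.h>
  #include <string.h>

  /* Equality-only: a wide compare is enough, the sign is meaningless.  */
  int
  memcmp4_eq (const void *p, const void *q)
  {
    uint32_t a, b;
    memcpy (&a, p, 4);	/* memcpy avoids alignment assumptions */
    memcpy (&b, q, 4);
    return a != b;	/* zero iff equal; ordering information is lost */
  }

  /* Ordered result: memcmp compares the first byte in memory first, so
     on little-endian the loaded words must be byte-swapped before the
     unsigned compare.  A plain subtract could overflow int, so use a
     compare instead.  On big-endian no swap would be needed.  */
  int
  memcmp4_ordered (const void *p, const void *q)
  {
    uint32_t a, b;
    memcpy (&a, p, 4);
    memcpy (&b, q, 4);
    a = __builtin_bswap32 (a);
    b = __builtin_bswap32 (b);
    return (a > b) - (a < b);
  }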

So, for gcc-6, I think we shouldn't do anything. The PR is fixed, and there's no easy bug-fix that can be done to improve matters. Not sure whether to keep the PR open or create a new one for the remaining issues. For the next stage1, I'm attaching a proof-of-concept patch that does the following:
 * Notice if memcmp results are only used for equality comparison
   against zero.
 * If so, replace the call with a different builtin, __memcmp_eq.
 * Expand __memcmp_eq for small constant sizes with loads and a
   comparison, falling back to a memcmp call otherwise (see the
   sketch after this list).
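
To make the intent concrete, here's roughly what the transformation does at the source level (a sketch only; __memcmp_eq is generated internally and never user-visible, and do_something is a hypothetical placeholder):

  #include <stdint.h>
  #include <string.h>

  extern void do_something (void);	/* hypothetical */

  void
  before (const void *p, const void *q)
  {
    if (memcmp (p, q, 8) == 0)	/* only the zero/nonzero result is used */
      do_something ();
  }

  /* After the gimple fold, conceptually:

       if (__memcmp_eq (p, q, 8) == 0)
         do_something ();

     and, given sufficient alignment, the expander turns that into a
     single wide load and compare:  */

  void
  after (const uint64_t *p, const uint64_t *q)
  {
    if (*p == *q)	/* store-flag result instead of a libcall */
      do_something ();
  }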

The whole thing could be extended to work for sizes larger than an int, along the lines of memcpy expansion controlled by move ratio etc. Thoughts?
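
For comparison, a sketch of what such an extended expansion might produce for a larger constant size (assumptions: 64-bit target, size 24, alignment known; the loop stands in for what would really be a fully unrolled sequence, with the size cutoff driven by something like MOVE_RATIO):

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  /* Hypothetical expansion of memcmp (p, q, 24) != 0.  */
  int
  memcmp24_ne (const void *p, const void *q)
  {
    const char *cp = p;
    const char *cq = q;
    int ne = 0;
    for (size_t i = 0; i < 24; i += 8)
      {
        uint64_t a, b;
        memcpy (&a, cp + i, 8);
        memcpy (&b, cq + i, 8);
        ne |= (a != b);
      }
    return ne;	/* nonzero iff the blocks differ, like memcmp != 0 */
  }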


Bernd
Index: gcc/builtins.c
===================================================================
--- gcc/builtins.c	(revision 232359)
+++ gcc/builtins.c	(working copy)
@@ -3699,25 +3699,22 @@ expand_cmpstrn_or_cmpmem (insn_code icod
 
 /* Expand expression EXP, which is a call to the memcmp built-in function.
    Return NULL_RTX if we failed and the caller should emit a normal call,
-   otherwise try to get the result in TARGET, if convenient.  */
+   otherwise try to get the result in TARGET, if convenient.
+   RESULT_EQ is true if we can relax the returned value to be either zero
+   or nonzero, without caring about the sign.  */
 
 static rtx
-expand_builtin_memcmp (tree exp, rtx target)
+expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
 {
   if (!validate_arglist (exp,
  			 POINTER_TYPE, POINTER_TYPE, INTEGER_TYPE, VOID_TYPE))
     return NULL_RTX;
 
-  /* Note: The cmpstrnsi pattern, if it exists, is not suitable for
-     implementing memcmp because it will stop if it encounters two
-     zero bytes.  */
-  insn_code icode = direct_optab_handler (cmpmem_optab, SImode);
-  if (icode == CODE_FOR_nothing)
-    return NULL_RTX;
-
   tree arg1 = CALL_EXPR_ARG (exp, 0);
   tree arg2 = CALL_EXPR_ARG (exp, 1);
   tree len = CALL_EXPR_ARG (exp, 2);
+  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
+  location_t loc = EXPR_LOCATION (exp);
 
   unsigned int arg1_align = get_pointer_alignment (arg1) / BITS_PER_UNIT;
   unsigned int arg2_align = get_pointer_alignment (arg2) / BITS_PER_UNIT;
@@ -3725,12 +3722,27 @@ expand_builtin_memcmp (tree exp, rtx tar
   /* If we don't have POINTER_TYPE, call the function.  */
   if (arg1_align == 0 || arg2_align == 0)
     return NULL_RTX;
+  unsigned int min_align = MIN (arg1_align, arg2_align);
+
+  /* Note: The cmpstrnsi pattern, if it exists, is not suitable for
+     implementing memcmp because it will stop if it encounters two
+     zero bytes.  */
+  insn_code icode = direct_optab_handler (cmpmem_optab, SImode);
+
+  rtx arg3_rtx = expand_normal (fold_convert_loc (loc, sizetype, len));
+  machine_mode direct_mode = VOIDmode;
+  if (CONST_INT_P (arg3_rtx))
+    direct_mode = mode_for_size (INTVAL (arg3_rtx) * BITS_PER_UNIT,
+				 MODE_INT, 1);
+  if (icode == CODE_FOR_nothing
+      && (!result_eq
+	  || direct_mode == VOIDmode
+	  || direct_mode == BLKmode
+	  || GET_MODE_ALIGNMENT (direct_mode) / BITS_PER_UNIT > min_align))
+    return NULL_RTX;
 
-  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
-  location_t loc = EXPR_LOCATION (exp);
   rtx arg1_rtx = get_memory_rtx (arg1, len);
   rtx arg2_rtx = get_memory_rtx (arg2, len);
-  rtx arg3_rtx = expand_normal (fold_convert_loc (loc, sizetype, len));
 
   /* Set MEM_SIZE as appropriate.  */
   if (CONST_INT_P (arg3_rtx))
@@ -3739,6 +3751,27 @@ expand_builtin_memcmp (tree exp, rtx tar
       set_mem_size (arg2_rtx, INTVAL (arg3_rtx));
     }
 
+  if (icode == CODE_FOR_nothing)
+    {
+      arg1_rtx = change_address (arg1_rtx, direct_mode, XEXP (arg1_rtx, 0));
+      arg2_rtx = change_address (arg2_rtx, direct_mode, XEXP (arg2_rtx, 0));
+      start_sequence ();
+      rtx op0 = force_reg (direct_mode, arg1_rtx);
+      rtx op1 = force_reg (direct_mode, arg2_rtx);
+      rtx tem = emit_store_flag (target, NE, op0, op1,
+				 direct_mode, true, false);
+      rtx seq = get_insns ();
+      end_sequence ();
+      if (tem != 0)
+	{
+	  emit_insn (seq);
+	  return target;
+	}
+    }
+
+  if (icode == CODE_FOR_nothing)
+    return NULL_RTX;
+
   rtx result = expand_cmpstrn_or_cmpmem (icode, target, arg1_rtx, arg2_rtx,
 					 TREE_TYPE (len), arg3_rtx,
 					 MIN (arg1_align, arg2_align));
@@ -5997,9 +6030,16 @@ expand_builtin (tree exp, rtx target, rt
 
     case BUILT_IN_BCMP:
     case BUILT_IN_MEMCMP:
-      target = expand_builtin_memcmp (exp, target);
+    case BUILT_IN_MEMCMP_EQ:
+      target = expand_builtin_memcmp (exp, target, fcode == BUILT_IN_MEMCMP_EQ);
       if (target)
 	return target;
+      if (fcode == BUILT_IN_MEMCMP_EQ)
+	{
+	  exp = copy_node (exp);
+	  tree newdecl = builtin_decl_explicit (BUILT_IN_MEMCMP);
+	  TREE_OPERAND (exp, 1) = build_fold_addr_expr (newdecl);
+	}
       break;
 
     case BUILT_IN_SETJMP:
Index: gcc/builtins.def
===================================================================
--- gcc/builtins.def	(revision 232359)
+++ gcc/builtins.def	(working copy)
@@ -617,6 +617,7 @@ DEF_EXT_LIB_BUILTIN    (BUILT_IN_BZERO,
 DEF_EXT_LIB_BUILTIN    (BUILT_IN_INDEX, "index", BT_FN_STRING_CONST_STRING_INT, ATTR_PURE_NOTHROW_NONNULL_LEAF)
 DEF_LIB_BUILTIN        (BUILT_IN_MEMCHR, "memchr", BT_FN_PTR_CONST_PTR_INT_SIZE, ATTR_PURE_NOTHROW_NONNULL_LEAF)
 DEF_LIB_BUILTIN        (BUILT_IN_MEMCMP, "memcmp", BT_FN_INT_CONST_PTR_CONST_PTR_SIZE, ATTR_PURE_NOTHROW_NONNULL_LEAF)
+DEF_GCC_BUILTIN        (BUILT_IN_MEMCMP_EQ, "__memcmp_eq", BT_FN_INT_CONST_PTR_CONST_PTR_SIZE, ATTR_PURE_NOTHROW_NONNULL_LEAF)
 DEF_LIB_BUILTIN_CHKP   (BUILT_IN_MEMCPY, "memcpy", BT_FN_PTR_PTR_CONST_PTR_SIZE, ATTR_RET1_NOTHROW_NONNULL_LEAF)
 DEF_LIB_BUILTIN_CHKP   (BUILT_IN_MEMMOVE, "memmove", BT_FN_PTR_PTR_CONST_PTR_SIZE, ATTR_RET1_NOTHROW_NONNULL_LEAF)
 DEF_EXT_LIB_BUILTIN_CHKP (BUILT_IN_MEMPCPY, "mempcpy", BT_FN_PTR_PTR_CONST_PTR_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
Index: gcc/gimple-fold.c
===================================================================
--- gcc/gimple-fold.c	(revision 232359)
+++ gcc/gimple-fold.c	(working copy)
@@ -2539,13 +2539,52 @@ gimple_fold_builtin_fprintf (gimple_stmt
   return false;
 }
 
+/* Fold a call to the memcmp builtin.  ARG0, ARG1 and ARG2 are the
+   arguments to the call.
+
+   Return false if no simplification was possible, otherwise return
+   true and replace the call with a simplified form.  */
+
+static bool
+gimple_fold_builtin_memcmp (gimple_stmt_iterator *gsi, tree arg0, tree arg1, tree arg2)
+{
+  gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
+
+  /* If the return value is used, don't do the transformation.  */
+  tree res = gimple_call_lhs (stmt);
+  if (res == NULL_TREE || TREE_CODE (res) != SSA_NAME)
+    return false;
+
+  imm_use_iterator iter;
+  use_operand_p use_p;
+  FOR_EACH_IMM_USE_FAST (use_p, iter, res)
+    {
+      gimple *ustmt = USE_STMT (use_p);
+
+      if (gimple_code (ustmt) != GIMPLE_ASSIGN)
+	return false;
+      gassign *asgn = as_a <gassign *> (ustmt);
+      tree_code code = gimple_assign_rhs_code (asgn);
+      if ((code != EQ_EXPR && code != NE_EXPR)
+	  || !integer_zerop (gimple_assign_rhs2 (asgn)))
+	return false;
+    }
+
+  tree newfn = builtin_decl_explicit (BUILT_IN_MEMCMP_EQ);
+  gcall *repl = gimple_build_call (newfn, 3, arg0, arg1, arg2);
+  gimple_call_set_lhs (repl, res);
+  gimple_set_vuse (repl, gimple_vuse (stmt));
+  replace_call_with_call_and_fold (gsi, repl);
+  return true;
+}
+
 /* Fold a call to the {,v}printf{,_unlocked} and __{,v}printf_chk builtins.
    FMT and ARG are the arguments to the call; we don't fold cases with
    more than 2 arguments, and ARG may be null if this is a 1-argument case.
+   FCODE is the BUILT_IN_* code of the function to be simplified.
 
-   Return NULL_TREE if no simplification was possible, otherwise return the
-   simplified form of the call as a tree.  FCODE is the BUILT_IN_*
-   code of the function to be simplified.  */
+   Return false if no simplification was possible, otherwise return
+   true and replace the call with a simplified form.    */
 
 static bool
 gimple_fold_builtin_printf (gimple_stmt_iterator *gsi, tree fmt,
@@ -2796,6 +2835,10 @@ gimple_fold_builtin (gimple_stmt_iterato
     case BUILT_IN_MEMMOVE:
       return gimple_fold_builtin_memory_op (gsi, gimple_call_arg (stmt, 0),
 					    gimple_call_arg (stmt, 1), 3);
+    case BUILT_IN_MEMCMP:
+      return gimple_fold_builtin_memcmp (gsi, gimple_call_arg (stmt, 0),
+					 gimple_call_arg (stmt, 1),
+					 gimple_call_arg (stmt, 2));
     case BUILT_IN_SPRINTF_CHK:
     case BUILT_IN_VSPRINTF_CHK:
       return gimple_fold_builtin_sprintf_chk (gsi, fcode);
