On 10/1/19 4:45 PM, Jeff Law wrote:
> On 9/27/19 12:23 PM, Aaron Sawdey wrote:
>> This is the third piece of my effort to improve inline expansion of
>> memmove().  The first two parts I posted back in June fixed the names of
>> the optab entries involved so that optab cpymem is used for memcpy() and
>> optab movmem is used for memmove().  This piece adds support for actually
>> attempting to invoke the movmem optab to do inline expansion of
>> __builtin_memmove().
>>
>> Because what needs to be done for memmove() is very similar to memcpy(),
>> I have just added a bool parm "might_overlap" to several of the functions
>> involved so the same functions can handle both.  The name might_overlap
>> comes from the fact that if we still have a memmove() call at expand
>> time, it means gimple_fold_builtin_memory_op() was not able to prove that
>> the source and destination do not overlap.
>>
>> There are a few places where might_overlap gets used to keep us from
>> trying to use the by-pieces infrastructure or generate a copy loop, as
>> neither of those things will work correctly if source and destination
>> overlap.
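
(To illustrate the hazard for the archives -- a standalone example, not
GCC code: a naive forward, piecewise copy reads bytes it has already
overwritten once the regions overlap, which is exactly what by-pieces or
a simple copy loop would do.)

/* Standalone illustration, not GCC code: a forward piecewise copy
   corrupts overlapping operands; memmove handles them correctly.  */
#include <stdio.h>
#include <string.h>

int
main (void)
{
  char a[8] = "abcdef";
  char b[8] = "abcdef";

  for (int i = 0; i < 4; i++)   /* naive forward copy of 4 bytes */
    a[i + 2] = a[i];            /* reads a[2], a[3] after clobbering them */
  memmove (b + 2, b, 4);        /* overlap-safe */

  printf ("%s\n", a);           /* "ababab" -- corrupted */
  printf ("%s\n", b);           /* "ababcd" -- correct */
  return 0;
}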
>>
>> I've restructured things slightly in emit_block_move_hints() so that we can
>> try the pattern first if we already know that by-pieces won't work. This way
>> we can bail out immediately in the might_overlap case.
>>
>> Bootstrap/regtest passed on ppc64le, in progress on x86_64.  If
>> everything passes, is this ok for trunk?
>>
>>
>> 2019-09-27  Aaron Sawdey <acsaw...@linux.ibm.com>
>>
>>      * builtins.c (expand_builtin_memory_copy_args): Add might_overlap parm.
>>      (expand_builtin_memcpy): Use might_overlap parm.
>>      (expand_builtin_mempcpy_args): Use might_overlap parm.
>>      (expand_builtin_memmove): Call expand_builtin_memory_copy_args.
>>      (expand_builtin_memory_copy_args): Add might_overlap parm.
>>      * expr.c (emit_block_move_via_cpymem): Rename to
>>      emit_block_move_via_pattern, add might_overlap parm, use cpymem
>>      or movmem optab as appropriate.
>>      (emit_block_move_hints): Add might_overlap parm, do the right
>>      thing for might_overlap==true.
>>      * expr.h (emit_block_move_hints): Update prototype.
>>
>>
>>
>>
>> Index: gcc/builtins.c
>> ===================================================================
>> --- gcc/builtins.c   (revision 276131)
>> +++ gcc/builtins.c   (working copy)
>> @@ -3894,10 +3897,11 @@
>>                      &probable_max_size);
>>    src_str = c_getstr (src);
>>
>> -  /* If SRC is a string constant and block move would be done
>> -     by pieces, we can avoid loading the string from memory
>> -     and only stored the computed constants.  */
>> -  if (src_str
>> +  /* If SRC is a string constant and block move would be done by
>> +     pieces, we can avoid loading the string from memory and only
>> +     stored the computed constants.  I'm not sure if the by pieces
>> +     method works if src/dest are overlapping, so avoid that case.  */
>> +  if (src_str && !might_overlap
> I don't think you need the check here.  c_getstr, when it returns
> something useful, is going to be returning a string constant.  Think
> read-only literals here.  I'm pretty sure overlap isn't going to be
> possible.

After some digging, I agree -- c_getstr() returns a string constant, and
that is then used to generate stores of constants.
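
(To make that concrete -- a standalone sketch, not the GCC
implementation: with a string-literal source, the store-by-pieces
expansion boils down to a sequence of constant stores, so the source
region is never loaded and overlap cannot arise.)

/* Standalone sketch, not the GCC implementation: expanding
   memcpy (dest, "hi!", 4) as constant stores never loads from the
   source, so source/destination overlap is impossible here.  */
void
expand_as_constant_stores (char *dest)
{
  dest[0] = 'h';
  dest[1] = 'i';
  dest[2] = '!';
  dest[3] = '\0';
}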

I've fixed the other issues and also fixed emit_block_move_hints() to
make use of pieces_ok instead of calling can_move_by_pieces() a second
time.  The patch I'm actually committing is below.
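
To summarize the new control flow in emit_block_move_hints(), here is a
toy model; every function and name in it is a hypothetical stand-in, and
the real code is in the expr.c hunk below:

/* Toy model of the new emit_block_move_hints() dispatch.  All names
   here are hypothetical stand-ins for the GCC internals.  */
#include <stdbool.h>
#include <stdio.h>

static bool
try_pattern (bool might_overlap)
{
  /* Pretend the target implements cpymem but not movmem.  */
  return !might_overlap;
}

static const char *
dispatch (bool pieces_ok, bool might_overlap, bool *is_move_done)
{
  *is_move_done = true;
  bool pattern_ok = false;

  /* Try the pattern first when by-pieces is not known to be usable,
     or would be unsafe because the operands may overlap.  */
  if (!pieces_ok || might_overlap)
    {
      pattern_ok = try_pattern (might_overlap);
      if (!pattern_ok && might_overlap)
        {
          /* Nothing below is safe for overlapping operands; tell the
             caller to fall back to a memmove() libcall.  */
          *is_move_done = false;
          return "bail out";
        }
    }

  if (pattern_ok)
    return "pattern";
  else if (pieces_ok)
    return "by pieces";
  else
    return "libcall or loop";
}

int
main (void)
{
  bool done;
  printf ("%s\n", dispatch (true, false, &done));   /* by pieces */
  printf ("%s\n", dispatch (false, true, &done));   /* bail out */
  return 0;
}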

Thanks for the review!

  Aaron

Index: gcc/builtins.c
===================================================================
--- gcc/builtins.c      (revision 276131)
+++ gcc/builtins.c      (working copy)
@@ -127,7 +127,8 @@
 static rtx expand_builtin_memcpy (tree, rtx);
 static rtx expand_builtin_memory_copy_args (tree dest, tree src, tree len,
                                            rtx target, tree exp,
-                                           memop_ret retmode);
+                                           memop_ret retmode,
+                                           bool might_overlap);
 static rtx expand_builtin_memmove (tree, rtx);
 static rtx expand_builtin_mempcpy (tree, rtx);
 static rtx expand_builtin_mempcpy_args (tree, tree, tree, rtx, tree, memop_ret);
@@ -3790,7 +3791,7 @@
   check_memop_access (exp, dest, src, len);

   return expand_builtin_memory_copy_args (dest, src, len, target, exp,
-                                         /*retmode=*/ RETURN_BEGIN);
+                                         /*retmode=*/ RETURN_BEGIN, false);
 }

 /* Check a call EXP to the memmove built-in for validity.
@@ -3797,7 +3798,7 @@
    Return NULL_RTX on both success and failure.  */

 static rtx
-expand_builtin_memmove (tree exp, rtx)
+expand_builtin_memmove (tree exp, rtx target)
 {
   if (!validate_arglist (exp,
                         POINTER_TYPE, POINTER_TYPE, INTEGER_TYPE, VOID_TYPE))
@@ -3809,7 +3810,8 @@

   check_memop_access (exp, dest, src, len);

-  return NULL_RTX;
+  return expand_builtin_memory_copy_args (dest, src, len, target, exp,
+                                         /*retmode=*/ RETURN_BEGIN, true);
 }

 /* Expand a call EXP to the mempcpy builtin.
@@ -3858,7 +3860,8 @@

 static rtx
 expand_builtin_memory_copy_args (tree dest, tree src, tree len,
-                                rtx target, tree exp, memop_ret retmode)
+                                rtx target, tree exp, memop_ret retmode,
+                                bool might_overlap)
 {
   const char *src_str;
   unsigned int src_align = get_pointer_alignment (src);
@@ -3894,9 +3897,12 @@
                        &probable_max_size);
   src_str = c_getstr (src);

-  /* If SRC is a string constant and block move would be done
-     by pieces, we can avoid loading the string from memory
-     and only stored the computed constants.  */
+  /* If SRC is a string constant and block move would be done by
+     pieces, we can avoid loading the string from memory and only
+     store the computed constants.  This works in the overlap
+     (memmove) case as well because store_by_pieces just generates a
+     series of stores of constants from the string constant returned
+     by c_getstr().  */
   if (src_str
       && CONST_INT_P (len_rtx)
       && (unsigned HOST_WIDE_INT) INTVAL (len_rtx) <= strlen (src_str) + 1
@@ -3923,6 +3929,7 @@
     method = BLOCK_OP_TAILCALL;
   bool use_mempcpy_call = (targetm.libc_has_fast_function (BUILT_IN_MEMPCPY)
                           && retmode == RETURN_END
+                          && !might_overlap
                           && target != const0_rtx);
   if (use_mempcpy_call)
     method = BLOCK_OP_NO_LIBCALL_RET;
@@ -3929,7 +3936,7 @@
   dest_addr = emit_block_move_hints (dest_mem, src_mem, len_rtx, method,
                                     expected_align, expected_size,
                                     min_size, max_size, probable_max_size,
-                                    use_mempcpy_call, &is_move_done);
+                                    use_mempcpy_call, &is_move_done, might_overlap);

   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -3962,7 +3969,7 @@
                             rtx target, tree orig_exp, memop_ret retmode)
 {
   return expand_builtin_memory_copy_args (dest, src, len, target, orig_exp,
-                                         retmode);
+                                         retmode, false);
 }

 /* Expand into a movstr instruction, if one is available.  Return NULL_RTX if
Index: gcc/expr.c
===================================================================
--- gcc/expr.c  (revision 276131)
+++ gcc/expr.c  (working copy)
@@ -73,9 +73,10 @@
 int cse_not_expected;

 static bool block_move_libcall_safe_for_call_parm (void);
-static bool emit_block_move_via_cpymem (rtx, rtx, rtx, unsigned, unsigned, HOST_WIDE_INT,
-                                       unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT,
-                                       unsigned HOST_WIDE_INT);
+static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
+                                        HOST_WIDE_INT, unsigned HOST_WIDE_INT,
+                                        unsigned HOST_WIDE_INT,
+                                        unsigned HOST_WIDE_INT, bool);
 static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
@@ -1562,7 +1563,8 @@
                       unsigned HOST_WIDE_INT min_size,
                       unsigned HOST_WIDE_INT max_size,
                       unsigned HOST_WIDE_INT probable_max_size,
-                      bool bail_out_libcall, bool *is_move_done)
+                      bool bail_out_libcall, bool *is_move_done,
+                      bool might_overlap)
 {
   int may_use_call;
   rtx retval = 0;
@@ -1622,13 +1624,30 @@
       set_mem_size (y, const_size);
     }

-  if (CONST_INT_P (size) && can_move_by_pieces (INTVAL (size), align))
+  bool pieces_ok = CONST_INT_P (size) && can_move_by_pieces (INTVAL (size), align);
+  bool pattern_ok = false;
+
+  if (!CONST_INT_P (size) || !pieces_ok || might_overlap)
+    {
+      pattern_ok =
+       emit_block_move_via_pattern (x, y, size, align,
+                                    expected_align, expected_size,
+                                    min_size, max_size, probable_max_size,
+                                    might_overlap);
+      if (!pattern_ok && might_overlap)
+       {
+         /* Do not try any of the other methods below as they are not safe
+            for overlapping moves.  */
+         *is_move_done = false;
+         return retval;
+       }
+    }
+
+  if (pattern_ok)
+    ;
+  else if (CONST_INT_P (size) && pieces_ok)
     move_by_pieces (x, y, INTVAL (size), align, RETURN_BEGIN);
-  else if (emit_block_move_via_cpymem (x, y, size, align,
-                                      expected_align, expected_size,
-                                      min_size, max_size, probable_max_size))
-    ;
-  else if (may_use_call
+  else if (may_use_call && !might_overlap
           && ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x))
           && ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (y)))
     {
@@ -1645,7 +1664,8 @@
       retval = emit_block_copy_via_libcall (x, y, size,
                                            method == BLOCK_OP_TAILCALL);
     }
-
+  else if (might_overlap)
+    *is_move_done = false;
   else
     emit_block_move_via_loop (x, y, size, align);

@@ -1721,15 +1741,26 @@
   return true;
 }

-/* A subroutine of emit_block_move.  Expand a cpymem pattern;
-   return true if successful.  */
+/* A subroutine of emit_block_move.  Expand a cpymem or movmem pattern;
+   return true if successful.
+
+   X is the destination of the copy or move.
+   Y is the source of the copy or move.
+   SIZE is the size of the block to be moved.
+
+   MIGHT_OVERLAP indicates this originated with expansion of a
+   builtin_memmove() and the source and destination blocks may
+   overlap.
+  */
+
 static bool
-emit_block_move_via_cpymem (rtx x, rtx y, rtx size, unsigned int align,
-                           unsigned int expected_align, HOST_WIDE_INT expected_size,
-                           unsigned HOST_WIDE_INT min_size,
-                           unsigned HOST_WIDE_INT max_size,
-                           unsigned HOST_WIDE_INT probable_max_size)
+emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
+                            unsigned int expected_align,
+                            HOST_WIDE_INT expected_size,
+                            unsigned HOST_WIDE_INT min_size,
+                            unsigned HOST_WIDE_INT max_size,
+                            unsigned HOST_WIDE_INT probable_max_size,
+                            bool might_overlap)
 {
   if (expected_align < align)
     expected_align = align;
@@ -1752,7 +1783,11 @@
   FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
     {
       scalar_int_mode mode = mode_iter.require ();
-      enum insn_code code = direct_optab_handler (cpymem_optab, mode);
+      enum insn_code code;
+      if (might_overlap)
+       code = direct_optab_handler (movmem_optab, mode);
+      else
+       code = direct_optab_handler (cpymem_optab, mode);

       if (code != CODE_FOR_nothing
          /* We don't need MODE to be narrower than BITS_PER_HOST_WIDE_INT
Index: gcc/expr.h
===================================================================
--- gcc/expr.h  (revision 276131)
+++ gcc/expr.h  (working copy)
@@ -116,7 +116,8 @@
                                  unsigned HOST_WIDE_INT,
                                  unsigned HOST_WIDE_INT,
                                  bool bail_out_libcall = false,
-                                 bool *is_move_done = NULL);
+                                 bool *is_move_done = NULL,
+                                 bool might_overlap = false);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
                                 by_pieces_constfn, void *);
 extern bool emit_storent_insn (rtx to, rtx from);


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain
