ping On Tue, Mar 16, 2021 at 06:13:21PM +0100, Stefan Schulze Frielinghaus wrote: > [snip] > > Please find attached a new version of the patch. A major change compared to > the previous patch is that I created a separate pass which hopefully makes > reviewing also easier since it is almost self-contained. After realizing that > detecting loops which mimic the behavior of rawmemchr/strlen functions does > not > really fit into the topic of loop distribution, I created a separate pass. > Due > to this I was also able to play around a bit and schedule the pass at > different > times. Currently it is scheduled right before loop distribution where loop > header copying already took place which leads to the following effect. > Running > this setup over > > char *t (char *p) > { > for (; *p; ++p); > return p; > } > > the new pass transforms > > char * t (char * p) > { > char _1; > char _7; > > <bb 2> [local count: 118111600]: > _7 = *p_3(D); > if (_7 != 0) > goto <bb 5>; [89.00%] > else > goto <bb 7>; [11.00%] > > <bb 5> [local count: 105119324]: > > <bb 3> [local count: 955630225]: > # p_8 = PHI <p_6(6), p_3(D)(5)> > p_6 = p_8 + 1; > _1 = *p_6; > if (_1 != 0) > goto <bb 6>; [89.00%] > else > goto <bb 8>; [11.00%] > > <bb 8> [local count: 105119324]: > # p_2 = PHI <p_6(3)> > goto <bb 4>; [100.00%] > > <bb 6> [local count: 850510901]: > goto <bb 3>; [100.00%] > > <bb 7> [local count: 12992276]: > > <bb 4> [local count: 118111600]: > # p_9 = PHI <p_2(8), p_3(D)(7)> > return p_9; > > } > > into > > char * t (char * p) > { > char * _5; > char _7; > > <bb 2> [local count: 118111600]: > _7 = *p_3(D); > if (_7 != 0) > goto <bb 5>; [89.00%] > else > goto <bb 4>; [11.00%] > > <bb 5> [local count: 105119324]: > _5 = p_3(D) + 1; > p_10 = .RAWMEMCHR (_5, 0); > > <bb 4> [local count: 118111600]: > # p_9 = PHI <p_10(5), p_3(D)(2)> > return p_9; > > } > > which is fine so far. However, I haven't made up my mind so far whether it is > worthwhile to spend more time in order to also eliminate the "first unrolling" > of the loop. I gave it a shot by scheduling the pass prior pass copy header > and ended up with: > > char * t (char * p) > { > <bb 2> [local count: 118111600]: > p_5 = .RAWMEMCHR (p_3(D), 0); > return p_5; > > } > > which seems optimal to me. The downside of this is that I have to initialize > scalar evolution analysis which might be undesired that early. > > All this brings me to the question where do you see this peace of code > running? > If in a separate pass when would you schedule it? If in an existing pass, > which one would you choose? > > Another topic which came up is whether there exists a more elegant solution to > my current implementation in order to deal with stores (I'm speaking of the > `if > (store_dr)` statement inside of function transform_loop_1). For example, > > extern char *p; > char *t () > { > for (; *p; ++p); > return p; > } > > ends up as > > char * t () > { > char * _1; > char * _2; > char _3; > char * p.1_8; > char _9; > char * p.1_10; > char * p.1_11; > > <bb 2> [local count: 118111600]: > p.1_8 = p; > _9 = *p.1_8; > if (_9 != 0) > goto <bb 5>; [89.00%] > else > goto <bb 7>; [11.00%] > > <bb 5> [local count: 105119324]: > > <bb 3> [local count: 955630225]: > # p.1_10 = PHI <_1(6), p.1_8(5)> > _1 = p.1_10 + 1; > p = _1; > _3 = *_1; > if (_3 != 0) > goto <bb 6>; [89.00%] > else > goto <bb 8>; [11.00%] > > <bb 8> [local count: 105119324]: > # _2 = PHI <_1(3)> > goto <bb 4>; [100.00%] > > <bb 6> [local count: 850510901]: > goto <bb 3>; [100.00%] > > <bb 7> [local count: 12992276]: > > <bb 4> [local count: 118111600]: > # p.1_11 = PHI <_2(8), p.1_8(7)> > return p.1_11; > > } > > where inside the loop a load and store occurs. For a rawmemchr like loop I > have to show that we never load from a memory location to which we write. > Currently I solve this by hard coding those facts which are not generic at > all. > I gave compute_data_dependences_for_loop a try which failed to determine the > fact that stores only happen to p[0] and loads from p[i] where i>0. Maybe > there are more generic solutions to express this in contrast to my current > one? > > Thanks again for your input so far. Really appreciated. > > Cheers, > Stefan
> diff --git a/gcc/Makefile.in b/gcc/Makefile.in > index 8a5fb3fd99c..7b2d7405277 100644 > --- a/gcc/Makefile.in > +++ b/gcc/Makefile.in > @@ -1608,6 +1608,7 @@ OBJS = \ > tree-into-ssa.o \ > tree-iterator.o \ > tree-loop-distribution.o \ > + tree-loop-pattern.o \ > tree-nested.o \ > tree-nrv.o \ > tree-object-size.o \ > diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c > index dd7173126fb..957e96a46a4 100644 > --- a/gcc/internal-fn.c > +++ b/gcc/internal-fn.c > @@ -2917,6 +2917,33 @@ expand_VEC_CONVERT (internal_fn, gcall *) > gcc_unreachable (); > } > > +void > +expand_RAWMEMCHR (internal_fn, gcall *stmt) > +{ > + expand_operand ops[3]; > + > + tree lhs = gimple_call_lhs (stmt); > + if (!lhs) > + return; > + tree lhs_type = TREE_TYPE (lhs); > + rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); > + create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type)); > + > + for (unsigned int i = 0; i < 2; ++i) > + { > + tree rhs = gimple_call_arg (stmt, i); > + tree rhs_type = TREE_TYPE (rhs); > + rtx rhs_rtx = expand_normal (rhs); > + create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type)); > + } > + > + insn_code icode = direct_optab_handler (rawmemchr_optab, ops[2].mode); > + > + expand_insn (icode, 3, ops); > + if (!rtx_equal_p (lhs_rtx, ops[0].value)) > + emit_move_insn (lhs_rtx, ops[0].value); > +} > + > /* Expand the IFN_UNIQUE function according to its first argument. */ > > static void > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > index daeace7a34e..95c76795648 100644 > --- a/gcc/internal-fn.def > +++ b/gcc/internal-fn.def > @@ -348,6 +348,7 @@ DEF_INTERNAL_FN (MUL_OVERFLOW, ECF_CONST | ECF_LEAF | > ECF_NOTHROW, NULL) > DEF_INTERNAL_FN (TSAN_FUNC_EXIT, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) > DEF_INTERNAL_FN (VA_ARG, ECF_NOTHROW | ECF_LEAF, NULL) > DEF_INTERNAL_FN (VEC_CONVERT, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL) > +DEF_INTERNAL_FN (RAWMEMCHR, ECF_PURE | ECF_LEAF | ECF_NOTHROW, NULL) > > /* An unduplicable, uncombinable function. Generally used to preserve > a CFG property in the face of jump threading, tail merging or > diff --git a/gcc/optabs.def b/gcc/optabs.def > index b192a9d070b..f7c69f914ce 100644 > --- a/gcc/optabs.def > +++ b/gcc/optabs.def > @@ -267,6 +267,7 @@ OPTAB_D (cpymem_optab, "cpymem$a") > OPTAB_D (movmem_optab, "movmem$a") > OPTAB_D (setmem_optab, "setmem$a") > OPTAB_D (strlen_optab, "strlen$a") > +OPTAB_D (rawmemchr_optab, "rawmemchr$I$a") > > OPTAB_DC(fma_optab, "fma$a4", FMA) > OPTAB_D (fms_optab, "fms$a4") > diff --git a/gcc/passes.def b/gcc/passes.def > index e9ed3c7bc57..280e8fc0cde 100644 > --- a/gcc/passes.def > +++ b/gcc/passes.def > @@ -274,6 +274,7 @@ along with GCC; see the file COPYING3. If not see > empty loops. Remove them now. */ > NEXT_PASS (pass_cd_dce, false /* update_address_taken_p */); > NEXT_PASS (pass_iv_canon); > + NEXT_PASS (pass_lpat); > NEXT_PASS (pass_loop_distribution); > NEXT_PASS (pass_linterchange); > NEXT_PASS (pass_copy_prop); > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/lpat-rawmemchr-1.c > b/gcc/testsuite/gcc.dg/tree-ssa/lpat-rawmemchr-1.c > new file mode 100644 > index 00000000000..b4133510fca > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/lpat-rawmemchr-1.c > @@ -0,0 +1,72 @@ > +/* { dg-do run { target s390x-*-* } } */ > +/* { dg-options "-O2 -fdump-tree-lpat-details" } */ > +/* { dg-final { scan-tree-dump-times "generated rawmemchrqi" 2 "lpat" { > target s390x-*-* } } } */ > +/* { dg-final { scan-tree-dump-times "generated rawmemchrhi" 2 "lpat" { > target s390x-*-* } } } */ > +/* { dg-final { scan-tree-dump-times "generated rawmemchrsi" 2 "lpat" { > target s390x-*-* } } } */ > + > +/* Rawmemchr pattern: reduction stmt but no store */ > + > +#include <stdint.h> > +#include <assert.h> > + > +typedef __SIZE_TYPE__ size_t; > +extern void* malloc (size_t); > +extern void* memset (void*, int, size_t); > + > +#define test(T, pattern) \ > +__attribute__((noinline)) \ > +T *test_##T (T *p) \ > +{ \ > + while (*p != (T)pattern) \ > + ++p; \ > + return p; \ > +} > + > +test (uint8_t, 0xab) > +test (uint16_t, 0xabcd) > +test (uint32_t, 0xabcdef15) > + > +test (int8_t, 0xab) > +test (int16_t, 0xabcd) > +test (int32_t, 0xabcdef15) > + > +#define run(T, pattern, i) \ > +{ \ > +T *q = p; \ > +q[i] = (T)pattern; \ > +assert (test_##T (p) == &q[i]); \ > +q[i] = 0; \ > +} > + > +int main(void) > +{ > + void *p = malloc (1024); > + assert (p); > + memset (p, 0, 1024); > + > + run (uint8_t, 0xab, 0); > + run (uint8_t, 0xab, 1); > + run (uint8_t, 0xab, 13); > + > + run (uint16_t, 0xabcd, 0); > + run (uint16_t, 0xabcd, 1); > + run (uint16_t, 0xabcd, 13); > + > + run (uint32_t, 0xabcdef15, 0); > + run (uint32_t, 0xabcdef15, 1); > + run (uint32_t, 0xabcdef15, 13); > + > + run (int8_t, 0xab, 0); > + run (int8_t, 0xab, 1); > + run (int8_t, 0xab, 13); > + > + run (int16_t, 0xabcd, 0); > + run (int16_t, 0xabcd, 1); > + run (int16_t, 0xabcd, 13); > + > + run (int32_t, 0xabcdef15, 0); > + run (int32_t, 0xabcdef15, 1); > + run (int32_t, 0xabcdef15, 13); > + > + return 0; > +} > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/lpat-rawmemchr-2.c > b/gcc/testsuite/gcc.dg/tree-ssa/lpat-rawmemchr-2.c > new file mode 100644 > index 00000000000..9bebec11db0 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/lpat-rawmemchr-2.c > @@ -0,0 +1,83 @@ > +/* { dg-do run { target s390x-*-* } } */ > +/* { dg-options "-O2 -fdump-tree-lpat-details" } */ > +/* { dg-final { scan-tree-dump-times "generated rawmemchrqi" 2 "lpat" { > target s390x-*-* } } } */ > +/* { dg-final { scan-tree-dump-times "generated rawmemchrhi" 2 "lpat" { > target s390x-*-* } } } */ > +/* { dg-final { scan-tree-dump-times "generated rawmemchrsi" 2 "lpat" { > target s390x-*-* } } } */ > + > +/* Rawmemchr pattern: reduction stmt and store */ > + > +#include <stdint.h> > +#include <assert.h> > + > +typedef __SIZE_TYPE__ size_t; > +extern void* malloc (size_t); > +extern void* memset (void*, int, size_t); > + > +uint8_t *p_uint8_t; > +uint16_t *p_uint16_t; > +uint32_t *p_uint32_t; > + > +int8_t *p_int8_t; > +int16_t *p_int16_t; > +int32_t *p_int32_t; > + > +#define test(T, pattern) \ > +__attribute__((noinline)) \ > +T *test_##T (void) \ > +{ \ > + while (*p_##T != pattern) \ > + ++p_##T; \ > + return p_##T; \ > +} > + > +test (uint8_t, 0xab) > +test (uint16_t, 0xabcd) > +test (uint32_t, 0xabcdef15) > + > +test (int8_t, (int8_t)0xab) > +test (int16_t, (int16_t)0xabcd) > +test (int32_t, (int32_t)0xabcdef15) > + > +#define run(T, pattern, i) \ > +{ \ > +T *q = p; \ > +q[i] = pattern; \ > +p_##T = p; \ > +T *r = test_##T (); \ > +assert (r == p_##T); \ > +assert (r == &q[i]); \ > +q[i] = 0; \ > +} > + > +int main(void) > +{ > + void *p = malloc (1024); > + assert (p); > + memset (p, '\0', 1024); > + > + run (uint8_t, 0xab, 0); > + run (uint8_t, 0xab, 1); > + run (uint8_t, 0xab, 13); > + > + run (uint16_t, 0xabcd, 0); > + run (uint16_t, 0xabcd, 1); > + run (uint16_t, 0xabcd, 13); > + > + run (uint32_t, 0xabcdef15, 0); > + run (uint32_t, 0xabcdef15, 1); > + run (uint32_t, 0xabcdef15, 13); > + > + run (int8_t, (int8_t)0xab, 0); > + run (int8_t, (int8_t)0xab, 1); > + run (int8_t, (int8_t)0xab, 13); > + > + run (int16_t, (int16_t)0xabcd, 0); > + run (int16_t, (int16_t)0xabcd, 1); > + run (int16_t, (int16_t)0xabcd, 13); > + > + run (int32_t, (int32_t)0xabcdef15, 0); > + run (int32_t, (int32_t)0xabcdef15, 1); > + run (int32_t, (int32_t)0xabcdef15, 13); > + > + return 0; > +} > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/lpat-strlen-1.c > b/gcc/testsuite/gcc.dg/tree-ssa/lpat-strlen-1.c > new file mode 100644 > index 00000000000..b02509c2c8c > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/lpat-strlen-1.c > @@ -0,0 +1,100 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -fdump-tree-lpat-details" } */ > +/* { dg-final { scan-tree-dump-times "generated strlen\n" 4 "lpat" } } */ > +/* { dg-final { scan-tree-dump-times "generated strlen using rawmemchrhi\n" > 4 "lpat" { target s390x-*-* } } } */ > +/* { dg-final { scan-tree-dump-times "generated strlen using rawmemchrsi\n" > 4 "lpat" { target s390x-*-* } } } */ > + > +#include <stdint.h> > +#include <assert.h> > + > +typedef __SIZE_TYPE__ size_t; > +extern void* malloc (size_t); > +extern void* memset (void*, int, size_t); > + > +#define test(T, U) \ > +__attribute__((noinline)) \ > +U test_##T##U (T *s) \ > +{ \ > + U i; \ > + for (i=0; s[i]; ++i); \ > + return i; \ > +} > + > +test (uint8_t, size_t) > +test (uint16_t, size_t) > +test (uint32_t, size_t) > +test (uint8_t, int) > +test (uint16_t, int) > +test (uint32_t, int) > + > +test (int8_t, size_t) > +test (int16_t, size_t) > +test (int32_t, size_t) > +test (int8_t, int) > +test (int16_t, int) > +test (int32_t, int) > + > +#define run(T, U, i) \ > +{ \ > +T *q = p; \ > +q[i] = 0; \ > +assert (test_##T##U (p) == i); \ > +memset (&q[i], 0xf, sizeof (T)); \ > +} > + > +int main(void) > +{ > + void *p = malloc (1024); > + assert (p); > + memset (p, 0xf, 1024); > + > + run (uint8_t, size_t, 0); > + run (uint8_t, size_t, 1); > + run (uint8_t, size_t, 13); > + > + run (int8_t, size_t, 0); > + run (int8_t, size_t, 1); > + run (int8_t, size_t, 13); > + > + run (uint8_t, int, 0); > + run (uint8_t, int, 1); > + run (uint8_t, int, 13); > + > + run (int8_t, int, 0); > + run (int8_t, int, 1); > + run (int8_t, int, 13); > + > + run (uint16_t, size_t, 0); > + run (uint16_t, size_t, 1); > + run (uint16_t, size_t, 13); > + > + run (int16_t, size_t, 0); > + run (int16_t, size_t, 1); > + run (int16_t, size_t, 13); > + > + run (uint16_t, int, 0); > + run (uint16_t, int, 1); > + run (uint16_t, int, 13); > + > + run (int16_t, int, 0); > + run (int16_t, int, 1); > + run (int16_t, int, 13); > + > + run (uint32_t, size_t, 0); > + run (uint32_t, size_t, 1); > + run (uint32_t, size_t, 13); > + > + run (int32_t, size_t, 0); > + run (int32_t, size_t, 1); > + run (int32_t, size_t, 13); > + > + run (uint32_t, int, 0); > + run (uint32_t, int, 1); > + run (uint32_t, int, 13); > + > + run (int32_t, int, 0); > + run (int32_t, int, 1); > + run (int32_t, int, 13); > + > + return 0; > +} > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/lpat-strlen-2.c > b/gcc/testsuite/gcc.dg/tree-ssa/lpat-strlen-2.c > new file mode 100644 > index 00000000000..e71dad8ed2e > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/lpat-strlen-2.c > @@ -0,0 +1,58 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -fdump-tree-lpat-details" } */ > +/* { dg-final { scan-tree-dump-times "generated strlen\n" 3 "lpat" } } */ > + > +#include <assert.h> > + > +typedef __SIZE_TYPE__ size_t; > +extern void* malloc (size_t); > +extern void* memset (void*, int, size_t); > + > +__attribute__((noinline)) > +int test_pos (char *s) > +{ > + int i; > + for (i=42; s[i]; ++i); > + return i; > +} > + > +__attribute__((noinline)) > +int test_neg (char *s) > +{ > + int i; > + for (i=-42; s[i]; ++i); > + return i; > +} > + > +__attribute__((noinline)) > +int test_including_null_char (char *s) > +{ > + int i; > + for (i=1; s[i-1]; ++i); > + return i; > +} > + > +int main(void) > +{ > + void *p = malloc (1024); > + assert (p); > + memset (p, 0xf, 1024); > + char *s = (char *)p + 100; > + > + s[42+13] = 0; > + assert (test_pos (s) == 42+13); > + s[42+13] = 0xf; > + > + s[13] = 0; > + assert (test_neg (s) == 13); > + s[13] = 0xf; > + > + s[-13] = 0; > + assert (test_neg (s) == -13); > + s[-13] = 0xf; > + > + s[13] = 0; > + assert (test_including_null_char (s) == 13+1); > + > + return 0; > +} > diff --git a/gcc/timevar.def b/gcc/timevar.def > index 63c0b3306de..bdefc85fbb4 100644 > --- a/gcc/timevar.def > +++ b/gcc/timevar.def > @@ -307,6 +307,7 @@ DEFTIMEVAR (TV_TREE_UBSAN , "tree ubsan") > DEFTIMEVAR (TV_INITIALIZE_RTL , "initialize rtl") > DEFTIMEVAR (TV_GIMPLE_LADDRESS , "address lowering") > DEFTIMEVAR (TV_TREE_LOOP_IFCVT , "tree loop if-conversion") > +DEFTIMEVAR (TV_LPAT , "tree loop pattern") > > /* Everything else in rest_of_compilation not included above. */ > DEFTIMEVAR (TV_EARLY_LOCAL , "early local passes") > diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c > index 7ee19fc8677..f7aafd0d0dc 100644 > --- a/gcc/tree-loop-distribution.c > +++ b/gcc/tree-loop-distribution.c > @@ -890,7 +890,7 @@ loop_distribution::partition_merge_into (struct graph > *rdg, > /* Returns true when DEF is an SSA_NAME defined in LOOP and used after > the LOOP. */ > > -static bool > +bool > ssa_name_has_uses_outside_loop_p (tree def, loop_p loop) > { > imm_use_iterator imm_iter; > @@ -912,7 +912,7 @@ ssa_name_has_uses_outside_loop_p (tree def, loop_p loop) > /* Returns true when STMT defines a scalar variable used after the > loop LOOP. */ > > -static bool > +bool > stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt) > { > def_operand_p def_p; > @@ -1234,7 +1234,7 @@ generate_memcpy_builtin (class loop *loop, partition > *partition) > > /* Remove and destroy the loop LOOP. */ > > -static void > +void > destroy_loop (class loop *loop) > { > unsigned nbbs = loop->num_nodes; > diff --git a/gcc/tree-loop-pattern.c b/gcc/tree-loop-pattern.c > new file mode 100644 > index 00000000000..a9c984d5e53 > --- /dev/null > +++ b/gcc/tree-loop-pattern.c > @@ -0,0 +1,588 @@ > +#include "config.h" > +#include "system.h" > +#include "coretypes.h" > +#include "backend.h" > +#include "intl.h" > +#include "tree.h" > +#include "gimple.h" > +#include "cfghooks.h" > +#include "tree-pass.h" > +#include "ssa.h" > +#include "fold-const.h" > +#include "gimple-iterator.h" > +#include "gimplify-me.h" > +#include "tree-cfg.h" > +#include "tree-ssa.h" > +#include "tree-ssanames.h" > +#include "tree-ssa-loop.h" > +#include "tree-ssa-loop-manip.h" > +#include "tree-into-ssa.h" > +#include "cfgloop.h" > +#include "tree-scalar-evolution.h" > +#include "tree-vectorizer.h" > +#include "tree-eh.h" > +#include "gimple-fold.h" > +#include "rtl.h" > +#include "memmodel.h" > +#include "insn-codes.h" > +#include "optabs.h" > + > +/* This pass detects loops which mimic the effects of builtins and replaces > them > + accordingly. For example, a loop of the form > + > + for (; *p != 42; ++p); > + > + is replaced by > + > + p = rawmemchr (p, 42); > + > + under the assumption that rawmemchr is available for a particular mode. > + Another example is > + > + int i; > + for (i = 42; s[i]; ++i); > + > + which is replaced by > + > + i = (int)strlen (&s[42]) + 42; > + > + for some character array S. In case array S is not of a character array > + type, we end up with > + > + i = (int)(rawmemchr (&s[42], 0) - &s[42]) + 42; > + > + assuming that rawmemchr is available for a particular mode. Note, > detecting > + strlen like loops also depends on whether the type for the resulting > length > + is compatible with size type or overflow is undefined. */ > + > +/* TODO Quick and dirty imports from tree-loop-distribution pass. */ > +void destroy_loop (class loop *loop); > +bool stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt); > +bool ssa_name_has_uses_outside_loop_p (tree def, loop_p loop); > + > +static void > +generate_rawmemchr_builtin (loop_p loop, tree reduction_var, > + data_reference_p store_dr, tree base, tree pattern, > + location_t loc) > +{ > + gcc_checking_assert (POINTER_TYPE_P (TREE_TYPE (base)) > + && TREE_TYPE (TREE_TYPE (base)) == TREE_TYPE (pattern)); > + gcc_checking_assert (TREE_TYPE (reduction_var) == TREE_TYPE (base)); > + > + /* The new statements will be placed before LOOP. */ > + gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src); > + > + tree mem = force_gimple_operand_gsi (&gsi, base, true, NULL_TREE, false, > + GSI_CONTINUE_LINKING); > + gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, mem, > pattern); > + tree reduction_var_new = copy_ssa_name (reduction_var); > + gimple_call_set_lhs (fn_call, reduction_var_new); > + gimple_set_location (fn_call, loc); > + gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING); > + > + if (store_dr) > + { > + gassign *g = gimple_build_assign (DR_REF (store_dr), > reduction_var_new); > + gsi_insert_after (&gsi, g, GSI_CONTINUE_LINKING); > + } > + > + imm_use_iterator iter; > + gimple *stmt; > + use_operand_p use_p; > + FOR_EACH_IMM_USE_STMT (stmt, iter, reduction_var) > + { > + FOR_EACH_IMM_USE_ON_STMT (use_p, iter) > + SET_USE (use_p, reduction_var_new); > + > + update_stmt (stmt); > + } > + > + fold_stmt (&gsi); > + > + if (dump_file && (dump_flags & TDF_DETAILS)) > + switch (TYPE_MODE (TREE_TYPE (pattern))) > + { > + case QImode: > + fprintf (dump_file, "generated rawmemchrqi\n"); > + break; > + > + case HImode: > + fprintf (dump_file, "generated rawmemchrhi\n"); > + break; > + > + case SImode: > + fprintf (dump_file, "generated rawmemchrsi\n"); > + break; > + > + default: > + gcc_unreachable (); > + } > +} > + > +static void > +generate_strlen_builtin (loop_p loop, tree reduction_var, tree base, > + tree start_len, location_t loc) > +{ > + gcc_checking_assert (POINTER_TYPE_P (TREE_TYPE (base))); > + gcc_checking_assert (TREE_TYPE (reduction_var) == TREE_TYPE (start_len)); > + > + /* The new statements will be placed before LOOP. */ > + gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src); > + > + tree reduction_var_new = make_ssa_name (size_type_node); > + > + tree mem = force_gimple_operand_gsi (&gsi, base, true, NULL_TREE, false, > + GSI_CONTINUE_LINKING); > + tree fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_STRLEN)); > + gimple *fn_call = gimple_build_call (fn, 1, mem); > + gimple_call_set_lhs (fn_call, reduction_var_new); > + gimple_set_location (fn_call, loc); > + gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING); > + > + /* In case reduction type is not compatible with size type, then > + conversion is sound even in case an overflow occurs since we previously > + ensured that for reduction type an overflow is undefined. */ > + tree convert = fold_convert (TREE_TYPE (reduction_var), reduction_var_new); > + reduction_var_new = force_gimple_operand_gsi (&gsi, convert, true, > NULL_TREE, > + false, GSI_CONTINUE_LINKING); > + > + /* Loops of the form `for (i=42; s[i]; ++i);` have an additional start > + length. */ > + if (!integer_zerop (start_len)) > + { > + tree fn_result = reduction_var_new; > + reduction_var_new = make_ssa_name (TREE_TYPE (reduction_var)); > + gimple *add_stmt = gimple_build_assign (reduction_var_new, PLUS_EXPR, > + fn_result, start_len); > + gsi_insert_after (&gsi, add_stmt, GSI_CONTINUE_LINKING); > + } > + > + imm_use_iterator iter; > + gimple *stmt; > + use_operand_p use_p; > + FOR_EACH_IMM_USE_STMT (stmt, iter, reduction_var) > + { > + FOR_EACH_IMM_USE_ON_STMT (use_p, iter) > + SET_USE (use_p, reduction_var_new); > + > + update_stmt (stmt); > + } > + > + fold_stmt (&gsi); > + > + if (dump_file && (dump_flags & TDF_DETAILS)) > + fprintf (dump_file, "generated strlen\n"); > +} > + > +static void > +generate_strlen_builtin_using_rawmemchr (loop_p loop, tree reduction_var, > + tree base, tree start_len, > + location_t loc) > +{ > + gcc_checking_assert (POINTER_TYPE_P (TREE_TYPE (base))); > + gcc_checking_assert (TREE_TYPE (reduction_var) == TREE_TYPE (start_len)); > + > + /* The new statements will be placed before LOOP. */ > + gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src); > + > + tree mem = force_gimple_operand_gsi (&gsi, base, true, NULL_TREE, false, > + GSI_CONTINUE_LINKING); > + tree zero = build_zero_cst (TREE_TYPE (TREE_TYPE (mem))); > + gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, mem, zero); > + tree end = make_ssa_name (TREE_TYPE (base)); > + gimple_call_set_lhs (fn_call, end); > + gimple_set_location (fn_call, loc); > + gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING); > + > + tree diff = make_ssa_name (ptrdiff_type_node); > + gimple *diff_stmt = gimple_build_assign (diff, POINTER_DIFF_EXPR, end, > mem); > + gsi_insert_after (&gsi, diff_stmt, GSI_CONTINUE_LINKING); > + > + tree convert = fold_convert (ptrdiff_type_node, > + TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (mem)))); > + tree size = force_gimple_operand_gsi (&gsi, convert, true, NULL_TREE, > false, > + GSI_CONTINUE_LINKING); > + > + tree count = make_ssa_name (ptrdiff_type_node); > + gimple *count_stmt = gimple_build_assign (count, TRUNC_DIV_EXPR, diff, > size); > + gsi_insert_after (&gsi, count_stmt, GSI_CONTINUE_LINKING); > + > + convert = fold_convert (TREE_TYPE (reduction_var), count); > + tree reduction_var_new = force_gimple_operand_gsi (&gsi, convert, true, > + NULL_TREE, false, > + GSI_CONTINUE_LINKING); > + > + /* Loops of the form `for (i=42; s[i]; ++i);` have an additional start > + length. */ > + if (!integer_zerop (start_len)) > + { > + tree lhs = make_ssa_name (TREE_TYPE (reduction_var_new)); > + gimple *g = gimple_build_assign (lhs, PLUS_EXPR, reduction_var_new, > + start_len); > + gsi_insert_after (&gsi, g, GSI_CONTINUE_LINKING); > + reduction_var_new = lhs; > + } > + > + imm_use_iterator iter; > + gimple *stmt; > + use_operand_p use_p; > + FOR_EACH_IMM_USE_STMT (stmt, iter, reduction_var) > + { > + FOR_EACH_IMM_USE_ON_STMT (use_p, iter) > + SET_USE (use_p, reduction_var_new); > + > + update_stmt (stmt); > + } > + > + fold_stmt (&gsi); > + > + if (dump_file && (dump_flags & TDF_DETAILS)) > + switch (TYPE_MODE (TREE_TYPE (zero))) > + { > + case HImode: > + fprintf (dump_file, "generated strlen using rawmemchrhi\n"); > + break; > + > + case SImode: > + fprintf (dump_file, "generated strlen using rawmemchrsi\n"); > + break; > + > + default: > + gcc_unreachable (); > + } > +} > + > +static bool > +transform_loop_1 (loop_p loop, > + data_reference_p load_dr, > + data_reference_p store_dr, > + tree reduction_var) > +{ > + tree load_ref = DR_REF (load_dr); > + tree load_type = TREE_TYPE (load_ref); > + tree load_access_base = build_fold_addr_expr (load_ref); > + tree load_access_size = TYPE_SIZE_UNIT (load_type); > + affine_iv load_iv, reduction_iv; > + tree pattern; > + > + /* A limitation of the current implementation is that we only support > + constant patterns. */ > + edge e = single_exit (loop); > + gcond *cond_stmt = safe_dyn_cast <gcond *> (last_stmt (e->src)); > + if (!cond_stmt) > + return false; > + pattern = gimple_cond_rhs (cond_stmt); > + if (gimple_cond_code (cond_stmt) != NE_EXPR > + || gimple_cond_lhs (cond_stmt) != gimple_assign_lhs (DR_STMT (load_dr)) > + || TREE_CODE (pattern) != INTEGER_CST) > + return false; > + > + /* Bail out if no affine induction variable with constant step can be > + determined. */ > + if (!simple_iv (loop, loop, load_access_base, &load_iv, false)) > + return false; > + > + /* Bail out if memory accesses are not consecutive or not growing. */ > + if (!operand_equal_p (load_iv.step, load_access_size, 0)) > + return false; > + > + if (!INTEGRAL_TYPE_P (load_type) > + || !type_has_mode_precision_p (load_type)) > + return false; > + > + if (!simple_iv (loop, loop, reduction_var, &reduction_iv, false)) > + return false; > + > + /* Handle rawmemchr like loops. */ > + if (operand_equal_p (load_iv.base, reduction_iv.base) > + && operand_equal_p (load_iv.step, reduction_iv.step)) > + { > + if (store_dr) > + { > + /* Ensure that we store to X and load from X+I where I>0. */ > + if (TREE_CODE (load_iv.base) != POINTER_PLUS_EXPR > + || !integer_onep (TREE_OPERAND (load_iv.base, 1))) > + return false; > + tree ptr_base = TREE_OPERAND (load_iv.base, 0); > + if (TREE_CODE (ptr_base) != SSA_NAME) > + return false; > + gimple *def = SSA_NAME_DEF_STMT (ptr_base); > + if (!gimple_assign_single_p (def) > + || gimple_assign_rhs1 (def) != DR_REF (store_dr)) > + return false; > + /* Ensure that the reduction value is stored. */ > + if (gimple_assign_rhs1 (DR_STMT (store_dr)) != reduction_var) > + return false; > + } > + /* Bail out if target does not provide rawmemchr for a certain mode. > */ > + machine_mode mode = TYPE_MODE (load_type); > + if (direct_optab_handler (rawmemchr_optab, mode) == CODE_FOR_nothing) > + return false; > + location_t loc = gimple_location (DR_STMT (load_dr)); > + generate_rawmemchr_builtin (loop, reduction_var, store_dr, > load_iv.base, > + pattern, loc); > + return true; > + } > + > + /* Handle strlen like loops. */ > + if (store_dr == NULL > + && integer_zerop (pattern) > + && TREE_CODE (reduction_iv.base) == INTEGER_CST > + && TREE_CODE (reduction_iv.step) == INTEGER_CST > + && integer_onep (reduction_iv.step) > + && (types_compatible_p (TREE_TYPE (reduction_var), size_type_node) > + || TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (reduction_var)))) > + { > + location_t loc = gimple_location (DR_STMT (load_dr)); > + if (TYPE_MODE (load_type) == TYPE_MODE (char_type_node) > + && TYPE_PRECISION (load_type) == TYPE_PRECISION (char_type_node)) > + generate_strlen_builtin (loop, reduction_var, load_iv.base, > + reduction_iv.base, loc); > + else if (direct_optab_handler (rawmemchr_optab, TYPE_MODE (load_type)) > + != CODE_FOR_nothing) > + generate_strlen_builtin_using_rawmemchr (loop, reduction_var, > + load_iv.base, > + reduction_iv.base, loc); > + else > + return false; > + if (TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (reduction_var))) > + { > + const char *msg = G_("assuming signed overflow does not occur " > + "when optimizing strlen like loop"); > + fold_overflow_warning (msg, WARN_STRICT_OVERFLOW_MISC); > + } > + return true; > + } > + > + return false; > +} > + > +static bool > +transform_loop (loop_p loop) > +{ > + gimple *reduction_stmt = NULL; > + data_reference_p load_dr = NULL, store_dr = NULL; > + > + basic_block *bbs = get_loop_body (loop); > + > + for (unsigned i = 0, ninsns = 0; i < loop->num_nodes; ++i) > + { > + basic_block bb = bbs[i]; > + > + for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi); > + gsi_next (&bsi), ++ninsns) > + { > + /* Bail out early for loops which are unlikely to match. */ > + if (ninsns > 16) > + return false; > + gphi *phi = bsi.phi (); > + if (gimple_has_volatile_ops (phi)) > + return false; > + if (gimple_clobber_p (phi)) > + continue; > + if (virtual_operand_p (gimple_phi_result (phi))) > + continue; > + if (stmt_has_scalar_dependences_outside_loop (loop, phi)) > + { > + if (reduction_stmt) > + return false; > + reduction_stmt = phi; > + } > + } > + > + for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); > + gsi_next (&bsi), ++ninsns) > + { > + /* Bail out early for loops which are unlikely to match. */ > + if (ninsns > 16) > + return false; > + > + gimple *stmt = gsi_stmt (bsi); > + > + if (gimple_clobber_p (stmt)) > + continue; > + > + if (gimple_code (stmt) == GIMPLE_LABEL || is_gimple_debug (stmt)) > + continue; > + > + if (gimple_has_volatile_ops (stmt)) > + return false; > + > + if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) > + { > + if (reduction_stmt) > + return false; > + reduction_stmt = stmt; > + } > + > + /* Any scalar stmts are ok. */ > + if (!gimple_vuse (stmt)) > + continue; > + > + /* Otherwise just regular loads/stores. */ > + if (!gimple_assign_single_p (stmt)) > + return false; > + > + auto_vec<data_reference_p, 2> dr_vec; > + if (!find_data_references_in_stmt (loop, stmt, &dr_vec)) > + return false; > + data_reference_p dr; > + unsigned j; > + FOR_EACH_VEC_ELT (dr_vec, j, dr) > + { > + tree type = TREE_TYPE (DR_REF (dr)); > + if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type))) > + return false; > + if (DR_IS_READ (dr)) > + { > + if (load_dr != NULL) > + return false; > + load_dr = dr; > + } > + else > + { > + if (store_dr != NULL) > + return false; > + store_dr = dr; > + } > + } > + } > + } > + > + /* A limitation of the current implementation is that we require a > reduction > + statement which does not occur in cases like > + extern int *p; > + void foo (void) { for (; *p; ++p); } */ > + if (load_dr == NULL || reduction_stmt == NULL) > + return false; > + > + /* Note, reduction variables are guaranteed to be SSA names. */ > + tree reduction_var; > + switch (gimple_code (reduction_stmt)) > + { > + case GIMPLE_PHI: > + reduction_var = gimple_phi_result (reduction_stmt); > + break; > + case GIMPLE_ASSIGN: > + reduction_var = gimple_assign_lhs (reduction_stmt); > + break; > + default: > + /* Bail out e.g. for GIMPLE_CALL. */ > + return false; > + } > + if (reduction_var == NULL) > + return false; > + > + /* Bail out if this is a bitfield memory reference. */ > + if (TREE_CODE (DR_REF (load_dr)) == COMPONENT_REF > + && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (load_dr), 1))) > + return false; > + > + /* Data reference must be executed exactly once per iteration of each > + loop in the loop nest. We only need to check dominance information > + against the outermost one in a perfect loop nest because a bb can't > + dominate outermost loop's latch without dominating inner loop's. */ > + basic_block load_bb = gimple_bb (DR_STMT (load_dr)); > + if (!dominated_by_p (CDI_DOMINATORS, loop->latch, load_bb)) > + return false; > + > + if (store_dr) > + { > + /* Bail out if this is a bitfield memory reference. */ > + if (TREE_CODE (DR_REF (store_dr)) == COMPONENT_REF > + && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (store_dr), 1))) > + return false; > + > + /* Data reference must be executed exactly once per iteration of each > + loop in the loop nest. We only need to check dominance information > + against the outermost one in a perfect loop nest because a bb can't > + dominate outermost loop's latch without dominating inner loop's. */ > + basic_block store_bb = gimple_bb (DR_STMT (store_dr)); > + if (!dominated_by_p (CDI_DOMINATORS, loop->latch, store_bb)) > + return false; > + > + /* Load and store must be in the same loop nest. */ > + if (store_bb->loop_father != load_bb->loop_father) > + return false; > + > + edge e = single_exit (store_bb->loop_father); > + if (!e) > + return false; > + bool load_dom = dominated_by_p (CDI_DOMINATORS, e->src, load_bb); > + bool store_dom = dominated_by_p (CDI_DOMINATORS, e->src, store_bb); > + if (load_dom != store_dom) > + return false; > + } > + > + return transform_loop_1 (loop, load_dr, store_dr, reduction_var); > +} > + > +namespace { > + > +const pass_data pass_data_lpat = > +{ > + GIMPLE_PASS, /* type */ > + "lpat", /* name */ > + OPTGROUP_LOOP, /* optinfo_flags */ > + TV_LPAT, /* tv_id */ > + ( PROP_cfg | PROP_ssa ), /* properties_required */ > + 0, /* properties_provided */ > + 0, /* properties_destroyed */ > + 0, /* todo_flags_start */ > + 0, /* todo_flags_finish */ > +}; > + > +class pass_lpat : public gimple_opt_pass > +{ > +public: > + pass_lpat (gcc::context *ctxt) > + : gimple_opt_pass (pass_data_lpat, ctxt) > + {} > + > + bool > + gate (function *) OVERRIDE > + { > + return optimize != 0; > + } > + > + unsigned int > + execute (function *f) OVERRIDE > + { > + loop_p loop; > + auto_vec<loop_p> loops_to_be_destroyed; > + > + FOR_EACH_LOOP_FN (f, loop, LI_ONLY_INNERMOST) > + { > + if (!single_exit (loop) > + || (!flag_tree_loop_distribute_patterns // TODO > + && !optimize_loop_for_speed_p (loop))) > + continue; > + > + if (transform_loop (loop)) > + loops_to_be_destroyed.safe_push (loop); > + } > + > + if (loops_to_be_destroyed.length () > 0) > + { > + unsigned i; > + FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop) > + destroy_loop (loop); > + > + scev_reset_htab (); > + mark_virtual_operands_for_renaming (f); > + rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa); > + > + return TODO_cleanup_cfg; > + } > + else > + return 0; > + } > +}; // class pass_lpat > + > +} // anon namespace > + > +gimple_opt_pass * > +make_pass_lpat (gcc::context *ctxt) > +{ > + return new pass_lpat (ctxt); > +} > diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h > index 15693fee150..2d71a12039e 100644 > --- a/gcc/tree-pass.h > +++ b/gcc/tree-pass.h > @@ -380,6 +380,7 @@ extern gimple_opt_pass *make_pass_graphite (gcc::context > *ctxt); > extern gimple_opt_pass *make_pass_graphite_transforms (gcc::context *ctxt); > extern gimple_opt_pass *make_pass_if_conversion (gcc::context *ctxt); > extern gimple_opt_pass *make_pass_if_to_switch (gcc::context *ctxt); > +extern gimple_opt_pass *make_pass_lpat (gcc::context *ctxt); > extern gimple_opt_pass *make_pass_loop_distribution (gcc::context *ctxt); > extern gimple_opt_pass *make_pass_vectorize (gcc::context *ctxt); > extern gimple_opt_pass *make_pass_simduid_cleanup (gcc::context *ctxt);