On Sun, May 17, 2020 at 7:06 PM H.J. Lu <hjl.to...@gmail.com> wrote: > > Duplicate the cmpstrn pattern for cmpmem. The only difference is that > the length argument of cmpmem is guaranteed to be less than or equal to > lengths of 2 memory areas. Since "repz cmpsb" can be much slower than > memcmp function implemented with vector instruction, see > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052 > > expand cmpmem to "repz cmpsb" only with -mgeneral-regs-only.
If there is no benefit compared to the library implementation, then enable these patterns only when -minline-all-stringops is used. Eventually these should be reimplemented with SSE4 string instructions. Honza is the author of the block handling x86 system, I'll leave the review to him. Uros. > Tested on Linux/x86 and Linux/x86-64. OK for master? > > Thanks. > > H.J. > --- > gcc/ > > PR target/95151 > * config/i386/i386-expand.c (ix86_expand_cmpstrn_or_cmpmem): New > function. Duplicated from the cmpstrn pattern. Expand memcmp > to "repz cmpsb" only with -mgeneral-regs-only. > * config/i386/i386-protos.h (ix86_expand_cmpstrn_or_cmpmem): New > prototype. > * config/i386/i386.md (cmpmemsi): New pattern. > (cmpstrnsi): Use ix86_expand_cmpstrn_or_cmpmem. > > gcc/testsuite/ > > PR target/95151 > * gcc.target/i386/pr95151-1.c: New test. > * gcc.target/i386/pr95151-2.c: Likewise. > * gcc.target/i386/pr95151-3.c: Likewise. > * gcc.target/i386/pr95151-4.c: Likewise. > --- > gcc/config/i386/i386-expand.c | 80 +++++++++++++++++++++++ > gcc/config/i386/i386-protos.h | 1 + > gcc/config/i386/i386.md | 80 ++++++----------------- > gcc/testsuite/gcc.target/i386/pr95151-1.c | 18 +++++ > gcc/testsuite/gcc.target/i386/pr95151-2.c | 11 ++++ > gcc/testsuite/gcc.target/i386/pr95151-3.c | 18 +++++ > gcc/testsuite/gcc.target/i386/pr95151-4.c | 11 ++++ > 7 files changed, 160 insertions(+), 59 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr95151-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr95151-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr95151-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr95151-4.c > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c > index 26531585c5f..05d3c33720c 100644 > --- a/gcc/config/i386/i386-expand.c > +++ b/gcc/config/i386/i386-expand.c > @@ -7651,6 +7651,86 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx > count_exp, rtx val_exp, > return true; > } > > +/* Expand cmpstrn or memcmp. */ > + > +bool > +ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2, > + rtx length, rtx align, bool is_cmpstrn) > +{ > + if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS) > + return false; > + > + /* Can't use this if the user has appropriated ecx, esi or edi. */ > + if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG]) > + return false; > + > + if (is_cmpstrn) > + { > + /* For strncmp, length is the maximum length, which can be larger > + than actual string lengths. We can expand the cmpstrn pattern > + to "repz cmpsb" only if one of the strings is a constant so > + that expand_builtin_strncmp() can write the length argument to > + be the minimum of the const string length and the actual length > + argument. Otherwise, "repz cmpsb" may pass the 0 byte. */ > + tree t1 = MEM_EXPR (src1); > + tree t2 = MEM_EXPR (src2); > + if (!((t1 && TREE_CODE (t1) == MEM_REF > + && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR > + && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0)) > + == STRING_CST)) > + || (t2 && TREE_CODE (t2) == MEM_REF > + && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR > + && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0)) > + == STRING_CST)))) > + return false; > + } > + else > + { > + /* Expand memcmp to "repz cmpsb" only for -mgeneral-regs-only > + since "repz cmpsb" can be much slower than memcmp function > + implemented with vector instructions, see > + > + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052 > + */ > + if (!TARGET_GENERAL_REGS_ONLY) > + return false; > + } > + > + rtx addr1 = copy_addr_to_reg (XEXP (src1, 0)); > + rtx addr2 = copy_addr_to_reg (XEXP (src2, 0)); > + if (addr1 != XEXP (src1, 0)) > + src1 = replace_equiv_address_nv (src1, addr1); > + if (addr2 != XEXP (src2, 0)) > + src2 = replace_equiv_address_nv (src2, addr2); > + > + rtx lengthreg = ix86_zero_extend_to_Pmode (length); > + > + /* If we are testing strict equality, we can use known alignment to > + good advantage. This may be possible with combine, particularly > + once cc0 is dead. */ > + if (CONST_INT_P (length)) > + { > + if (length == const0_rtx) > + { > + emit_move_insn (result, const0_rtx); > + return true; > + } > + emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align, > + src1, src2)); > + } > + else > + { > + emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg)); > + emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align, > + src1, src2)); > + } > + > + rtx out = gen_lowpart (QImode, result); > + emit_insn (gen_cmpintqi (out)); > + emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out)); > + > + return true; > +} > > /* Expand the appropriate insns for doing strlen if not just doing > repnz; scasb > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > index 39fcaa0ad5f..238aa650b61 100644 > --- a/gcc/config/i386/i386-protos.h > +++ b/gcc/config/i386/i386-protos.h > @@ -71,6 +71,7 @@ extern int avx_vperm2f128_parallel (rtx par, machine_mode > mode); > extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx); > extern bool ix86_expand_set_or_cpymem (rtx, rtx, rtx, rtx, rtx, rtx, > rtx, rtx, rtx, rtx, bool); > +extern bool ix86_expand_cmpstrn_or_cmpmem (rtx, rtx, rtx, rtx, rtx, bool); > > extern bool constant_address_p (rtx); > extern bool legitimate_pic_operand_p (rtx); > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index 1bf0c1a7f01..7fb97db1e6e 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -17761,6 +17761,22 @@ (define_insn "*rep_stosqi" > (const_string "*"))) > (set_attr "mode" "QI")]) > > +(define_expand "cmpmemsi" > + [(set (match_operand:SI 0 "register_operand" "") > + (compare:SI (match_operand:BLK 1 "memory_operand" "") > + (match_operand:BLK 2 "memory_operand" "") ) ) > + (use (match_operand 3 "general_operand")) > + (use (match_operand 4 "immediate_operand"))] > + "" > +{ > + if (ix86_expand_cmpstrn_or_cmpmem (operands[0], operands[1], > + operands[2], operands[3], > + operands[4], false)) > + DONE; > + else > + FAIL; > +}) > + > (define_expand "cmpstrnsi" > [(set (match_operand:SI 0 "register_operand") > (compare:SI (match_operand:BLK 1 "general_operand") > @@ -17769,66 +17785,12 @@ (define_expand "cmpstrnsi" > (use (match_operand 4 "immediate_operand"))] > "" > { > - rtx addr1, addr2, countreg, align, out; > - > - if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS) > - FAIL; > - > - /* Can't use this if the user has appropriated ecx, esi or edi. */ > - if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG]) > - FAIL; > - > - /* One of the strings must be a constant. If so, expand_builtin_strncmp() > - will have rewritten the length arg to be the minimum of the const string > - length and the actual length arg. If both strings are the same and > - shorter than the length arg, repz cmpsb will not stop at the 0 byte and > - will incorrectly base the results on chars past the 0 byte. */ > - tree t1 = MEM_EXPR (operands[1]); > - tree t2 = MEM_EXPR (operands[2]); > - if (!((t1 && TREE_CODE (t1) == MEM_REF > - && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR > - && TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0)) == STRING_CST) > - || (t2 && TREE_CODE (t2) == MEM_REF > - && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR > - && TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0)) == > STRING_CST))) > - FAIL; > - > - addr1 = copy_addr_to_reg (XEXP (operands[1], 0)); > - addr2 = copy_addr_to_reg (XEXP (operands[2], 0)); > - if (addr1 != XEXP (operands[1], 0)) > - operands[1] = replace_equiv_address_nv (operands[1], addr1); > - if (addr2 != XEXP (operands[2], 0)) > - operands[2] = replace_equiv_address_nv (operands[2], addr2); > - > - countreg = ix86_zero_extend_to_Pmode (operands[3]); > - > - /* %%% Iff we are testing strict equality, we can use known alignment > - to good advantage. This may be possible with combine, particularly > - once cc0 is dead. */ > - align = operands[4]; > - > - if (CONST_INT_P (operands[3])) > - { > - if (operands[3] == const0_rtx) > - { > - emit_move_insn (operands[0], const0_rtx); > - DONE; > - } > - emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, countreg, align, > - operands[1], operands[2])); > - } > + if (ix86_expand_cmpstrn_or_cmpmem (operands[0], operands[1], > + operands[2], operands[3], > + operands[4], true)) > + DONE; > else > - { > - emit_insn (gen_cmp_1 (Pmode, countreg, countreg)); > - emit_insn (gen_cmpstrnqi_1 (addr1, addr2, countreg, align, > - operands[1], operands[2])); > - } > - > - out = gen_lowpart (QImode, operands[0]); > - emit_insn (gen_cmpintqi (out)); > - emit_move_insn (operands[0], gen_rtx_SIGN_EXTEND (SImode, out)); > - > - DONE; > + FAIL; > }) > > ;; Produce a tri-state integer (-1, 0, 1) from condition codes. > diff --git a/gcc/testsuite/gcc.target/i386/pr95151-1.c > b/gcc/testsuite/gcc.target/i386/pr95151-1.c > new file mode 100644 > index 00000000000..f7ed88e508f > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr95151-1.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mgeneral-regs-only -minline-all-stringops" } */ > + > +struct foo > +{ > + char array[257]; > +}; > + > +extern struct foo x; > + > +int > +func (struct foo i) > +{ > + return __builtin_memcmp (&x, &i, sizeof (x)) ? 1 : 2; > +} > + > +/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcmp" } } */ > +/* { dg-final { scan-assembler "repz cmpsb" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr95151-2.c > b/gcc/testsuite/gcc.target/i386/pr95151-2.c > new file mode 100644 > index 00000000000..afd811ae7fb > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr95151-2.c > @@ -0,0 +1,11 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mgeneral-regs-only -minline-all-stringops" } */ > + > +int > +func (void *d, void *s, unsigned int l) > +{ > + return __builtin_memcmp (d, s, l) ? 1 : 2; > +} > + > +/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcmp" } } */ > +/* { dg-final { scan-assembler "repz cmpsb" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr95151-3.c > b/gcc/testsuite/gcc.target/i386/pr95151-3.c > new file mode 100644 > index 00000000000..bd62db5e640 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr95151-3.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse2 -minline-all-stringops" } */ > + > +struct foo > +{ > + char array[257]; > +}; > + > +extern struct foo x; > + > +int > +func (struct foo i) > +{ > + return __builtin_memcmp (&x, &i, sizeof (x)) ? 1 : 2; > +} > + > +/* { dg-final { scan-assembler "call\[\\t \]*_?memcmp" } } */ > +/* { dg-final { scan-assembler-not "repz cmpsb" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr95151-4.c > b/gcc/testsuite/gcc.target/i386/pr95151-4.c > new file mode 100644 > index 00000000000..b8258f2530a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr95151-4.c > @@ -0,0 +1,11 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -msse2 -minline-all-stringops" } */ > + > +int > +func (void *d, void *s, unsigned int l) > +{ > + return __builtin_memcmp (d, s, l) ? 1 : 2; > +} > + > +/* { dg-final { scan-assembler "call\[\\t \]*_?memcmp" } } */ > +/* { dg-final { scan-assembler-not "repz cmpsb" } } */ > -- > 2.26.2 >