Re: [PATCH] x86: Add -mavoid-libcall for -mgeneral-regs-only

2020-05-20 Thread Jeff Law via Gcc-patches
On Fri, 2020-05-15 at 08:11 +0200, Uros Bizjak via Gcc-patches wrote:
> On Fri, May 15, 2020 at 1:13 AM H.J. Lu  wrote:
> > The -mgeneral-regs-only option generates code that uses only the
> > general-purpose registers.  It prevents the compiler from using vector
> > registers.  But GCC may still generate calls to memcpy, memmove, memset
> > and memcmp library functions.  In the GNU C library, these library
> > functions are implemented with vector registers, which makes the
> > -mgeneral-regs-only option less effective.  The new -mavoid-libcall
> > option expands memcpy, memmove and memset into REP MOVSB and REP STOSB
> > sequences.  This option can be further enhanced with a cmpmem pattern
> > to expand memcmp into a REP CMPSB sequence in the future.
> > 
> > Tested on Linux/x86 and Linux/x86-64.  OK for master?
> 
> No. Library should provide functions that are appropriate for your
> target. There are probably other places in the library that use XMM
> registers, so there is no point working around only some specific
> functions.
Couldn't one make the argument that we should be using -ffreestanding and that
-ffreestanding should be emitting the necessary inline code rather than calling
out to memcpy or whatever it's doing?

jeff
> 



Re: [PATCH] x86: Add -mavoid-libcall for -mgeneral-regs-only

2020-05-15 Thread Richard Biener via Gcc-patches
On Fri, May 15, 2020 at 8:27 AM Uros Bizjak via Gcc-patches
 wrote:
>
> On Fri, May 15, 2020 at 1:13 AM H.J. Lu  wrote:
> >
> > The -mgeneral-regs-only option generates code that uses only the
> > general-purpose registers.  It prevents the compiler from using vector
> > registers.  But GCC may still generate calls to memcpy, memmove, memset
> > and memcmp library functions.  In the GNU C library, these library
> > functions are implemented with vector registers, which makes the
> > -mgeneral-regs-only option less effective.  The new -mavoid-libcall
> > option expands memcpy, memmove and memset into REP MOVSB and REP STOSB
> > sequences.  This option can be further enhanced with a cmpmem pattern
> > to expand memcmp into a REP CMPSB sequence in the future.
> >
> > Tested on Linux/x86 and Linux/x86-64.  OK for master?
>
> No. Library should provide functions that are appropriate for your
> target. There are probably other places in the library that use XMM
> registers, so there is no point working around only some specific
> functions.

For those specific functions -minline-all-stringops should also work, no?

Richard.

> Uros.
>
> > Thanks.
> >
> > H.J.
> > ---
> > gcc/
> >
> > PR target/95134
> > * config/i386/i386-expand.c (alg_usable_p): Return false for
> > libcall with -mavoid-libcall.
> > (decide_alg): Avoid libcall and use rep_prefix_1_byte instead of
> > libcall with -mavoid-libcall.
> > * config/i386/i386.opt: Add -mavoid-libcall.
> > * doc/invoke.texi: Document -mavoid-libcall.
> >
> > gcc/testsuite/
> >
> > PR target/95134
> > * gcc.target/i386/pr95134-1.c: New test.
> > * gcc.target/i386/pr95134-2.c: Likewise.
> > * gcc.target/i386/pr95134-3.c: Likewise.
> > * gcc.target/i386/pr95134-4.c: Likewise.
> > ---
> >  gcc/config/i386/i386-expand.c | 15 ++-
> >  gcc/config/i386/i386.opt  |  6 +-
> >  gcc/doc/invoke.texi   | 10 +-
> >  gcc/testsuite/gcc.target/i386/pr95134-1.c | 18 ++
> >  gcc/testsuite/gcc.target/i386/pr95134-2.c | 18 ++
> >  gcc/testsuite/gcc.target/i386/pr95134-3.c | 18 ++
> >  gcc/testsuite/gcc.target/i386/pr95134-4.c | 11 +++
> >  7 files changed, 89 insertions(+), 7 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-3.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-4.c
> >
> > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> > index 26531585c5f..b38463bf88c 100644
> > --- a/gcc/config/i386/i386-expand.c
> > +++ b/gcc/config/i386/i386-expand.c
> > @@ -6816,7 +6816,7 @@ alg_usable_p (enum stringop_alg alg, bool memset, 
> > bool have_as)
> >   || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
> > return false;
> >  }
> > -  return true;
> > +  return !flag_avoid_libcall || alg != libcall;
> >  }
> >
> >  /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  
> > */
> > @@ -6889,7 +6889,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT 
> > expected_size,
> >   setup.  */
> >else if (expected_size != -1 && expected_size < 4)
> >  return loop_1_byte;
> > -  else if (expected_size != -1)
> > +  else if (expected_size != -1 && !flag_avoid_libcall)
> >  {
> >enum stringop_alg alg = libcall;
> >bool alg_noalign = false;
> > @@ -6934,6 +6934,9 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT 
> > expected_size,
> > }
> > }
> >  }
> > +
> > +  enum stringop_alg alg;
> > +
> >/* When asked to inline the call anyway, try to pick meaningful choice.
> >   We look for maximal size of block that is faster to copy by hand and
> >   take blocks of at most of that size guessing that average size will
> > @@ -6945,7 +6948,6 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT 
> > expected_size,
> >&& (algs->unknown_size == libcall
> >   || !alg_usable_p (algs->unknown_size, memset, have_as)))
> >  {
> > -  enum stringop_alg alg;
> >HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
> >
> >/* If there aren't any usable algorithms or if recursing already,
> > @@ -6967,8 +6969,11 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT 
> > expected_size,
> > gcc_assert (alg != libcall);
> >return alg;
> >  }
> > -  return (alg_usable_p (algs->unknown_size, memset, have_as)
> > - ? algs->unknown_size : libcall);
> > +  alg = (alg_usable_p (algs->unknown_size, memset, have_as)
> > +? algs->unknown_size : libcall);
> > +  if (flag_avoid_libcall && alg == libcall)
> > +alg = rep_prefix_1_byte;
> > +  return alg;
> >  }
> >
> >  /* Decide on alignment.  We know that the operand is already aligned to 

Re: [PATCH] x86: Add -mavoid-libcall for -mgeneral-regs-only

2020-05-15 Thread Uros Bizjak via Gcc-patches
On Fri, May 15, 2020 at 1:13 AM H.J. Lu  wrote:
>
> The -mgeneral-regs-only option generates code that uses only the
> general-purpose registers.  It prevents the compiler from using vector
> registers.  But GCC may still generate calls to memcpy, memmove, memset
> and memcmp library functions.  In the GNU C library, these library
> functions are implemented with vector registers, which makes the
> -mgeneral-regs-only option less effective.  The new -mavoid-libcall
> option expands memcpy, memmove and memset into REP MOVSB and REP STOSB
> sequences.  This option can be further enhanced with a cmpmem pattern
> to expand memcmp into a REP CMPSB sequence in the future.
>
> Tested on Linux/x86 and Linux/x86-64.  OK for master?

No. Library should provide functions that are appropriate for your
target. There are probably other places in the library that use XMM
registers, so there is no point working around only some specific
functions.

Uros.

> Thanks.
>
> H.J.
> ---
> gcc/
>
> PR target/95134
> * config/i386/i386-expand.c (alg_usable_p): Return false for
> libcall with -mavoid-libcall.
> (decide_alg): Avoid libcall and use rep_prefix_1_byte instead of
> libcall with -mavoid-libcall.
> * config/i386/i386.opt: Add -mavoid-libcall.
> * doc/invoke.texi: Document -mavoid-libcall.
>
> gcc/testsuite/
>
> PR target/95134
> * gcc.target/i386/pr95134-1.c: New test.
> * gcc.target/i386/pr95134-2.c: Likewise.
> * gcc.target/i386/pr95134-3.c: Likewise.
> * gcc.target/i386/pr95134-4.c: Likewise.
> ---
>  gcc/config/i386/i386-expand.c | 15 ++-
>  gcc/config/i386/i386.opt  |  6 +-
>  gcc/doc/invoke.texi   | 10 +-
>  gcc/testsuite/gcc.target/i386/pr95134-1.c | 18 ++
>  gcc/testsuite/gcc.target/i386/pr95134-2.c | 18 ++
>  gcc/testsuite/gcc.target/i386/pr95134-3.c | 18 ++
>  gcc/testsuite/gcc.target/i386/pr95134-4.c | 11 +++
>  7 files changed, 89 insertions(+), 7 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-4.c
>
> diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> index 26531585c5f..b38463bf88c 100644
> --- a/gcc/config/i386/i386-expand.c
> +++ b/gcc/config/i386/i386-expand.c
> @@ -6816,7 +6816,7 @@ alg_usable_p (enum stringop_alg alg, bool memset, bool 
> have_as)
>   || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
> return false;
>  }
> -  return true;
> +  return !flag_avoid_libcall || alg != libcall;
>  }
>
>  /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
> @@ -6889,7 +6889,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT 
> expected_size,
>   setup.  */
>else if (expected_size != -1 && expected_size < 4)
>  return loop_1_byte;
> -  else if (expected_size != -1)
> +  else if (expected_size != -1 && !flag_avoid_libcall)
>  {
>enum stringop_alg alg = libcall;
>bool alg_noalign = false;
> @@ -6934,6 +6934,9 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT 
> expected_size,
> }
> }
>  }
> +
> +  enum stringop_alg alg;
> +
>/* When asked to inline the call anyway, try to pick meaningful choice.
>   We look for maximal size of block that is faster to copy by hand and
>   take blocks of at most of that size guessing that average size will
> @@ -6945,7 +6948,6 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT 
> expected_size,
>&& (algs->unknown_size == libcall
>   || !alg_usable_p (algs->unknown_size, memset, have_as)))
>  {
> -  enum stringop_alg alg;
>HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
>
>/* If there aren't any usable algorithms or if recursing already,
> @@ -6967,8 +6969,11 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT 
> expected_size,
> gcc_assert (alg != libcall);
>return alg;
>  }
> -  return (alg_usable_p (algs->unknown_size, memset, have_as)
> - ? algs->unknown_size : libcall);
> +  alg = (alg_usable_p (algs->unknown_size, memset, have_as)
> +? algs->unknown_size : libcall);
> +  if (flag_avoid_libcall && alg == libcall)
> +alg = rep_prefix_1_byte;
> +  return alg;
>  }
>
>  /* Decide on alignment.  We know that the operand is already aligned to ALIGN
> diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> index c9f7195d423..23b401bd424 100644
> --- a/gcc/config/i386/i386.opt
> +++ b/gcc/config/i386/i386.opt
> @@ -1114,4 +1114,8 @@ Support SERIALIZE built-in functions and code 
> generation.
>
>  mtsxldtrk
>  Target Report Mask(ISA2_TSXLDTRK) Var(ix86_isa_flags2) Save
> -Support TSXLDTRK built-in functions and 

[PATCH] x86: Add -mavoid-libcall for -mgeneral-regs-only

2020-05-14 Thread H.J. Lu via Gcc-patches
The -mgeneral-regs-only option generates code that uses only the
general-purpose registers.  It prevents the compiler from using vector
registers.  But GCC may still generate calls to memcpy, memmove, memset
and memcmp library functions.  In the GNU C library, these library
functions are implemented with vector registers, which makes the
-mgeneral-regs-only option less effective.  The new -mavoid-libcall
option expands memcpy, memmove and memset into REP MOVSB and REP STOSB
sequences.  This option can be further enhanced with a cmpmem pattern
to expand memcmp into a REP CMPSB sequence in the future.

Tested on Linux/x86 and Linux/x86-64.  OK for master?

Thanks.

H.J.
---
gcc/

PR target/95134
* config/i386/i386-expand.c (alg_usable_p): Return false for
libcall with -mavoid-libcall.
(decide_alg): Avoid libcall and use rep_prefix_1_byte instead of
libcall with -mavoid-libcall.
* config/i386/i386.opt: Add -mavoid-libcall.
* doc/invoke.texi: Document -mavoid-libcall.

gcc/testsuite/

PR target/95134
* gcc.target/i386/pr95134-1.c: New test.
* gcc.target/i386/pr95134-2.c: Likewise.
* gcc.target/i386/pr95134-3.c: Likewise.
* gcc.target/i386/pr95134-4.c: Likewise.
---
 gcc/config/i386/i386-expand.c | 15 ++-
 gcc/config/i386/i386.opt  |  6 +-
 gcc/doc/invoke.texi   | 10 +-
 gcc/testsuite/gcc.target/i386/pr95134-1.c | 18 ++
 gcc/testsuite/gcc.target/i386/pr95134-2.c | 18 ++
 gcc/testsuite/gcc.target/i386/pr95134-3.c | 18 ++
 gcc/testsuite/gcc.target/i386/pr95134-4.c | 11 +++
 7 files changed, 89 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr95134-4.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 26531585c5f..b38463bf88c 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -6816,7 +6816,7 @@ alg_usable_p (enum stringop_alg alg, bool memset, bool 
have_as)
  || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
return false;
 }
-  return true;
+  return !flag_avoid_libcall || alg != libcall;
 }
 
 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
@@ -6889,7 +6889,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT 
expected_size,
  setup.  */
   else if (expected_size != -1 && expected_size < 4)
 return loop_1_byte;
-  else if (expected_size != -1)
+  else if (expected_size != -1 && !flag_avoid_libcall)
 {
   enum stringop_alg alg = libcall;
   bool alg_noalign = false;
@@ -6934,6 +6934,9 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT 
expected_size,
}
}
 }
+
+  enum stringop_alg alg;
+
   /* When asked to inline the call anyway, try to pick meaningful choice.
  We look for maximal size of block that is faster to copy by hand and
  take blocks of at most of that size guessing that average size will
@@ -6945,7 +6948,6 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT 
expected_size,
   && (algs->unknown_size == libcall
  || !alg_usable_p (algs->unknown_size, memset, have_as)))
 {
-  enum stringop_alg alg;
   HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
 
   /* If there aren't any usable algorithms or if recursing already,
@@ -6967,8 +6969,11 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT 
expected_size,
gcc_assert (alg != libcall);
   return alg;
 }
-  return (alg_usable_p (algs->unknown_size, memset, have_as)
- ? algs->unknown_size : libcall);
+  alg = (alg_usable_p (algs->unknown_size, memset, have_as)
+? algs->unknown_size : libcall);
+  if (flag_avoid_libcall && alg == libcall)
+alg = rep_prefix_1_byte;
+  return alg;
 }
 
 /* Decide on alignment.  We know that the operand is already aligned to ALIGN
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index c9f7195d423..23b401bd424 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1114,4 +1114,8 @@ Support SERIALIZE built-in functions and code generation.
 
 mtsxldtrk
 Target Report Mask(ISA2_TSXLDTRK) Var(ix86_isa_flags2) Save
-Support TSXLDTRK built-in functions and code generation.
\ No newline at end of file
+Support TSXLDTRK built-in functions and code generation.
+
+mavoid-libcall
+Target Report Var(flag_avoid_libcall) Init(0)
+Avoid generation of libcall.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 850aeac033d..0d2d70419d5 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1364,7 +1364,7 @@ See RS/6000 and PowerPC Options.
 -mstack-protector-guard-reg=@var{reg} @gol
 -mstack-protector-guard-offset=@var{offset}