On Thu, Jul 1, 2021 at 8:22 AM H.J. Lu <hjl.to...@gmail.com> wrote:
>
> Changes in the v5 patches:
>
> 1. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
> scratch register to avoid stack realignment when expanding memset.
> 2. Use vec_duplicate, instead of adding TARGET_READ_MEMSET_VALUE and
> TARGET_GEN_MEMSET_VALUE, to expand memset if available.
>
> Changes in the v4 patches:
>
> 1. Define x86 MAX_MOVE_MAX to 64, which is the constant maximum number
> of bytes that a single instruction can move quickly between memory and
> registers or between two memory locations.
> 2. Define x86 MOVE_MAX to MOVE_MAX_PIECES, which is the maximum number of
> bytes we can move from memory to memory in one reasonably fast instruction.
> The difference between MAX_MOVE_MAX and MOVE_MAX is that MAX_MOVE_MAX
> must be a constant, independent of compiler options, since it is used in
> reload.h to define struct target_reload and MOVE_MAX can vary, depending
> on compiler options.
>
> Changes in the v3 patches:
>
> 1. Split the TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE changes
> into the generic part and the x86 part.
>
>
> 1. Add TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE to support
> target instructions to duplicate QImode value to TImode/OImode/XImode
> value for memset.
> 2. x86: Avoid stack realignment when copying data
> 3. x86: Remove MAX_BITSIZE_MODE_ANY_INT.  Only the x86 backend defines it.
> 4. x86: Use TImode/OImode/XImode integers for piecewise move and store.
> 5. x86: Add tests for TImode/OImode/XImode for piecewise move and store.
> 6. x86: Adjust existing tests.
>
> On x86-64, SPEC CPU 2017 performance impact is neutral.  Glibc code size
> differences with -O2 build are:
>
>              Before         After
> libc.so     1906572        1906444
>
> Some code sequence differences in libc.so are:
>
> <svcudp_bufcreate@GLIBC_2.2.5>:
>         ...
>         jne    <svcudp_bufcreate@GLIBC_2.2.5+0x318>           |         jne   
>  <svcudp_bufcreate@GLIBC_2.2.5+0x2a8>
>         test   %r15,%r15                                                test  
>  %r15,%r15
>         je     <svcudp_bufcreate@GLIBC_2.2.5+0x318>           |         je    
>  <svcudp_bufcreate@GLIBC_2.2.5+0x2a8>
>         mov    %r13d,(%r14)                                             mov   
>  %r13d,(%r14)
>         lea    0x10(%r14),%rdi                                          lea   
>  0x10(%r14),%rdi
>         mov    $0x1,%ecx                                                mov   
>  $0x1,%ecx
>         mov    %r13d,%edx                                               mov   
>  %r13d,%edx
>         mov    %r15,0x40(%r12)                                          mov   
>  %r15,0x40(%r12)
>         mov    %r15,%rsi                                                mov   
>  %r15,%rsi
>         call   <xdrmem_create@GLIBC_2.2.5>                              call  
>  <xdrmem_create@GLIBC_2.2.5>
>         lea    0xa2f9b(%rip),%rax        # <svcudp_op>        |         lea   
>  0xa2fab(%rip),%rax        # <svcudp_op>
>         xor    %esi,%esi                                                xor   
>  %esi,%esi
>         mov    %ebp,%edi                                                mov   
>  %ebp,%edi
>         mov    %rax,0x8(%r12)                                           mov   
>  %rax,0x8(%r12)
>         movzwl 0x12(%rsp),%eax                                          
> movzwl 0x12(%rsp),%eax
>         mov    $0x8,%edx                                      <
>         lea    0xc(%rsp),%rcx                                           lea   
>  0xc(%rsp),%rcx
>         mov    %r14,0x48(%r12)                                <
>         add    $0x40,%r14                                     <
>         mov    $0x4,%r8d                                                mov   
>  $0x4,%r8d
>                                                               >         movq  
>  $0x0,0x1d0(%r14)
>                                                               >         mov   
>  $0x8,%edx
>         rol    $0x8,%ax                                                 rol   
>  $0x8,%ax
>         mov    %ebp,(%r12)                                    |         mov   
>  %r14,0x48(%r12)
>         movq   $0x0,0x190(%r14)                               |         add   
>  $0x40,%r14
>         mov    %ax,0x4(%r12)                                  <
>         mov    %r14,0x30(%r12)                                          mov   
>  %r14,0x30(%r12)
>                                                               >         mov   
>  %ax,0x4(%r12)
>                                                               >         mov   
>  %ebp,(%r12)
>         movl   $0x1,0xc(%rsp)                                           movl  
>  $0x1,0xc(%rsp)
>         call   <setsockopt>                                             call  
>  <setsockopt>
>         mov    %r12,%rdi                                                mov   
>  %r12,%rdi
>         movabs $0x101010101010101,%rdx                        <
>         test   %eax,%eax                                                test  
>  %eax,%eax
>         mov    $0xff,%eax                                               mov   
>  $0xff,%eax
>         cmove  %eax,%ebx                                                cmove 
>  %eax,%ebx
>         movzbl %bl,%eax                                       |         movd  
>  %ebx,%xmm0
>         mov    %ebx,0xc(%rsp)                                           mov   
>  %ebx,0xc(%rsp)
>         mov    %rax,%rsi                                      |         
> punpcklbw %xmm0,%xmm0
>         imul   %rdx,%rsi                                      |         
> punpcklwd %xmm0,%xmm0
>         mul    %rdx                                           |         
> pshufd $0x0,%xmm0,%xmm0
>         add    %rsi,%rdx                                      |         
> movups %xmm0,0x50(%r12)
>         mov    %rax,0x50(%r12)                                |         
> movups %xmm0,0x60(%r12)
>         mov    %rdx,0x58(%r12)                                |         
> movups %xmm0,0x70(%r12)
>         mov    %rax,0x60(%r12)                                |         
> movups %xmm0,0x80(%r12)
>         mov    %rdx,0x68(%r12)                                |         
> movups %xmm0,0x90(%r12)
>         mov    %rax,0x70(%r12)                                |         
> movups %xmm0,0xa0(%r12)
>         mov    %rdx,0x78(%r12)                                |         
> movups %xmm0,0xb0(%r12)
>         mov    %rax,0x80(%r12)                                |         
> movups %xmm0,0xc0(%r12)
>         mov    %rdx,0x88(%r12)                                |         
> movups %xmm0,0xd0(%r12)
>         mov    %rax,0x90(%r12)                                |         
> movups %xmm0,0xe0(%r12)
>         mov    %rdx,0x98(%r12)                                |         
> movups %xmm0,0xf0(%r12)
>         mov    %rax,0xa0(%r12)                                |         
> movups %xmm0,0x100(%r12)
>         mov    %rdx,0xa8(%r12)                                |         
> movups %xmm0,0x110(%r12)
>         mov    %rax,0xb0(%r12)                                |         
> movups %xmm0,0x120(%r12)
>         mov    %rdx,0xb8(%r12)                                |         
> movups %xmm0,0x130(%r12)
>         mov    %rax,0xc0(%r12)                                |         
> movups %xmm0,0x140(%r12)
>         mov    %rdx,0xc8(%r12)                                <
>         mov    %rax,0xd0(%r12)                                <
>         mov    %rdx,0xd8(%r12)                                <
>         mov    %rax,0xe0(%r12)                                <
>         mov    %rdx,0xe8(%r12)                                <
>         mov    %rax,0xf0(%r12)                                <
>         mov    %rdx,0xf8(%r12)                                <
>         mov    %rax,0x100(%r12)                               <
>         mov    %rdx,0x108(%r12)                               <
>         mov    %rax,0x110(%r12)                               <
>         mov    %rdx,0x118(%r12)                               <
>         mov    %rax,0x120(%r12)                               <
>         mov    %rdx,0x128(%r12)                               <
>         mov    %rax,0x130(%r12)                               <
>         mov    %rdx,0x138(%r12)                               <
>         mov    %rax,0x140(%r12)                               <
>         mov    %rdx,0x148(%r12)                               <
>         call   <xprt_register@GLIBC_2.2.5>                              call  
>  <xprt_register@GLIBC_2.2.5>
>         add    $0x28,%rsp                                               add   
>  $0x28,%rsp
>         mov    %r12,%rax                                                mov   
>  %r12,%rax
>         pop    %rbx                                                     pop   
>  %rbx
>         pop    %rbp                                                     pop   
>  %rbp
>         pop    %r12                                                     pop   
>  %r12
>         pop    %r13                                                     pop   
>  %r13
>         pop    %r14                                                     pop   
>  %r14
>         pop    %r15                                                     pop   
>  %r15
>         ret                                                             ret
>
>
> *** BLURB HERE ***
>
> H.J. Lu (11):
>   Rewrite memset with TARGET_GEN_MEMSET_SCRATCH_RTX
>   x86: Add TARGET_GEN_MEMSET_SCRATCH_RTX
>   x86: Avoid stack realignment when copying data
>   x86: Update piecewise move and store
>   x86: Add AVX2 tests for PR middle-end/90773
>   x86: Add tests for piecewise move and store
>   x86: Also pass -mno-avx to pr72839.c
>   x86: Also pass -mno-avx to cold-attribute-1.c
>   x86: Also pass -mno-avx to sw-1.c for ia32
>   x86: Update gcc.target/i386/incoming-11.c
>   x86: Also pass -mno-sse to vect8-ret.c
>
>  gcc/builtins.c                                | 123 +++++++++++++++---
>  gcc/config/i386/i386-expand.c                 |   4 +-
>  gcc/config/i386/i386.c                        |  27 +++-
>  gcc/config/i386/i386.h                        |  40 ++++--
>  gcc/doc/tm.texi                               |   5 +
>  gcc/doc/tm.texi.in                            |   2 +
>  gcc/target.def                                |   7 +
>  .../gcc.target/i386/cold-attribute-1.c        |   2 +-
>  gcc/testsuite/gcc.target/i386/eh_return-1.c   |  26 ++++
>  gcc/testsuite/gcc.target/i386/incoming-11.c   |   2 +-
>  .../gcc.target/i386/pieces-memcpy-10.c        |  16 +++
>  .../gcc.target/i386/pieces-memcpy-11.c        |  17 +++
>  .../gcc.target/i386/pieces-memcpy-12.c        |  16 +++
>  .../gcc.target/i386/pieces-memcpy-13.c        |  16 +++
>  .../gcc.target/i386/pieces-memcpy-14.c        |  17 +++
>  .../gcc.target/i386/pieces-memcpy-15.c        |  16 +++
>  .../gcc.target/i386/pieces-memcpy-16.c        |  16 +++
>  .../gcc.target/i386/pieces-memcpy-7.c         |  15 +++
>  .../gcc.target/i386/pieces-memcpy-8.c         |  14 ++
>  .../gcc.target/i386/pieces-memcpy-9.c         |  14 ++
>  .../gcc.target/i386/pieces-memset-1.c         |  16 +++
>  .../gcc.target/i386/pieces-memset-10.c        |  16 +++
>  .../gcc.target/i386/pieces-memset-11.c        |  16 +++
>  .../gcc.target/i386/pieces-memset-12.c        |  16 +++
>  .../gcc.target/i386/pieces-memset-13.c        |  16 +++
>  .../gcc.target/i386/pieces-memset-14.c        |  16 +++
>  .../gcc.target/i386/pieces-memset-15.c        |  16 +++
>  .../gcc.target/i386/pieces-memset-16.c        |  16 +++
>  .../gcc.target/i386/pieces-memset-17.c        |  16 +++
>  .../gcc.target/i386/pieces-memset-18.c        |  16 +++
>  .../gcc.target/i386/pieces-memset-19.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-2.c         |  12 ++
>  .../gcc.target/i386/pieces-memset-20.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-21.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-22.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-23.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-24.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-25.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-26.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-27.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-28.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-29.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-3.c         |  18 +++
>  .../gcc.target/i386/pieces-memset-30.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-31.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-32.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-33.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-34.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-35.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-36.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-37.c        |  15 +++
>  .../gcc.target/i386/pieces-memset-38.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-39.c        |  16 +++
>  .../gcc.target/i386/pieces-memset-4.c         |  16 +++
>  .../gcc.target/i386/pieces-memset-40.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-41.c        |  16 +++
>  .../gcc.target/i386/pieces-memset-42.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-43.c        |  17 +++
>  .../gcc.target/i386/pieces-memset-5.c         |  12 ++
>  .../gcc.target/i386/pieces-memset-6.c         |  16 +++
>  .../gcc.target/i386/pieces-memset-7.c         |  16 +++
>  .../gcc.target/i386/pieces-memset-8.c         |  16 +++
>  .../gcc.target/i386/pieces-memset-9.c         |  16 +++
>  gcc/testsuite/gcc.target/i386/pr100865-1.c    |   2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-10a.c  |   4 +-
>  gcc/testsuite/gcc.target/i386/pr100865-10b.c  |   4 +-
>  gcc/testsuite/gcc.target/i386/pr100865-2.c    |   2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-3.c    |   2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-4a.c   |   6 +-
>  gcc/testsuite/gcc.target/i386/pr100865-4b.c   |   8 +-
>  gcc/testsuite/gcc.target/i386/pr72839.c       |   2 +-
>  gcc/testsuite/gcc.target/i386/pr90773-1.c     |  10 +-
>  gcc/testsuite/gcc.target/i386/pr90773-14.c    |   4 +-
>  gcc/testsuite/gcc.target/i386/pr90773-15.c    |  14 ++
>  gcc/testsuite/gcc.target/i386/pr90773-16.c    |  14 ++
>  gcc/testsuite/gcc.target/i386/pr90773-17.c    |  14 ++
>  gcc/testsuite/gcc.target/i386/pr90773-18.c    |  15 +++
>  gcc/testsuite/gcc.target/i386/pr90773-19.c    |  14 ++
>  gcc/testsuite/gcc.target/i386/pr90773-20.c    |  13 ++
>  gcc/testsuite/gcc.target/i386/pr90773-21.c    |  13 ++
>  gcc/testsuite/gcc.target/i386/pr90773-22.c    |  13 ++
>  gcc/testsuite/gcc.target/i386/pr90773-23.c    |  13 ++
>  gcc/testsuite/gcc.target/i386/pr90773-24.c    |   2 +-
>  gcc/testsuite/gcc.target/i386/pr90773-25.c    |   2 +-
>  gcc/testsuite/gcc.target/i386/pr90773-26.c    |  21 +++
>  gcc/testsuite/gcc.target/i386/pr90773-4.c     |   2 +-
>  gcc/testsuite/gcc.target/i386/sw-1.c          |   1 +
>  gcc/testsuite/gcc.target/i386/vect8-ret.c     |   2 +-
>  88 files changed, 1231 insertions(+), 64 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/eh_return-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-11.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-12.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-13.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-14.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-15.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-16.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-7.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-8.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-9.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-10.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-11.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-12.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-13.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-14.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-15.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-16.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-17.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-18.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-19.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-20.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-21.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-22.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-23.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-24.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-25.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-26.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-27.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-28.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-29.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-30.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-31.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-32.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-33.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-34.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-35.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-36.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-37.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-38.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-39.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-4.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-40.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-41.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-42.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-43.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-5.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-6.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-7.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-8.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-9.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-15.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-16.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-17.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-18.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-19.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-20.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-21.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-22.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-23.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-26.c
>
> --
> 2.31.1
>

PING.

--
H.J.

Reply via email to