https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120683

--- Comment #5 from GCC Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by H.J. Lu <h...@gcc.gnu.org>:

https://gcc.gnu.org/g:401199377c50045ede560daf3f6e8b51749c2a87

commit r16-2047-g401199377c50045ede560daf3f6e8b51749c2a87
Author: H.J. Lu <hjl.to...@gmail.com>
Date:   Tue Jun 17 10:17:17 2025 +0800

    x86: Improve vector_loop/unrolled_loop for memset/memcpy

    1. Don't generate the loop if the loop count is 1.
    2. For memset with vector on a small size, use a vector if the small size
    supports a vector; otherwise, use the scalar value.
    3. Always expand vector-version of memset for vector_loop.
    4. Always duplicate the promoted scalar value for vector_loop if it is
    neither 0 nor -1.
    5. Use misaligned prologue if alignment isn't needed.  When misaligned
    prologue is used, check if destination is actually aligned and update
    destination alignment if aligned.
    6. Use move_by_pieces and store_by_pieces for memcpy and memset epilogues
    with the fixed epilogue size to enable overlapping moves and stores.

    The included tests show that codegen of vector_loop/unrolled_loop for
    memset/memcpy is significantly improved.  For

    void
    foo (void *p1, size_t len)
    {
      __builtin_memset (p1, 0, len);
    }

    with

    -O2 -minline-all-stringops
-mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -march=x86-64

    we used to generate

    foo:
    .LFB0:
            .cfi_startproc
            movq    %rdi, %rax
            pxor    %xmm0, %xmm0
            cmpq    $64, %rsi
            jnb     .L18
    .L2:
            andl    $63, %esi
            je      .L1
            xorl    %edx, %edx
            testb   $1, %sil
            je      .L5
            movl    $1, %edx
            movb    $0, (%rax)
            cmpq    %rsi, %rdx
            jnb     .L19
    .L5:
            movb    $0, (%rax,%rdx)
            movb    $0, 1(%rax,%rdx)
            addq    $2, %rdx
            cmpq    %rsi, %rdx
            jb      .L5
    .L1:
            ret
            .p2align 4,,10
            .p2align 3
    .L18:
            movq    %rsi, %rdx
            xorl    %eax, %eax
            andq    $-64, %rdx
    .L3:
            movups  %xmm0, (%rdi,%rax)
            movups  %xmm0, 16(%rdi,%rax)
            movups  %xmm0, 32(%rdi,%rax)
            movups  %xmm0, 48(%rdi,%rax)
            addq    $64, %rax
            cmpq    %rdx, %rax
            jb      .L3
            addq    %rdi, %rax
            jmp     .L2
    .L19:
            ret
            .cfi_endproc

    with very poor prologue/epilogue.  With this patch, we now generate:

    foo:
    .LFB0:
            .cfi_startproc
            pxor    %xmm0, %xmm0
            cmpq    $64, %rsi
            jnb     .L2
            testb   $32, %sil
            jne     .L19
            testb   $16, %sil
            jne     .L20
            testb   $8, %sil
            jne     .L21
            testb   $4, %sil
            jne     .L22
            testq   %rsi, %rsi
            jne     .L23
    .L1:
            ret
            .p2align 4,,10
            .p2align 3
    .L2:
            movups  %xmm0, -64(%rdi,%rsi)
            movups  %xmm0, -48(%rdi,%rsi)
            movups  %xmm0, -32(%rdi,%rsi)
            movups  %xmm0, -16(%rdi,%rsi)
            subq    $1, %rsi
            cmpq    $64, %rsi
            jb      .L1
            andq    $-64, %rsi
            xorl    %eax, %eax
    .L9:
            movups  %xmm0, (%rdi,%rax)
            movups  %xmm0, 16(%rdi,%rax)
            movups  %xmm0, 32(%rdi,%rax)
            movups  %xmm0, 48(%rdi,%rax)
            addq    $64, %rax
            cmpq    %rsi, %rax
            jb      .L9
            ret
            .p2align 4,,10
            .p2align 3
    .L23:
            movb    $0, (%rdi)
            testb   $2, %sil
            je      .L1
            xorl    %eax, %eax
            movw    %ax, -2(%rdi,%rsi)
            ret
            .p2align 4,,10
            .p2align 3
    .L19:
            movups  %xmm0, (%rdi)
            movups  %xmm0, 16(%rdi)
            movups  %xmm0, -32(%rdi,%rsi)
            movups  %xmm0, -16(%rdi,%rsi)
            ret
            .p2align 4,,10
            .p2align 3
    .L20:
            movups  %xmm0, (%rdi)
            movups  %xmm0, -16(%rdi,%rsi)
            ret
            .p2align 4,,10
            .p2align 3
    .L21:
            movq    $0, (%rdi)
            movq    $0, -8(%rdi,%rsi)
            ret
            .p2align 4,,10
            .p2align 3
    .L22:
            movl    $0, (%rdi)
            movl    $0, -4(%rdi,%rsi)
            ret
            .cfi_endproc

    gcc/

            PR target/120670
            PR target/120683
            * config/i386/i386-expand.cc (expand_set_or_cpymem_via_loop):
            Don't generate the loop if the loop count is 1.
            (expand_cpymem_epilogue): Use move_by_pieces.
            (setmem_epilogue_gen_val): New.
            (expand_setmem_epilogue): Use store_by_pieces.
            (expand_small_cpymem_or_setmem): Choose cpymem mode from MOVE_MAX.
            For memset with vector and the size is smaller than the vector
            size, first try the narrower vector, otherwise, use the scalar
            value.
            (promote_duplicated_reg): Duplicate the scalar value for vector.
            (ix86_expand_set_or_cpymem): Always expand vector-version of
            memset for vector_loop.  Use misaligned prologue if alignment
            isn't needed and destination isn't aligned.  Always initialize
            vec_promoted_val from the promoted scalar value for vector_loop.

    gcc/testsuite/

            PR target/120670
            PR target/120683
            * gcc.target/i386/auto-init-padding-9.c: Updated.
            * gcc.target/i386/memcpy-strategy-12.c: Likewise.
            * gcc.target/i386/memset-strategy-25.c: Likewise.
            * gcc.target/i386/memset-strategy-29.c: Likewise.
            * gcc.target/i386/memset-strategy-30.c: Likewise.
            * gcc.target/i386/memset-strategy-31.c: Likewise.
            * gcc.target/i386/memcpy-pr120683-1.c: New test.
            * gcc.target/i386/memcpy-pr120683-2.c: Likewise.
            * gcc.target/i386/memcpy-pr120683-3.c: Likewise.
            * gcc.target/i386/memcpy-pr120683-4.c: Likewise.
            * gcc.target/i386/memcpy-pr120683-5.c: Likewise.
            * gcc.target/i386/memcpy-pr120683-6.c: Likewise.
            * gcc.target/i386/memcpy-pr120683-7.c: Likewise.
            * gcc.target/i386/memset-pr120683-1.c: Likewise.
            * gcc.target/i386/memset-pr120683-2.c: Likewise.
            * gcc.target/i386/memset-pr120683-3.c: Likewise.
            * gcc.target/i386/memset-pr120683-4.c: Likewise.
            * gcc.target/i386/memset-pr120683-5.c: Likewise.
            * gcc.target/i386/memset-pr120683-6.c: Likewise.
            * gcc.target/i386/memset-pr120683-7.c: Likewise.
            * gcc.target/i386/memset-pr120683-8.c: Likewise.
            * gcc.target/i386/memset-pr120683-9.c: Likewise.
            * gcc.target/i386/memset-pr120683-10.c: Likewise.
            * gcc.target/i386/memset-pr120683-11.c: Likewise.
            * gcc.target/i386/memset-pr120683-12.c: Likewise.
            * gcc.target/i386/memset-pr120683-13.c: Likewise.
            * gcc.target/i386/memset-pr120683-14.c: Likewise.
            * gcc.target/i386/memset-pr120683-15.c: Likewise.
            * gcc.target/i386/memset-pr120683-16.c: Likewise.
            * gcc.target/i386/memset-pr120683-17.c: Likewise.
            * gcc.target/i386/memset-pr120683-18.c: Likewise.
            * gcc.target/i386/memset-pr120683-19.c: Likewise.
            * gcc.target/i386/memset-pr120683-20.c: Likewise.
            * gcc.target/i386/memset-pr120683-21.c: Likewise.
            * gcc.target/i386/memset-pr120683-22.c: Likewise.
            * gcc.target/i386/memset-pr120683-23.c: Likewise.

Reply via email to