[Bug target/125856] New: x86 gcc16 codesize regression related to memcpy inline caused by r16-2047-g401199377c5004

liuhongt at gcc dot gnu.org via Gcc-bugs Wed, 17 Jun 2026 01:16:11 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=125856


            Bug ID: 125856
           Summary: x86 gcc16 codesize regression related to memcpy inline
                    caused by r16-2047-g401199377c5004
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: liuhongt at gcc dot gnu.org
  Target Milestone: ---

cat test.c

#include <string.h>
/* Reproducer: copy n bytes from src to dst, but only when n is at most 15.
   For larger n the function does nothing.  The length passed to memcpy is
   therefore a runtime value known to lie in the range [0, 15].  */
void bounded_copy(char *dst, const char *src, unsigned long n) {
   if (n <= 15)
       memcpy(dst, src, n);
}

GCC15.1 -O2 generates

# System V AMD64 argument order: rdi = dst, rsi = src, rdx = n.
# The bound check is done inline and the copy itself is left to the library.
bounded_copy(char*, char const*, unsigned long):
        cmpq    $15, %rdx
        # Unsigned n <= 15: go do the copy; otherwise fall through and return.
        jbe     .L4
        ret
.L4:
        # Tail call: rdi/rsi/rdx are untouched, so memcpy(dst, src, n) returns
        # straight to the caller of bounded_copy.
        jmp     memcpy

GCC16.1 -O2 generates

# System V AMD64 argument order: rdi = dst, rsi = src, rdx = n.
# Here the memcpy call has been expanded inline into a size-dispatched copy.
# In this listing .L17 is entered only from the n <= 15 branch, so on every
# path below n is in [0, 15]; the paths for n >= 16 (.L3/.L10, .L18, .L19)
# can never execute and contribute only code size.
bounded_copy(char*, char const*, unsigned long):
        cmpq    $15, %rdx
        # Unsigned n <= 15: go to the inlined copy; otherwise return at .L1.
        jbe     .L17
.L1:
        ret
.L17:
        # Dispatch on the length (its low 32 bits / individual bits of dl).
        # n >= 64 -> .L3.  Never taken, because n <= 15 here.
        cmpl    $64, %edx
        jnb     .L3
        # Bit 5 set (n in 32..63) -> .L18.  Never taken, because n <= 15 here.
        testb   $32, %dl
        jne     .L18
        # Bit 4 set (n in 16..31) -> .L19.  Never taken, because n <= 15 here.
        testb   $16, %dl
        jne     .L19
        # Bit 3 set (n in 8..15) -> .L20.
        testb   $8, %dl
        jne     .L20
        # Bit 2 set (n in 4..7) -> .L21.
        testb   $4, %dl
        jne     .L21
        # n == 0: nothing to copy.
        testl   %edx, %edx
        je      .L1
        # n in 1..3: copy the first byte.
        movzbl  (%rsi), %eax
        movb    %al, (%rdi)
        # Bit 1 clear means n == 1, so the single byte above was everything.
        testb   $2, %dl
        je      .L1
        # n is 2 or 3: copy the last two bytes (overlaps the first byte when n == 2).
        movl    %edx, %edx
        movzwl  -2(%rsi,%rdx), %eax
        movw    %ax, -2(%rdi,%rdx)
        ret
.L3:
        # Path for n >= 64.
        # eax = n, edx = n - 1, rcx = dst + n, rax = src + n.
        movl    %edx, %eax
        subl    $1, %edx
        leaq    (%rdi,%rax), %rcx
        addq    %rsi, %rax
        # Copy the last 64 bytes as four unaligned 16-byte moves.
        movdqu  -64(%rax), %xmm0
        movups  %xmm0, -64(%rcx)
        movdqu  -48(%rax), %xmm0
        movups  %xmm0, -48(%rcx)
        movdqu  -32(%rax), %xmm0
        movups  %xmm0, -32(%rcx)
        movdqu  -16(%rax), %xmm0
        movups  %xmm0, -16(%rcx)
        # If n - 1 < 64 (i.e. n == 64) the tail copy covered everything.
        cmpl    $64, %edx
        jb      .L1
        # edx = (n - 1) rounded down to a multiple of 64: the number of leading
        # bytes to copy in 64-byte chunks.  ecx = running byte offset.
        andl    $-64, %edx
        xorl    %ecx, %ecx
.L10:
        # Loop body: copy 64 bytes at offset eax, then advance ecx by 64.
        movl    %ecx, %eax
        addl    $64, %ecx
        movdqu  (%rsi,%rax), %xmm3
        movdqu  16(%rsi,%rax), %xmm2
        movdqu  32(%rsi,%rax), %xmm1
        movdqu  48(%rsi,%rax), %xmm0
        movups  %xmm3, (%rdi,%rax)
        movups  %xmm2, 16(%rdi,%rax)
        movups  %xmm1, 32(%rdi,%rax)
        movups  %xmm0, 48(%rdi,%rax)
        # Continue while the next offset is still below the rounded length.
        cmpl    %edx, %ecx
        jb      .L10
        ret
.L18:
        # Path for n in 32..63: copy the first 32 bytes and the last 32 bytes
        # (the two ranges may overlap).
        # rax = dst + n + 32 and rdx = src + n + 32, so the -64/-48 offsets
        # below address the final 32 bytes of each buffer.
        movdqu  (%rsi), %xmm0
        movl    %edx, %edx
        leaq    32(%rdi,%rdx), %rax
        leaq    32(%rsi,%rdx), %rdx
        movups  %xmm0, (%rdi)
        movdqu  16(%rsi), %xmm0
        movups  %xmm0, 16(%rdi)
        movdqu  -64(%rdx), %xmm0
        movups  %xmm0, -64(%rax)
        movdqu  -48(%rdx), %xmm0
        movups  %xmm0, -48(%rax)
        ret
.L19:
        # Path for n in 16..31: copy the first 16 and the last 16 bytes
        # (possibly overlapping).
        movdqu  (%rsi), %xmm0
        movl    %edx, %edx
        movups  %xmm0, (%rdi)
        movdqu  -16(%rsi,%rdx), %xmm0
        movups  %xmm0, -16(%rdi,%rdx)
        ret
.L20:
        # Path for n in 8..15: copy the first 8 and the last 8 bytes
        # (possibly overlapping).
        movq    (%rsi), %rax
        movl    %edx, %edx
        movq    %rax, (%rdi)
        movq    -8(%rsi,%rdx), %rax
        movq    %rax, -8(%rdi,%rdx)
        ret
.L21:
        # Path for n in 4..7: copy the first 4 and the last 4 bytes
        # (possibly overlapping).
        movl    (%rsi), %eax
        movl    %edx, %edx
        movl    %eax, (%rdi)
        movl    -4(%rsi,%rdx), %eax
        movl    %eax, -4(%rdi,%rdx)
        ret

I think that in this case we should probably keep the libcall instead of inlining it.

[Bug target/125856] New: x86 gcc16 codesize regression related to memcpy inline caused by r16-2047-g401199377c5004

Reply via email to