https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112584

            Bug ID: 112584
           Summary: Suboptimal stack usage on third memcpy
           Product: gcc
           Version: 13.2.1
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: antoshkka at gmail dot com
  Target Milestone: ---

Consider the example:


// Minimal two-field stand-in for a string view: a pointer plus a length.
struct string_view {
    const char* data;    // start of the bytes to copy
    unsigned long size;  // number of bytes to copy
};

// Copies the bytes of s1, s2 and s3, in that order, to the location `data`
// points at, advancing `data` past each piece right after it is copied.
// `data` is taken by reference, so every `data +=` updates the caller's
// pointer. The three string_view arguments are passed by value.
void AppendToCharArray(char*& data, string_view s1, string_view s2, string_view
s3) {
  // Piece 1: copy s1, then step the caller's pointer past it.
  __builtin_memcpy(data, s1.data, s1.size);
  data += s1.size;

  // Piece 2: same pattern, starting where piece 1 ended.
  __builtin_memcpy(data, s2.data, s2.size);
  data += s2.size;

  // Piece 3: the copy the bug summary refers to ("third memcpy").
  __builtin_memcpy(data, s3.data, s3.size);
  data += s3.size;
}


With -O2, GCC generates assembly with 6 push and 6 pop instructions. However,
better assembly is possible:

  ; Proposed code for AppendToCharArray (x86-64, Intel syntax).
  ; Register roles, as set up by the moves below; the incoming registers are
  ; consistent with the SysV AMD64 argument order for the C++ signature:
  ;   r12 = address of the caller's `data` pointer (from rdi)
  ;   r15 = s1.size (from rdx)    rsi = s1.data (left in place for call 1)
  ;   r14 = s2.data (from rcx)    rbx = s2.size (from r8)
  ;   s3 is read from the caller's stack before the third call.
  push r15
  push r14
  push r12
  push rbx
  push rax                      ; never restored (see `add rsp, 8`); presumably only pads rsp to 16-byte alignment for the calls
  mov rbx, r8                   ; rbx = s2.size
  mov r14, rcx                  ; r14 = s2.data
  mov r15, rdx                  ; r15 = s1.size (rdx itself still holds it for the call)
  mov r12, rdi                  ; r12 = &data, kept across all three calls
  mov rdi, qword ptr [rdi]      ; rdi = data (destination of first copy)
  call memcpy                   ; memcpy(data, s1.data, s1.size)
  add r15, qword ptr [r12]      ; r15 = data + s1.size (data reloaded from memory after the call)
  mov qword ptr [r12], r15      ; data += s1.size
  mov rdi, r15                  ; destination = updated data
  mov rsi, r14                  ; source = s2.data
  mov rdx, rbx                  ; length = s2.size
  call memcpy                   ; memcpy(data, s2.data, s2.size)
  add rbx, qword ptr [r12]      ; rbx = data + s2.size (data reloaded again)
  mov qword ptr [r12], rbx      ; data += s2.size
  mov rsi, qword ptr [rsp + 48] ; s3.data: 5 pushes (40 bytes) + return address (8) sit below the stack arguments
  mov r14, qword ptr [rsp + 56] ; s3.size; r14 is free now that s2.data is consumed
  mov rdi, rbx                  ; destination = updated data
  mov rdx, r14                  ; length = s3.size
  call memcpy                   ; memcpy(data, s3.data, s3.size)
  add qword ptr [r12], r14      ; data += s3.size, done directly in memory
  add rsp, 8                    ; discard the slot created by `push rax`
  pop rbx
  pop r12
  pop r14
  pop r15
  ret

Godbolt playground: https://godbolt.org/z/EY8E1GGPz

Reply via email to