https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110104

            Bug ID: 110104
           Summary: gcc produces sub-optimal code for _addcarry_u64 chain
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: slash.tmp at free dot fr
  Target Milestone: ---

Consider the following code:

#include <x86intrin.h>
typedef unsigned long long u64;
typedef unsigned __int128 u128;
/* Reproducer: 64x64->128-bit multiply accumulated into a 192-bit value
   (acc[0..2]) via an _addcarry_u64 chain.  Expected codegen is the
   add/adc/adc memory-operand sequence shown for testcase2; the reported
   bug is that GCC instead emits an extra movq %rdx,%rax round-trip for
   the middle limb.  NOTE(review): keep this source byte-identical when
   reproducing — any change may alter the emitted assembly.  */
void testcase1(u64 *acc, u64 a, u64 b)
{
  u128 res = (u128)a*b;
  u64 lo = res, hi = res >> 64;  /* split the 128-bit product into limbs */
  unsigned char cf = 0;
  cf = _addcarry_u64(cf, lo, acc[0], acc+0);  /* acc[0] += lo         */
  cf = _addcarry_u64(cf, hi, acc[1], acc+1);  /* acc[1] += hi + carry */
  cf = _addcarry_u64(cf,  0, acc[2], acc+2);  /* acc[2] += carry      */
}
/* Reference implementation: the same multiply-accumulate expressed as
   inline asm, producing the optimal add/adc/adc sequence directly on
   the three memory operands.  Serves as the baseline testcase1 should
   match.  */
void testcase2(u64 *acc, u64 a, u64 b)
{
  u128 res = (u128)a * b;
  u64 lo = res, hi = res >> 64;  /* split the 128-bit product into limbs */
  /* acc[0] += lo;  acc[1] += hi + CF;  acc[2] += CF */
  asm("add %[LO], %[D0]\n\t" "adc %[HI], %[D1]\n\t" "adc $0, %[D2]" :
  [D0] "+m" (acc[0]), [D1] "+m" (acc[1]), [D2] "+m" (acc[2]) :
  [LO] "r" (lo), [HI] "r" (hi) : "cc");
}

Compiling with either
gcc-trunk -Wall -Wextra -O3 -S testcase.c
or
gcc-trunk -Wall -Wextra -Os -S testcase.c
generates the same code:

// rdi = acc, rsi = a, rdx = b

testcase1:
  movq %rsi, %rax
  mulq %rdx
  addq %rax, (%rdi)
  movq %rdx, %rax
  adcq 8(%rdi), %rax
  adcq $0, 16(%rdi)
  movq %rax, 8(%rdi)
  ret

testcase2:
  movq %rsi, %rax       ; rax = rsi = a
  mulq %rdx             ; rdx:rax = rax*rdx = a*b
  add %rax, (%rdi)      ; acc[0] += lo
  adc %rdx, 8(%rdi)     ; acc[1] += hi + cf
  adc $0, 16(%rdi)      ; acc[2] += cf
  ret


Conclusion:
gcc generates the expected code for testcase2.
However, the code for testcase1 is sub-optimal.

  movq %rdx, %rax
  adcq 8(%rdi), %rax
  movq %rax, 8(%rdi)

instead of

  adc %rdx, 8(%rdi)     ; acc[1] += hi + cf


The copy of rdx to rax is useless.
The separate load/add and store operations can be merged into a single
read-modify-write adc with a memory destination.

Reply via email to