https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117860
Bug ID: 117860
Summary: GCC emits an unnecessary mov for x86
_addcarry/_subborrow intrinsic calls where the second
operand is a constant that is within the range of a
32-bit integer
Product: gcc
Version: 14.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: john_platts at hotmail dot com
Target Milestone: ---
Here is a snippet of C99/C++ code for which GCC fails to emit a single adc/sbb
instruction with an immediate operand when the second adc/sbb operand (that is,
the third argument of the _addcarry_u32/_addcarry_u64/_subborrow_u32/
_subborrow_u64 intrinsics) is a compile-time constant that is within the range
of a 32-bit integer:
#include <stdint.h>
#if (defined(__GNUC__) || defined(__clang__))
#include <immintrin.h>
#elif defined(_MSC_VER)
#include <intrin.h>
#endif
/*
 * Result type returned by value from SomeAddFunc and SomeSubFunc: three
 * 64-bit words, filled in from lowest (lo64) to highest (hi64) by the
 * carry/borrow chains in those functions.
 */
typedef struct {
uint64_t lo64;
uint64_t mid64;
uint64_t hi64;
} UInt192;
/*
 * Reproducer for the add-with-carry case.
 *
 * Adds b to a_lo, then adds the constant 5 plus the resulting carry to a_hi,
 * chaining the carry through _addcarry_u64.
 *
 * a_lo, a_hi: low and high 64-bit words of the first operand.
 * b:          value added to the low word.
 *
 * Returns a UInt192 with
 *   lo64  = low 64 bits of a_lo + b,
 *   mid64 = low 64 bits of a_hi + 5 + carry-out of the first addition,
 *   hi64  = carry-out of the second addition (0 or 1).
 */
UInt192 SomeAddFunc(uint64_t a_lo, uint64_t a_hi, uint64_t b) {
UInt192 result;
unsigned char cf;
/* NOTE(review): presumably unsigned long long rather than uint64_t so that
   &sum matches the intrinsic's output-pointer type -- confirm. */
unsigned long long sum;
/* Low word: carry-in is 0; cf receives the carry-out. */
cf = _addcarry_u64(0, a_lo, b, &sum);
result.lo64 = sum;
/* Middle word: the literal 5 is the constant operand this report is about;
   GCC 14.2.0 loads it into a register before the adc instead of using an
   immediate. */
cf = _addcarry_u64(cf, a_hi, 5, &sum);
result.mid64 = sum;
/* High word is just the final carry flag. */
result.hi64 = cf;
return result;
}
/*
 * Reproducer for the subtract-with-borrow case.
 *
 * Subtracts b from a_lo, then subtracts the constant 17 plus the resulting
 * borrow from a_hi, chaining the borrow through _subborrow_u64.
 *
 * a_lo, a_hi: low and high 64-bit words of the first operand.
 * b:          value subtracted from the low word.
 *
 * Returns a UInt192 with
 *   lo64  = low 64 bits of a_lo - b,
 *   mid64 = low 64 bits of a_hi - 17 - borrow-out of the first subtraction,
 *   hi64  = 0 - 0 - borrow-out of the second subtraction, i.e. 0 when there
 *           was no borrow and all-ones when there was.
 */
UInt192 SomeSubFunc(uint64_t a_lo, uint64_t a_hi, uint64_t b) {
UInt192 result;
unsigned char cf;
/* NOTE(review): presumably unsigned long long rather than uint64_t so that
   &diff matches the intrinsic's output-pointer type -- confirm. */
unsigned long long diff;
/* Low word: borrow-in is 0; cf receives the borrow-out. */
cf = _subborrow_u64(0, a_lo, b, &diff);
result.lo64 = diff;
/* Middle word: the literal 17 is the constant operand this report is about;
   GCC 14.2.0 loads it into a register before the sbb instead of using an
   immediate. */
cf = _subborrow_u64(cf, a_hi, 17, &diff);
result.mid64 = diff;
/* Spread the final borrow across a whole word (0 - 0 - cf); the borrow-out
   of this last step is deliberately discarded. */
(void)_subborrow_u64(cf, 0, 0, &diff);
result.hi64 = diff;
return result;
}
Here is the code that GCC 14.2.0 generates for the above snippet with the -O2
option:
SomeAddFunc:
add rsi, rcx
mov ecx, 5
mov rax, rdi
adc rdx, rcx
movq xmm0, rsi
movq xmm1, rdx
setc dl
punpcklqdq xmm0, xmm1
movzx edx, dl
mov QWORD PTR [rdi+16], rdx
movups XMMWORD PTR [rdi], xmm0
ret
SomeSubFunc:
sub rsi, rcx
mov ecx, 17
mov rax, rdi
sbb rdx, rcx
movq xmm0, rsi
movq xmm1, rdx
sbb rdx, rdx
punpcklqdq xmm0, xmm1
mov QWORD PTR [rdi+16], rdx
movups XMMWORD PTR [rdi], xmm0
ret
In the SomeAddFunc code that is generated by GCC 14.2.0, GCC fails to optimize
the following instructions down to adc rdx, 5 when optimizations are enabled:
mov ecx, 5
adc rdx, rcx
Likewise, in the SomeSubFunc code that is generated by GCC 14.2.0, GCC fails to
optimize the following instructions down to sbb rdx, 17 when optimizations are
enabled:
mov ecx, 17
sbb rdx, rcx
A demonstration of the above snippet being compiled with GCC 14.2.0, Clang
19.1.0, and MSVC v19.40 can be found at https://godbolt.org/z/zW8WToP5G.