Thank you for suggesting a new GCD implementation and sharing benchmark
results from two compilers!
`gcd_impulse` is the fastest with Clang but the slowest with GCC, which looks
strange, so I compared the generated assembly code.
* Clang version 18.1.8
* GCC version 13.3.1
`var u = u shr u.countTrailingZeroBits()` is compiled to
rep bsf rcx, rdi
sar rdi, cl
on Clang, and to
xor ecx, ecx
mov rax, rdi
rep bsf rcx, rdi
sar rax, cl
on GCC.
GCC zeroes the destination operand (the first operand of bsf) and copies the
source operand before counting bits. I don't think it needs to do so.
Clang uses conditional moves (cmovle and cmovge), but GCC uses conditional jumps.
That might also make the Clang-generated code faster.
`gcd_impulse` assembly code from Clang:
gcd_impulse__test_u77:
# gcd_impulse as emitted by Clang (GAS, Intel operand order: op dst, src).
# Inputs are read from rdi and rsi and the result is returned in rax
# (presumably the SysV AMD64 convention -- confirm against the build target).
# In the comments below: u = rdi, v = rsi on entry.
# Result: v if u == 0; u if v == 0; 0 if the nimInErrorMode flag byte is set;
# otherwise the value left in rdi when the loop ends, shifted left by the
# trailing-zero count of (u | v).
.cfi_startproc
# %bb.0:
mov rax, rsi                  # rax = v (also the return value if u == 0)
test rdi, rdi
je .LBB7_3                    # u == 0 -> return v
# %bb.1:
test rax, rax
je .LBB7_2                    # v == 0 -> return u
# %bb.4:
# Address of the nimInErrorMode flag: value at fs:[0] plus the offset loaded
# via @GOTTPOFF; a non-zero flag byte makes the function return 0.
mov rcx, qword ptr fs:[0]
add rcx, qword ptr [rip + nimInErrorMode__system_u4340@GOTTPOFF]
cmp byte ptr [rcx], 0
jne .LBB7_5
# %bb.6:
mov rcx, rax
or rcx, rdi                   # rcx = u | v
rep bsf rdx, rcx              # rdx = trailing-zero count of (u | v), kept until the final shift
rep bsf rcx, rdi              # rcx = trailing-zero count of u
# Compiler annotation; the `$rcx` on the next line is the wrapped tail of it.
# kill: def $cl killed $cl killed
$rcx
sar rdi, cl                   # u >>= trailing-zero count of u (arithmetic shift)
.p2align 4, 0x90
# Loop body is branch-free: each pass shifts the trailing zeros out of v, then
# sets rdi = min(u, v) and rax = |v - u| using two conditional moves.
.LBB7_7: # =>This Inner Loop Header: Depth=1
mov rsi, rdi                  # rsi = u (value from the previous pass)
rep bsf rcx, rax              # rcx = trailing-zero count of v
# Compiler annotation; the `$rcx` on the next line is the wrapped tail of it.
# kill: def $cl killed $cl killed
$rcx
sar rax, cl                   # v >>= trailing-zero count of v
mov rcx, rax                  # rcx = v
mov rdi, rax                  # rdi = v (candidate for the new u)
sub rax, rsi
neg rax                       # rax = u - v
sub rcx, rsi                  # rcx = v - u; its flags drive both cmovs below
cmovle rcx, rax               # signed v <= u: rcx = u - v, so rcx = |v - u|
cmovge rdi, rsi               # signed v >= u: rdi = u, so rdi = min(u, v)
mov rax, rcx                  # new v = |v - u|
test rcx, rcx
jne .LBB7_7                   # repeat until the difference is zero
# %bb.8:
mov ecx, edx
shl rdi, cl                   # shift the result left by the count saved in rdx
.LBB7_2:
mov rax, rdi                  # return the value in rdi
.LBB7_3:
ret
.LBB7_5:
xor eax, eax                  # error-mode flag was set: return 0
ret
`gcd_impulse` assembly code from GCC:
gcd_impulse__mratsim95gcd_u77:
# gcd_impulse as emitted by GCC (GAS, Intel operand order: op dst, src).
# Inputs are read from rdi and rsi and the result is returned in rax
# (presumably the SysV AMD64 convention -- confirm against the build target).
# In the comments below: u = rdi, v = rsi on entry.
# Result: v if u == 0; u if v == 0; 0 if the nimInErrorMode flag byte is set;
# otherwise the value left in rax when the loops end, shifted left by the
# trailing-zero count of (u | v).
# Unlike a conditional-move formulation, the ordering of the two operands is
# decided with conditional jumps (jge / jl).
endbr64
mov rax, rsi                  # rax = v (return value if u == 0)
test rdi, rdi
je .L71                       # u == 0 -> return v
mov rax, rdi                  # rax = u (return value if v == 0)
test rsi, rsi
je .L71                       # v == 0 -> return u
mov rax, QWORD PTR nimInErrorMode__system_u4340@gottpoff[rip]  # offset of the flag, used with fs: below
mov rcx, rdi
xor r8d, r8d                  # zero r8 before it is used as the bsf destination
or rcx, rsi                   # rcx = u | v
rep bsf r8, rcx               # r8 = trailing-zero count of (u | v), kept until the final shift
cmp BYTE PTR fs:[rax], 0
jne .L79                      # flag byte non-zero -> return 0
# NOTE(review): every `rep bsf` below is preceded by `xor ecx, ecx`; possibly
# emitted to break a dependency on the old value of the destination register
# -- confirm against GCC/CPU documentation.
xor ecx, ecx
mov rax, rdi                  # work on a copy of u in rax
rep bsf rcx, rdi              # rcx = trailing-zero count of u
sar rax, cl                   # rax = u >> trailing-zero count (arithmetic shift)
# Outer loop: rsi holds the current v; strip its trailing zeros into rdx.
.L75:
xor ecx, ecx
mov rdx, rsi
rep bsf rcx, rsi              # rcx = trailing-zero count of v
sar rdx, cl                   # rdx = v >> trailing-zero count
cmp rdx, rax
jge .L82                      # signed v >= u: no swap needed
# Swap path (v < u): the old v becomes the new u, and u - v becomes the new v
# with its trailing zeros shifted out; repeated while the new v is still < u.
.L73:
sub rax, rdx                  # rax = u - v
xor ecx, ecx
mov rsi, rax                  # rsi = u - v
mov rax, rdx                  # rax = old v (the smaller value)
rep bsf rcx, rsi              # rcx = trailing-zero count of (u - v)
mov rdx, rsi
sar rdx, cl                   # rdx = (u - v) >> trailing-zero count
cmp rdx, rax
jl .L73                       # still smaller than rax: swap again
.L82:
mov rsi, rdx
sub rsi, rax                  # rsi = v - u, the next v
jne .L75                      # non-zero difference: another outer pass
mov ecx, r8d
sal rax, cl                   # shift the result left by the count saved in r8
ret
.p2align 4,,10
.p2align 3
.L71:
ret                           # early exits: rax already holds the return value
.p2align 4,,10
.p2align 3
.L79:
.L72:                         # not referenced anywhere in this listing
xor eax, eax                  # error-mode flag was set: return 0
ret
Run