--- failure.c ---
/*
 * Reproducer, part 1: count the leading zero bits of `argument`.
 *
 * Thin wrapper that forwards to GCC's __builtin_clzll(). Per the GCC
 * documentation, the builtin's result is undefined when the argument is 0;
 * this wrapper adds no check for that case.
 */
int _clz(unsigned long long argument) {
    return __builtin_clzll(argument);
}

/*
 * Reproducer, part 2: count the trailing zero bits of `argument`.
 *
 * Thin wrapper that forwards to GCC's __builtin_ctzll(). Per the GCC
 * documentation, the builtin's result is undefined when the argument is 0;
 * this wrapper adds no check for that case.
 */
int _ctz(unsigned long long argument) {
    return __builtin_ctzll(argument);
}
--- EOF ---

GCC 13.1    -m32 -mabm -mbmi -mlzcnt -O3 failure.c

<https://godbolt.org/z/MMf11hKch>
# _clz, LZCNT variant: the 64-bit argument is read from the stack as two
# dwords. If the high dword is non-zero the result is its leading-zero
# count; otherwise it is 32 plus the leading-zero count of the low dword.
# Branch-free: both counts are computed and CMOVNE selects one.
_clz(unsigned long long):
        mov     edx, DWORD PTR [esp+8]          # edx = high dword of the argument
        xor     ecx, ecx                        # zero both LZCNT destinations up front
        xor     eax, eax                        #   (presumably to avoid a false dependency on their old values)
        lzcnt   eax, DWORD PTR [esp+4]          # eax = leading zeros of the low dword
        add     eax, 32                         # ... plus 32 for an all-zero high dword
        lzcnt   ecx, edx                        # ecx = leading zeros of the high dword
        test    edx, edx                        # is the high dword non-zero?
        cmovne  eax, ecx                        # if so, its count alone is the result
        ret
# _ctz: no inline bit-scan here. The 64-bit argument is copied into a fresh
# outgoing-argument slot and the count is obtained by calling __ctzdi2.
_ctz(unsigned long long):
        sub     esp, 20                         # reserve 20 bytes (presumably padding so esp is 16-byte aligned at the call)
        push    DWORD PTR [esp+28]              # copy high dword (was [esp+8] on entry)
        push    DWORD PTR [esp+28]              # copy low dword (was [esp+4] on entry; esp moved by the push above)
        call    __ctzdi2                        # eax is not touched afterwards, so the helper's result is returned as-is
        add     esp, 28                         # release the 20 reserved bytes plus the 8 bytes pushed
        ret

OUCH: although TZCNT is EXPLICITLY enabled via -mabm (for AMD processors) and
      -mbmi (for Intel processors), GCC generates slow code that calls
      __ctzdi2() instead of using the TZCNT instruction, which has been
      available for 10 (in words: TEN) years.


GCC 13.1    -m32 -march=i386 -O3 failure.c

<https://godbolt.org/z/16ezfaexb>
# _clz, BSR variant: branches on whether the high dword of the 64-bit
# argument is zero. BSR yields the index (0..31) of the highest set bit;
# XOR with 31 turns that index into a leading-zero count (31 - index).
_clz(unsigned long long):
        mov     edx, DWORD PTR [esp+4]          # edx = low dword (only used on the .L2 path)
        mov     eax, DWORD PTR [esp+8]          # eax = high dword
        test    eax, eax                        # is the high dword zero?
        je      .L2                             # yes: the answer comes from the low dword
        bsr     eax, eax                        # index of the highest set bit in the high dword
        xor     eax, 31                         # index -> leading-zero count
        ret
.L2:
        bsr     eax, edx                        # index of the highest set bit in the low dword
        xor     eax, 31                         # index -> leading-zero count within the low dword
        lea     eax, [eax+32]                   # add 32 for the all-zero high dword
        ret
# _ctz for -march=i386: same shape as a plain library call. The 64-bit
# argument is copied into a fresh outgoing-argument slot and __ctzdi2 is
# called; no BSF instruction is emitted.
_ctz(unsigned long long):
        sub     esp, 20                         # reserve 20 bytes (presumably padding so esp is 16-byte aligned at the call)
        push    DWORD PTR [esp+28]              # copy high dword (was [esp+8] on entry)
        push    DWORD PTR [esp+28]              # copy low dword (was [esp+4] on entry; esp moved by the push above)
        call    __ctzdi2                        # eax is not touched afterwards, so the helper's result is returned as-is
        add     esp, 28                         # release the 20 reserved bytes plus the 8 bytes pushed
        ret

OUCH²: the BSF/BSR instructions were introduced 38 (in words: THIRTY-EIGHT)
       years ago with the i386 processor, but GCC fails to know/use BSF --
       a real shame!

OUCH³: an optimising compiler would of course generate "JMP __ctzdi2" instead
       of code fiddling with the stack!

Stefan Kanthak

Reply via email to