Issue 164275
Summary [X86] Investigate CTTZ large _BitInt codegen
Labels backend:X86, missed-optimization
Assignees
Reporter RKSimon
    https://clang.godbolt.org/z/6PYdq6YGY
```c
// Reproducer: count the trailing zero bits of a 512-bit unsigned _BitInt.
// The second argument to __builtin_ctzg (512) is the fallback value that is
// returned when `word` is zero.
uint32_t tzcnt512(const unsigned _BitInt(512)& word) {
    return __builtin_ctzg(word, 512);
}
```
Results in a large chain of TZCNT instructions:

```asm
# Compiler output for the scalar reproducer. %rdi points at the 512-bit value,
# which is read as eight 64-bit words q0..q7 at offsets 0..56. The count is
# built as a tree: per-word TZCNT -> 128-bit pairs -> 256-bit halves -> 512-bit
# result, selecting with CMOVNE at every level.
tzcnt512(unsigned _BitInt(512) const&):
 pushq   %r14
        pushq   %rbx
# Load q6, q5, q4, q3, q2, q0, q1 into registers (q7 is read from memory later).
        movq    48(%rdi), %r11
 movq    40(%rdi), %r9
        movq    32(%rdi), %r10
        movq 24(%rdi), %r8
        movq    16(%rdi), %rdx
        movq    (%rdi), %rcx
 movq    8(%rdi), %rsi
# Pair q1:q0 -> %r14d = q0 != 0 ? tzcnt(q0) : tzcnt(q1) + 64.
        tzcntq  %rcx, %rax
        tzcntq %rsi, %r14
        addl    $64, %r14d
        testq   %rcx, %rcx
 cmovnel %eax, %r14d
# Pair q3:q2 -> %ebx = q2 != 0 ? tzcnt(q2) : tzcnt(q3) + 64.
        tzcntq  %rdx, %rax
        tzcntq  %r8, %rbx
 addl    $64, %ebx
        testq   %rdx, %rdx
        cmovnel %eax, %ebx
# Low 256 bits -> %ebx = (q0 | q1) != 0 ? pair(q1:q0) : pair(q3:q2) + 128.
# (subl $-128 adds 128.)
        subl    $-128, %ebx
        movq    %rcx, %rax
        orq %rsi, %rax
        cmovnel %r14d, %ebx
# Pair q5:q4 -> %r14d = q4 != 0 ? tzcnt(q4) : tzcnt(q5) + 64.
        tzcntq  %r10, %rax
 tzcntq  %r9, %r14
        addl    $64, %r14d
        testq   %r10, %r10
 cmovnel %eax, %r14d
# Pair q7:q6 -> %eax = q6 != 0 ? tzcnt(q6) : tzcnt(q7) + 64.
# %rdi is overwritten here; the pointer is no longer needed after this load.
        tzcntq  56(%rdi), %rax
        tzcntq %r11, %rdi
        addl    $64, %eax
        testq   %r11, %r11
 cmovnel %edi, %eax
# High 256 bits -> %eax = (q4 | q5) != 0 ? pair(q5:q4) : pair(q7:q6) + 128,
# then add 256 to rebase the result onto bit 256 of the full value.
        subl    $-128, %eax
        orq     %r9, %r10
 cmovnel %r14d, %eax
        addl    $256, %eax
# Final select: if any of q0..q3 is non-zero, use the low-half count in %ebx.
        orq     %r8, %rsi
        orq     %rdx, %rcx
        orq     %rsi, %rcx
        cmovnel %ebx, %eax
        popq    %rbx
        popq    %r14
 retq
```

This might have potential for optimization (should we scan the QWORDs and perform TZCNT only once, on the lowest non-zero QWORD?).

AVX512 targets could potentially move to:
```c
// Proposed AVX-512 lowering: compute per-QWORD trailing-zero counts in
// parallel, then pick the count belonging to the lowest non-zero QWORD.
uint32_t tzcnt512(const unsigned _BitInt(512)& word) {
    // Reinterpret the 512-bit value as eight 64-bit lanes (lane 0 is taken to
    // be the least significant QWORD, matching the scalar codegen above).
    __m512i vec = __builtin_bit_cast(__m512i, word);
    // Bit i of zz is set when lane i is non-zero.
    __mmask8 zz = _mm512_cmpneq_epi64_mask(vec, _mm512_setzero_si512());
    // Per-lane trailing-zero count, with 64 as the fallback for a zero lane.
    __m512i elttz = __builtin_elementwise_ctzg((__v8di)vec, (__v8di)_mm512_set1_epi64(64));
    // Add each lane's bit offset (lane index * 64) to get a position in the full 512-bit value.
 elttz = _mm512_add_epi64(elttz, _mm512_setr_epi64(0* 64, 1* 64, 2* 64, 3* 64, 4* 64, 5* 64, 6* 64, 7* 64));
    // Compress packs the lanes selected by zz towards lane 0 and fills the rest
    // from the all-512 source vector. Lane 0 therefore holds the count for the
    // lowest non-zero QWORD, or 512 when every lane is zero. Its low 32 bits
    // are the result.
    uint32_t tz = (uint32_t)_mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_mask_compress_epi64(_mm512_set1_epi64(512), zz, elttz)));
    return tz;
}
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to