| Issue |
164275
|
| Summary |
[X86] Investigate CTTZ large _BitInt codegen
|
| Labels |
backend:X86,
missed-optimization
|
| Assignees |
|
| Reporter |
RKSimon
|
https://clang.godbolt.org/z/6PYdq6YGY
```cpp
// Reproducer: trailing-zero count of a 512-bit unsigned _BitInt passed by reference.
// The second argument to __builtin_ctzg (512) is presumably the value returned
// when `word` is zero -- confirm against the Clang builtin documentation.
uint32_t tzcnt512(const unsigned _BitInt(512)& word) {
return __builtin_ctzg(word, 512);
}
```
Results in a large chain of TZCNT instructions:
```asm
# Compiler output for the reproducer. "word N" below means the 64-bit value at
# offset 8*N from %rdi. The code issues one TZCNT per word (eight in total) and
# merges the results pairwise with CMOV: words 0-1, 2-3, 4-5, 6-7, then the two
# 4-word halves, then the final 8-word result in %eax.
tzcnt512(unsigned _BitInt(512) const&):
pushq %r14
pushq %rbx
# Load words 0-6 into registers; word 7 is read straight from memory further down.
movq 48(%rdi), %r11
movq 40(%rdi), %r9
movq 32(%rdi), %r10
movq 24(%rdi), %r8
movq 16(%rdi), %rdx
movq (%rdi), %rcx
movq 8(%rdi), %rsi
# Words 0-1: %r14d = tzcnt(word0) if word0 != 0, otherwise 64 + tzcnt(word1).
tzcntq %rcx, %rax
tzcntq %rsi, %r14
addl $64, %r14d
testq %rcx, %rcx
cmovnel %eax, %r14d
# Words 2-3: same pattern, result in %ebx.
tzcntq %rdx, %rax
tzcntq %r8, %rbx
addl $64, %ebx
testq %rdx, %rdx
cmovnel %eax, %ebx
# Subtracting -128 adds 128: offset the words 2-3 count past words 0-1.
subl $-128, %ebx
# If (word0 | word1) != 0 the words 0-1 count wins; %ebx now covers words 0-3.
movq %rcx, %rax
orq %rsi, %rax
cmovnel %r14d, %ebx
# Words 4-5: %r14d = tzcnt(word4) if word4 != 0, otherwise 64 + tzcnt(word5).
tzcntq %r10, %rax
tzcntq %r9, %r14
addl $64, %r14d
testq %r10, %r10
cmovnel %eax, %r14d
# Words 6-7: %eax = tzcnt(word6) if word6 != 0, otherwise 64 + tzcnt(word7).
# %rdi is overwritten here; the pointer is not needed after this load.
tzcntq 56(%rdi), %rax
tzcntq %r11, %rdi
addl $64, %eax
testq %r11, %r11
cmovnel %edi, %eax
# Offset the words 6-7 count by 128, then prefer the words 4-5 count when
# (word4 | word5) != 0; %eax now covers words 4-7.
subl $-128, %eax
orq %r9, %r10
cmovnel %r14d, %eax
# Offset the upper-half count by 256, then prefer the lower-half count (%ebx)
# when any of words 0-3 is non-zero.
addl $256, %eax
orq %r8, %rsi
orq %rdx, %rcx
orq %rsi, %rcx
cmovnel %ebx, %eax
popq %rbx
popq %r14
retq
```
This might have potential for optimization: should we scan the QWORDs and perform TZCNT only once, on the lowest non-zero QWORD?
AVX512 targets could potentially move to:
```cpp
// Proposed AVX512 form: treat the 512-bit value as eight 64-bit lanes, compute
// a per-lane trailing-zero count biased by the lane's bit offset, and select
// the count belonging to the lowest non-zero lane (512 if every lane is zero).
uint32_t tzcnt512(const unsigned _BitInt(512)& word) {
// Reinterpret the 512-bit integer as a 512-bit vector.
__m512i vec = __builtin_bit_cast(__m512i, word);
// Mask of lanes that compare not-equal to zero (one mask bit per 64-bit lane).
__mmask8 zz = _mm512_cmpneq_epi64_mask(vec, _mm512_setzero_si512());
// Per-lane trailing-zero count; the second operand (64 in every lane) is
// presumably the fallback result for a zero lane -- confirm against the Clang docs.
__m512i elttz = __builtin_elementwise_ctzg((__v8di)vec, (__v8di)_mm512_set1_epi64(64));
// Add each lane's starting bit position (lane i contributes i * 64).
elttz = _mm512_add_epi64(elttz, _mm512_setr_epi64(0* 64, 1* 64, 2* 64, 3* 64, 4* 64, 5* 64, 6* 64, 7* 64));
// Compress packs the lanes selected by zz toward lane 0 and fills the rest from
// the all-512 vector, so lane 0 holds the lowest non-zero lane's count, or 512
// when zz is empty (per the Intel intrinsic description -- verify). The low
// 32 bits of lane 0 are then extracted as the result.
uint32_t tz = (uint32_t)_mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_mask_compress_epi64(_mm512_set1_epi64(512), zz, elttz)));
return tz;
}
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs