Issue 149298
Summary clang missed vectorization optimization
Labels clang
Assignees
Reporter rockeet
    ```c++
#include <string.h>
#include <utility>
// Number of elements in array `a`: total size divided by the size of one element.
// NOTE(review): the expansion is not parenthesized, so it is only safe in simple
// contexts such as `i < extent(arr)`.
#define extent(a) sizeof(a)/sizeof(a[0])
// Test payload: an array of 12 ints in a struct declared with 16-byte alignment.
struct alignas(16) B {
    int a[12];
};
// Swap *x and *y through a local temporary using three whole-struct memcpy calls.
// NOTE(review): if x == y, the second memcpy has identical source and destination.
void B_swap1(B* x, B* y) {
    B t;
    memcpy(&t,  x, sizeof(B));  // t  = *x
    memcpy( x,  y, sizeof(B));  // *x = *y
    memcpy( y, &t, sizeof(B));  // *y = t (the original *x)
}
// Swap *x and *y one element at a time: std::swap on each pair x->a[i], y->a[i].
void B_swap2(B* x, B* y) {
    // extent(x->a) is the element count of the array member; note that i is an
    // int while the sizeof-based bound is unsigned.
    for (int i = 0; i < extent(x->a); i++) {
 std::swap(x->a[i], y->a[i]);
    }
}
```
### clang generates (-O3)
```nasm
; clang -O3 output for B_swap1 (Intel syntax).
; rdi and rsi hold the two pointer arguments (presumably System V AMD64: rdi = x, rsi = y).
; All transfers are aligned 16-byte movaps; the block at rdi is copied to a
; temporary below rsp and reloaded later.
B_swap1(B*, B*):
        ; Load the 3 x 16-byte block at rdi.
        movaps  xmm0, xmmword ptr [rdi]
 movaps  xmm1, xmmword ptr [rdi + 16]
        movaps  xmm2, xmmword ptr [rdi + 32]
        ; Spill it to [rsp - 56 .. rsp - 9]; rsp itself is never adjusted
        ; (presumably relying on the System V red zone).
        movaps  xmmword ptr [rsp - 24], xmm2
        movaps xmmword ptr [rsp - 40], xmm1
        movaps  xmmword ptr [rsp - 56], xmm0
        ; Copy the block at rsi over the block at rdi.
 movaps  xmm0, xmmword ptr [rsi]
        movaps  xmm1, xmmword ptr [rsi + 16]
        movaps  xmm2, xmmword ptr [rsi + 32]
        movaps  xmmword ptr [rdi + 32], xmm2
        movaps  xmmword ptr [rdi + 16], xmm1
 movaps  xmmword ptr [rdi], xmm0
        ; Reload the saved copy from the stack and store it to the block at rsi.
        movaps  xmm0, xmmword ptr [rsp - 56]
        movaps  xmm1, xmmword ptr [rsp - 40]
        movaps  xmm2, xmmword ptr [rsp - 24]
        movaps  xmmword ptr [rsi + 32], xmm2
 movaps  xmmword ptr [rsi + 16], xmm1
        movaps  xmmword ptr [rsi], xmm0
        ret

; clang -O3 output for B_swap2 (Intel syntax): fully unrolled but NOT vectorized.
; Twelve groups of four scalar moves, one group per 4-byte element at offsets
; 0, 4, ..., 44. Each group loads a dword from [rdi + k] into eax and from
; [rsi + k] into ecx, then stores them back crosswise.
; The listing as posted ends without a ret.
B_swap2(B*, B*):
        ; offset 0
        mov     eax, dword ptr [rdi]
 mov     ecx, dword ptr [rsi]
        mov     dword ptr [rdi], ecx
 mov     dword ptr [rsi], eax
        ; offset 4
        mov     eax, dword ptr [rdi + 4]
 mov     ecx, dword ptr [rsi + 4]
        mov     dword ptr [rdi + 4], ecx
        mov     dword ptr [rsi + 4], eax
        ; offset 8
        mov     eax, dword ptr [rdi + 8]
        mov     ecx, dword ptr [rsi + 8]
        mov     dword ptr [rdi + 8], ecx
        mov     dword ptr [rsi + 8], eax
        ; offset 12
        mov eax, dword ptr [rdi + 12]
        mov     ecx, dword ptr [rsi + 12]
 mov     dword ptr [rdi + 12], ecx
        mov     dword ptr [rsi + 12], eax
        ; offset 16
        mov     eax, dword ptr [rdi + 16]
        mov     ecx, dword ptr [rsi + 16]
        mov     dword ptr [rdi + 16], ecx
        mov dword ptr [rsi + 16], eax
        ; offset 20
        mov     eax, dword ptr [rdi + 20]
 mov     ecx, dword ptr [rsi + 20]
        mov     dword ptr [rdi + 20], ecx
        mov     dword ptr [rsi + 20], eax
        ; offset 24
        mov     eax, dword ptr [rdi + 24]
        mov     ecx, dword ptr [rsi + 24]
        mov dword ptr [rdi + 24], ecx
        mov     dword ptr [rsi + 24], eax
        ; offset 28
 mov     eax, dword ptr [rdi + 28]
        mov     ecx, dword ptr [rsi + 28]
        mov     dword ptr [rdi + 28], ecx
        mov     dword ptr [rsi + 28], eax
        ; offset 32
        mov     eax, dword ptr [rdi + 32]
        mov ecx, dword ptr [rsi + 32]
        mov     dword ptr [rdi + 32], ecx
 mov     dword ptr [rsi + 32], eax
        ; offset 36
        mov     eax, dword ptr [rdi + 36]
        mov     ecx, dword ptr [rsi + 36]
        mov     dword ptr [rdi + 36], ecx
        mov     dword ptr [rsi + 36], eax
        ; offset 40
        mov eax, dword ptr [rdi + 40]
        mov     ecx, dword ptr [rsi + 40]
 mov     dword ptr [rdi + 40], ecx
        mov     dword ptr [rsi + 40], eax
        ; offset 44
        mov     eax, dword ptr [rdi + 44]
        mov     ecx, dword ptr [rsi + 44]
        mov     dword ptr [rdi + 44], ecx
        mov dword ptr [rsi + 44], eax
```

### g++ generates (-O3)
```nasm
; g++ -O3 output for B_swap1 (Intel syntax).
; The swap stays entirely in registers: xmm2/xmm1/xmm0 hold the original block
; at rdi, while xmm3 shuttles each 16-byte chunk from rsi to rdi.
; Every access uses the unaligned forms (movdqu / movups).
B_swap1(B*, B*):
        movdqu  xmm3, XMMWORD PTR [rsi]
        ; Keep the whole original block at rdi in xmm2, xmm1, xmm0.
 movdqu  xmm2, XMMWORD PTR [rdi]
        movdqu  xmm1, XMMWORD PTR [rdi+16]
        movdqu  xmm0, XMMWORD PTR [rdi+32]
        ; Copy the block at rsi over the block at rdi, 16 bytes at a time via xmm3.
        movups  XMMWORD PTR [rdi], xmm3
        movdqu  xmm3, XMMWORD PTR [rsi+16]
        movups XMMWORD PTR [rdi+16], xmm3
        movdqu  xmm3, XMMWORD PTR [rsi+32]
 movups  XMMWORD PTR [rdi+32], xmm3
        ; Store the saved original rdi block to rsi.
        movups  XMMWORD PTR [rsi], xmm2
        movups  XMMWORD PTR [rsi+16], xmm1
        movups  XMMWORD PTR [rsi+32], xmm0
        ret
; g++ -O3 output for B_swap2 (Intel syntax): the element loop is vectorized.
; Three 16-byte chunks (offsets 0, 16, 32) are exchanged with aligned
; movdqa loads and movaps stores; xmm0 carries data from rdi to rsi and
; xmm1 carries data from rsi to rdi.
B_swap2(B*, B*):
        movdqa  xmm0, XMMWORD PTR [rdi]
        movdqa  xmm1, XMMWORD PTR [rsi]
        movaps  XMMWORD PTR [rdi], xmm1
        movdqa  xmm1, XMMWORD PTR [rsi+16]
        movaps XMMWORD PTR [rsi], xmm0
        movdqa  xmm0, XMMWORD PTR [rdi+16]
 movaps  XMMWORD PTR [rdi+16], xmm1
        movdqa  xmm1, XMMWORD PTR [rsi+32]
        movaps  XMMWORD PTR [rsi+16], xmm0
        movdqa  xmm0, XMMWORD PTR [rdi+32]
        movaps  XMMWORD PTR [rdi+32], xmm1
 movaps  XMMWORD PTR [rsi+32], xmm0
        ret
```
Although g++ does not take advantage of the alignas for the memcpy calls in B_swap1 (it emits unaligned moves there), its output is still far better than clang's.
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to