https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
Yichao Yu <yyc1992 at gmail dot com> changed: What |Removed |Added ---------------------------------------------------------------------------- CC| |yyc1992 at gmail dot com --- Comment #23 from Yichao Yu <yyc1992 at gmail dot com> --- > It is GCC does not realign the stack at all that is the issue. I hit another related issue that might confirm this as well. I noticed this when I tried to manually align the stack with inline assembly. C++ code reduced from my test case, ``` #include <immintrin.h> #include <stdio.h> #include <stdint.h> __attribute__((target("avx"))) __attribute__((noinline)) __m256d f(__m256d x, uint32_t a, const double *p) { __m256d res; asm volatile ("vxorpd %0, %0, %0" : "=x"(res), "+x"(x), "+r"(a), "+r"(p) :: "memory", "rax", "rcx", "rdx", "r8", "r9", "r10", "r11", "rbp"); return res; } __attribute__((target("avx"))) __attribute__((noinline)) __m256d f2(__m256d x, uint32_t a, const double *p) { __m256d res; asm volatile ("vxorpd %0, %0, %0" : "=x"(res), "+x"(x), "+r"(a), "+r"(p) :: "memory", "rax", "rcx", "rdx", "r8", "r9", "r10", "r11", "rbp"); return res; } __attribute__((target("avx"))) __attribute__((noinline)) __m256d f(__m256d x, __m256d y, __m256d z, uint32_t a, const double *p) { __m256d res; asm volatile ("vxorpd %0, %0, %0" : "=x"(res), "+x"(x), "+x"(y), "+x"(z), "+r"(a), "+r"(p) :: "memory", "rax", "rcx", "rdx", "r8", "r9", "r10", "r11", "rbp"); return res; } const double points[] = {0, 0.1, 0.2, 0.6}; __attribute__((target("avx"))) void test_avx() { f(__m256d{0, 0, 0, 0}, __m256d{0, 0, 0, 0}, __m256d{0, 0, 0, 0}, 4, points); f(__m256d{0, 0, 0, 0}, 4, points); } __attribute__((target("avx"))) void test_avx2() { f2(__m256d{0, 0, 0, 0}, 4, points); } static void call_aligned_stack(void (*p)(void)) { asm volatile ("movq %%rsp, %%rbp\n" "andq $-64, %%rsp\n" "subq $64, %%rsp\n" "callq *%0\n" "movq %%rbp, %%rsp\n" :: "r"(p) : "memory", "rax", "rcx", "rdx", "r8", "r9", "r10", "r11", "rbp"); } int main() { call_aligned_stack(test_avx); 
fprintf(stderr, "aaaa\n"); fflush(stderr); call_aligned_stack(test_avx2); return 0; } ``` (The `fprintf` is there only to make it easier to see when the crash happens.) The stack alignment code makes sure that the stack is aligned to 64 bytes before making the `call`, which is verified in the debugger. However, when compiled with GCC 8.2.1 on msys2 (using the mingw-w64-x86_64-gcc package) the `test_avx` function is happy while the `test_avx2` function is not. Looking at the generated code, for the crashing function: ``` 00000000004015c0 <_Z9test_avx2v>: 4015c0: 48 83 ec 68 sub $0x68,%rsp 4015c4: c5 f9 57 c0 vxorpd %xmm0,%xmm0,%xmm0 4015c8: 4c 8d 0d 51 7a 00 00 lea 0x7a51(%rip),%r9 # 409020 <_ZL6points> 4015cf: 41 b8 04 00 00 00 mov $0x4,%r8d 4015d5: 48 8d 4c 24 40 lea 0x40(%rsp),%rcx 4015da: 48 8d 54 24 20 lea 0x20(%rsp),%rdx 4015df: c5 fd 29 44 24 20 vmovapd %ymm0,0x20(%rsp) 4015e5: c5 f8 77 vzeroupper 4015e8: e8 a3 ff ff ff callq 401590 <_Z2f2Dv4_djPKd> 4015ed: 90 nop 4015ee: 48 83 c4 68 add $0x68,%rsp 4015f2: c3 retq ``` which tries to write with 32-byte alignment with a stack offset from the initial call instruction: -8 - 0x68 + 0x20 = -80. 
OTOH, for the "good" function, ``` 0000000000401640 <_Z8test_avxv>: 401640: 57 push %rdi 401641: 56 push %rsi 401642: 53 push %rbx 401643: 48 81 ec b0 00 00 00 sub $0xb0,%rsp 40164a: c5 d9 57 e4 vxorpd %xmm4,%xmm4,%xmm4 40164e: 48 8d 3d cb 79 00 00 lea 0x79cb(%rip),%rdi # 409020 <_ZL6points> 401655: 48 8d 74 24 70 lea 0x70(%rsp),%rsi 40165a: 4c 8d 4c 24 30 lea 0x30(%rsp),%r9 40165f: 48 89 7c 24 28 mov %rdi,0x28(%rsp) 401664: 48 8d 9c 24 90 00 00 lea 0x90(%rsp),%rbx 40166b: 00 40166c: 4c 8d 44 24 50 lea 0x50(%rsp),%r8 401671: 48 89 f2 mov %rsi,%rdx 401674: c5 fd 29 64 24 70 vmovapd %ymm4,0x70(%rsp) 40167a: 48 89 d9 mov %rbx,%rcx 40167d: c5 fd 29 64 24 50 vmovapd %ymm4,0x50(%rsp) 401683: c5 fd 29 64 24 30 vmovapd %ymm4,0x30(%rsp) 401689: c7 44 24 20 04 00 00 movl $0x4,0x20(%rsp) 401690: 00 401691: c5 f8 77 vzeroupper 401694: e8 67 ff ff ff callq 401600 <_Z1fDv4_dS_S_jPKd> 401699: c5 d9 57 e4 vxorpd %xmm4,%xmm4,%xmm4 40169d: 49 89 f9 mov %rdi,%r9 4016a0: 48 89 f2 mov %rsi,%rdx 4016a3: 41 b8 04 00 00 00 mov $0x4,%r8d 4016a9: 48 89 d9 mov %rbx,%rcx 4016ac: c5 fd 29 64 24 70 vmovapd %ymm4,0x70(%rsp) 4016b2: c5 f8 77 vzeroupper 4016b5: e8 a6 fe ff ff callq 401560 <_Z1fDv4_djPKd> 4016ba: 90 nop 4016bb: 48 81 c4 b0 00 00 00 add $0xb0,%rsp 4016c2: 5b pop %rbx 4016c3: 5e pop %rsi 4016c4: 5f pop %rdi 4016c5: c3 retq ``` The stack offset for the 32-byte-aligned access is -8 - 8 * 3 - 0xb0 + 0x70 = -96, which differs from the previous one by 16, so it seems that GCC isn't even consistent with itself on what stack alignment it expects.