https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412

Yichao Yu <yyc1992 at gmail dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |yyc1992 at gmail dot com

--- Comment #23 from Yichao Yu <yyc1992 at gmail dot com> ---
> It is GCC does not realign the stack at all that is the issue.

I hit another related issue that might confirm this as well.

I noticed this when I tried to manually align the stack with inline assembly.

C++ code reduced from my test case,

```
#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>

// Three-argument overload of f: one AVX vector, a 32-bit integer and a
// pointer to double. Compiled for AVX and never inlined, so callers must
// emit a real call.
//
// The asm statement XORs the register holding `res` with itself (zeroing it)
// and lists x, a and p as read-write operands. It also declares memory and
// rax, rcx, rdx, r8-r11 and rbp as clobbered.
// NOTE(review): the extra operands/clobbers presumably exist only to keep the
// arguments live and constrain register allocation in the reduced test case.
__attribute__((target("avx")))
__attribute__((noinline)) __m256d f(__m256d x, uint32_t a, const double *p)
{
    __m256d res;
    asm volatile ("vxorpd %0, %0, %0" :
                  "=x"(res), "+x"(x), "+r"(a), "+r"(p) ::
                  "memory", "rax", "rcx", "rdx", "r8", "r9", "r10",
                  "r11", "rbp");
    return res;
}

// Same signature and body as the three-argument f, under a distinct name.
// It is the only callee of test_avx2, so that caller passes a single
// __m256d argument and nothing else.
//
// The asm statement XORs the register holding `res` with itself (zeroing it)
// and lists x, a and p as read-write operands; memory and rax, rcx, rdx,
// r8-r11 and rbp are declared clobbered.
__attribute__((target("avx")))
__attribute__((noinline)) __m256d f2(__m256d x, uint32_t a, const double *p)
{
    __m256d res;
    asm volatile ("vxorpd %0, %0, %0" :
                  "=x"(res), "+x"(x), "+r"(a), "+r"(p) ::
                  "memory", "rax", "rcx", "rdx", "r8", "r9", "r10",
                  "r11", "rbp");
    return res;
}

// Five-argument overload of f: three AVX vectors, a 32-bit integer and a
// pointer to double. Compiled for AVX and never inlined.
//
// The asm statement XORs the register holding `res` with itself (zeroing it)
// and lists x, y, z, a and p as read-write operands; memory and rax, rcx,
// rdx, r8-r11 and rbp are declared clobbered.
__attribute__((target("avx")))
__attribute__((noinline)) __m256d f(__m256d x, __m256d y, __m256d z,
                                    uint32_t a, const double *p)
{
    __m256d res;
    asm volatile ("vxorpd %0, %0, %0" :
                  "=x"(res), "+x"(x), "+x"(y), "+x"(z), "+r"(a), "+r"(p) ::
                  "memory", "rax", "rcx", "rdx", "r8", "r9", "r10",
                  "r11", "rbp");
    return res;
}

// Four-element table whose address is passed as the pointer argument to f/f2.
const double points[] = {0, 0.1, 0.2, 0.6};

// Calls the five-argument f and then the three-argument f, each with
// all-zero vectors, the integer 4 and `points`. Return values are discarded.
__attribute__((target("avx"))) void test_avx()
{
    f(__m256d{0, 0, 0, 0}, __m256d{0, 0, 0, 0},
                           __m256d{0, 0, 0, 0}, 4, points);
    f(__m256d{0, 0, 0, 0}, 4, points);
}

// Calls f2 once with an all-zero vector, the integer 4 and `points`.
// The return value is discarded.
__attribute__((target("avx"))) void test_avx2()
{
    f2(__m256d{0, 0, 0, 0}, 4, points);
}

// Calls `p` with the stack pointer forced to a multiple of 64 at the point of
// the call instruction, then restores the original stack pointer.
//
// NOTE(review): restoring rsp from rbp after the call relies on the callee
// preserving rbp (callee-saved in the common x86-64 ABIs) — confirm for the
// target ABI.
static void call_aligned_stack(void (*p)(void))
{
                  // Save the incoming stack pointer in rbp.
    asm volatile ("movq %%rsp, %%rbp\n"
                  // Round rsp down to a multiple of 64.
                  "andq $-64, %%rsp\n"
                  // Reserve 64 more bytes; rsp stays a multiple of 64.
                  "subq $64, %%rsp\n"
                  // Indirect call through the register holding p.
                  "callq *%0\n"
                  // Restore the stack pointer saved above.
                  "movq %%rbp, %%rsp\n"
                  :: "r"(p)
                  : "memory", "rax", "rcx", "rdx", "r8", "r9", "r10", "r11",
"rbp");
}

// Runs test_avx and then test_avx2, each through call_aligned_stack.
// The marker written to stderr between the two calls shows which of them
// a crash belongs to.
int main()
{
    call_aligned_stack(test_avx);
    fprintf(stderr, "aaaa\n");
    // Flush so the marker is visible even if the next call crashes.
    fflush(stderr);
    call_aligned_stack(test_avx2);
    return 0;
}
```

(The `fprintf` is there only to make it easier to see when the crash happens.)
The stack alignment code makes sure that the stack is aligned to 64 bytes before
making the `call`, which is verified in the debugger. However, when compiled
with GCC 8.2.1 on msys2 (using the mingw-w64-x86_64-gcc package), the `test_avx`
function runs fine while the `test_avx2` function crashes.

Looking at the generated code, for the crashing function:

```
00000000004015c0 <_Z9test_avx2v>:
  4015c0:       48 83 ec 68             sub    $0x68,%rsp
  4015c4:       c5 f9 57 c0             vxorpd %xmm0,%xmm0,%xmm0
  4015c8:       4c 8d 0d 51 7a 00 00    lea    0x7a51(%rip),%r9        # 409020
<_ZL6points>
  4015cf:       41 b8 04 00 00 00       mov    $0x4,%r8d
  4015d5:       48 8d 4c 24 40          lea    0x40(%rsp),%rcx
  4015da:       48 8d 54 24 20          lea    0x20(%rsp),%rdx
  4015df:       c5 fd 29 44 24 20       vmovapd %ymm0,0x20(%rsp)
  4015e5:       c5 f8 77                vzeroupper 
  4015e8:       e8 a3 ff ff ff          callq  401590 <_Z2f2Dv4_djPKd>
  4015ed:       90                      nop
  4015ee:       48 83 c4 68             add    $0x68,%rsp
  4015f2:       c3                      retq   
```

which tries to do a write that requires 32-byte alignment (the `vmovapd`) at a
stack offset, relative to the stack pointer at the initial call instruction, of
-8 - 0x68 + 0x20 = -80, which is not a multiple of 32.

OTOH, for the "good" function,

```
0000000000401640 <_Z8test_avxv>:
  401640:       57                      push   %rdi
  401641:       56                      push   %rsi
  401642:       53                      push   %rbx
  401643:       48 81 ec b0 00 00 00    sub    $0xb0,%rsp
  40164a:       c5 d9 57 e4             vxorpd %xmm4,%xmm4,%xmm4
  40164e:       48 8d 3d cb 79 00 00    lea    0x79cb(%rip),%rdi        #
409020 <_ZL6points>
  401655:       48 8d 74 24 70          lea    0x70(%rsp),%rsi
  40165a:       4c 8d 4c 24 30          lea    0x30(%rsp),%r9
  40165f:       48 89 7c 24 28          mov    %rdi,0x28(%rsp)
  401664:       48 8d 9c 24 90 00 00    lea    0x90(%rsp),%rbx
  40166b:       00 
  40166c:       4c 8d 44 24 50          lea    0x50(%rsp),%r8
  401671:       48 89 f2                mov    %rsi,%rdx
  401674:       c5 fd 29 64 24 70       vmovapd %ymm4,0x70(%rsp)
  40167a:       48 89 d9                mov    %rbx,%rcx
  40167d:       c5 fd 29 64 24 50       vmovapd %ymm4,0x50(%rsp)
  401683:       c5 fd 29 64 24 30       vmovapd %ymm4,0x30(%rsp)
  401689:       c7 44 24 20 04 00 00    movl   $0x4,0x20(%rsp)
  401690:       00 
  401691:       c5 f8 77                vzeroupper 
  401694:       e8 67 ff ff ff          callq  401600 <_Z1fDv4_dS_S_jPKd>
  401699:       c5 d9 57 e4             vxorpd %xmm4,%xmm4,%xmm4
  40169d:       49 89 f9                mov    %rdi,%r9
  4016a0:       48 89 f2                mov    %rsi,%rdx
  4016a3:       41 b8 04 00 00 00       mov    $0x4,%r8d
  4016a9:       48 89 d9                mov    %rbx,%rcx
  4016ac:       c5 fd 29 64 24 70       vmovapd %ymm4,0x70(%rsp)
  4016b2:       c5 f8 77                vzeroupper 
  4016b5:       e8 a6 fe ff ff          callq  401560 <_Z1fDv4_djPKd>
  4016ba:       90                      nop
  4016bb:       48 81 c4 b0 00 00 00    add    $0xb0,%rsp
  4016c2:       5b                      pop    %rbx
  4016c3:       5e                      pop    %rsi
  4016c4:       5f                      pop    %rdi
  4016c5:       c3                      retq   
```

The stack offset for the 32-byte-aligned access is -8 - 8 * 3 - 0xb0 + 0x70 =
-96, which differs from the previous one by 16, so it seems that GCC isn't
even consistent with itself about what stack alignment it expects.

Reply via email to