Bug#939559: x86_64-w64-mingw32-g++-win32: program compiled with -march=native crashes on same CPU
Hi Bernhard, Stephen, Thanks for the links to the upstream bug reports. Seems a long-standing problem. On 08/09/2019 00:00, Bernhard Übelacker wrote: Not considering any side effects and maybe other instructions with alignment requirements. I used the attached patch which is a bit more brutal (replaces more vmov variants, just in case) and now my real program is crash free. Inspired by https://stackoverflow.com/questions/5983389/how-to-align-stack-at-32-byte-boundary-in-gcc#answer-6025561 . Maybe at least better than disabling AVX2 entirely? Yes. Thanks again, Claude diff -ru src.old/gcc/config/i386/i386.md src.new/gcc/config/i386/i386.md --- src.old/gcc/config/i386/i386.md 2019-01-08 09:56:36.0 + +++ src.new/gcc/config/i386/i386.md 2019-09-08 09:01:02.558639937 +0100 @@ -2055,7 +2055,7 @@ || misaligned_operand (operands[1], XImode)) return "vmovdqu32\t{%1, %0|%0, %1}"; else - return "vmovdqa32\t{%1, %0|%0, %1}"; + return "vmovdqu32\t{%1, %0|%0, %1}"; default: gcc_unreachable (); @@ -2091,11 +2091,11 @@ else { if (get_attr_mode (insn) == MODE_V8SF) - return "vmovaps\t{%1, %0|%0, %1}"; + return "vmovups\t{%1, %0|%0, %1}"; else if (get_attr_mode (insn) == MODE_XI) - return "vmovdqa32\t{%1, %0|%0, %1}"; + return "vmovdqu32\t{%1, %0|%0, %1}"; else - return "vmovdqa\t{%1, %0|%0, %1}"; + return "vmovdqu\t{%1, %0|%0, %1}"; } default: @@ -2153,11 +2153,11 @@ else { if (get_attr_mode (insn) == MODE_V4SF) - return "%vmovaps\t{%1, %0|%0, %1}"; + return "%vmovups\t{%1, %0|%0, %1}"; else if (get_attr_mode (insn) == MODE_XI) - return "vmovdqa32\t{%1, %0|%0, %1}"; + return "vmovdqu32\t{%1, %0|%0, %1}"; else - return "%vmovdqa\t{%1, %0|%0, %1}"; + return "%vmovdqu\t{%1, %0|%0, %1}"; } default: @@ -2263,14 +2263,14 @@ /* Handle AVX512 registers set. 
*/ if (EXT_REX_SSE_REG_P (operands[0]) || EXT_REX_SSE_REG_P (operands[1])) - return "vmovdqa64\t{%1, %0|%0, %1}"; - return "%vmovdqa\t{%1, %0|%0, %1}"; + return "vmovdqu64\t{%1, %0|%0, %1}"; + return "%vmovdqu\t{%1, %0|%0, %1}"; case MODE_V2SF: gcc_assert (!TARGET_AVX); return "movlps\t{%1, %0|%0, %1}"; case MODE_V4SF: - return "%vmovaps\t{%1, %0|%0, %1}"; + return "%vmovups\t{%1, %0|%0, %1}"; default: gcc_unreachable (); @@ -2471,12 +2471,12 @@ case MODE_SI: return "%vmovd\t{%1, %0|%0, %1}"; case MODE_TI: - return "%vmovdqa\t{%1, %0|%0, %1}"; + return "%vmovdqu\t{%1, %0|%0, %1}"; case MODE_XI: - return "vmovdqa32\t{%g1, %g0|%g0, %g1}"; + return "vmovdqu32\t{%g1, %g0|%g0, %g1}"; case MODE_V4SF: - return "%vmovaps\t{%1, %0|%0, %1}"; + return "%vmovups\t{%1, %0|%0, %1}"; case MODE_SF: gcc_assert (!TARGET_AVX); @@ -3351,13 +3351,13 @@ else { if (get_attr_mode (insn) == MODE_V4SF) - return "%vmovaps\t{%1, %0|%0, %1}"; + return "%vmovups\t{%1, %0|%0, %1}"; else if (TARGET_AVX512VL && (EXT_REX_SSE_REG_P (operands[0]) || EXT_REX_SSE_REG_P (operands[1]))) - return "vmovdqa64\t{%1, %0|%0, %1}"; + return "vmovdqu64\t{%1, %0|%0, %1}"; else - return "%vmovdqa\t{%1, %0|%0, %1}"; + return "%vmovdqu\t{%1, %0|%0, %1}"; } case TYPE_MULTI: @@ -3519,11 +3519,11 @@ return "%vmovsd\t{%1, %0|%0, %1}"; case MODE_V4SF: - return "%vmovaps\t{%1, %0|%0, %1}"; + return "%vmovups\t{%1, %0|%0, %1}"; case MODE_V8DF: - return "vmovapd\t{%g1, %g0|%g0, %g1}"; + return "vmovupd\t{%g1, %g0|%g0, %g1}"; case MODE_V2DF: - return "%vmovapd\t{%1, %0|%0, %1}"; + return "%vmovupd\t{%1, %0|%0, %1}"; case MODE_V2SF: gcc_assert (!TARGET_AVX); @@ -3713,9 +3713,9 @@ return "%vmovss\t{%1, %0|%0, %1}"; case MODE_V16SF: - return "vmovaps\t{%g1, %g0|%g0, %g1}"; + return "vmovups\t{%g1, %g0|%g0, %g1}"; case MODE_V4SF: - return "%vmovaps\t{%1, %0|%0, %1}"; + return "%vmovups\t{%1, %0|%0, %1}"; case MODE_SI: return "%vmovd\t{%1, %0|%0, %1}"; diff -ru src.old/gcc/config/i386/mmx.md src.new/gcc/config/i386/mmx.md --- 
src.old/gcc/config/i386/mmx.md 2018-01-03 10:03:58.0 + +++ src.new/gcc/config/i386/mmx.md 2019-09-08 09:00:47.482620221 +0100 @@ -124,16 +124,16 @@ return "%vmovd\t{%1, %0|%0, %1}"; return "%vmovq\t{%1, %0|%0, %1}"; case MODE_TI: - return "%vmovdqa\t{%1, %0|%0, %1}"; + return "%vmovdqu\t{%1, %0|%0, %1}"; case MODE_XI: - return "vmovdqa64\t{%g1, %g0|%g0, %g1}"; + return "vmovdqu64\t{%g1, %g0|%g0, %g1}"; case MODE_V2SF: if (TARGET_AVX && REG_P (operands[0])) return "vmovlps\t{%1, %0, %0|%0, %0, %1}"; return "%vmovlps\t{%1, %0|%0, %1}"; case MODE_V4SF: - return "%vmovaps\t{%1, %0|%0, %1}"; + return "%vmovups\t{%1, %0|%0, %1}"; default: gcc_unreachable (); diff -ru src.old/gcc/config/i386/sse.md src.new/gcc/config/i386/sse.md --- src.old/gcc/config/i386/sse.md 2019-06-20
Bug#939559: x86_64-w64-mingw32-g++-win32: program compiled with -march=native crashes on same CPU
Hello Stephen, hello Claude, following up on the earlier idea of simply replacing the aligned instruction with the unaligned one, I created the hacky patch below, which just replaces vmovapd with vmovupd. It does not consider any side effects, nor other instructions that may have alignment requirements. At least a mingw-w64 built with it produces an executable that does not crash in that situation. (Currently just meant as a proof of concept.) Maybe at least better than disabling AVX2 entirely? Kind regards, Bernhard --- a/src/gcc/config/i386/i386.md 2019-09-07 15:04:12.391156632 +0200 +++ b/src/gcc/config/i386/i386.md 2019-09-07 15:07:17.316822891 +0200 @@ -3521,9 +3521,9 @@ case MODE_V4SF: return "%vmovaps\t{%1, %0|%0, %1}"; case MODE_V8DF: - return "vmovapd\t{%g1, %g0|%g0, %g1}"; + return "vmovupd\t{%g1, %g0|%g0, %g1}"; case MODE_V2DF: - return "%vmovapd\t{%1, %0|%0, %1}"; + return "%vmovupd\t{%1, %0|%0, %1}"; case MODE_V2SF: gcc_assert (!TARGET_AVX); --- a/src/gcc/config/i386/sse.md 2019-09-07 15:04:12.399156531 +0200 +++ b/src/gcc/config/i386/sse.md 2019-09-07 15:07:09.420922575 +0200 @@ -1004,7 +1004,7 @@ return "vmovaps\t{%g1, %g0|%g0, %g1}"; case MODE_V4DF: case MODE_V2DF: - return "vmovapd\t{%g1, %g0|%g0, %g1}"; + return "vmovupd\t{%g1, %g0|%g0, %g1}"; case MODE_OI: case MODE_TI: return "vmovdqa64\t{%g1, %g0|%g0, %g1}"; @@ -1031,7 +1031,7 @@ || misaligned_operand (operands[1], mode)) return "%vmovupd\t{%1, %0|%0, %1}"; else - return "%vmovapd\t{%1, %0|%0, %1}"; + return "%vmovupd\t{%1, %0|%0, %1}"; case MODE_OI: case MODE_TI: @@ -18952,11 +18952,11 @@ case MODE_V16SF: return "vmovaps\t{%1, %t0|%t0, %1}"; case MODE_V8DF: - return "vmovapd\t{%1, %t0|%t0, %1}"; + return "vmovupd\t{%1, %t0|%t0, %1}"; case MODE_V8SF: return "vmovaps\t{%1, %x0|%x0, %1}"; case MODE_V4DF: - return "vmovapd\t{%1, %x0|%x0, %1}"; + return "vmovupd\t{%1, %x0|%x0, %1}"; case MODE_XI: if (which_alternative == 2) return "vmovdqa\t{%1, %t0|%t0, %1}";
Bug#939559: x86_64-w64-mingw32-g++-win32: program compiled with -march=native crashes on same CPU
Hello Stephen, hello Claude, the following discussion also seems related. It raises the question: if the variable cannot be aligned, could mingw-w64 simply emit the unaligned instructions instead? They may be slower than the aligned ones, but the aligned ones crash. https://sourceforge.net/p/mingw-w64/discussion/723797/thread/bc936130/ Kind regards, Bernhard
Bug#939559: x86_64-w64-mingw32-g++-win32: program compiled with -march=native crashes on same CPU
Hi, On Sat, 7 Sep 2019 11:25:45 +0200, Bernhard Übelacker wrote: > Hello Claude Heiland-Allen, > I tried just to collect some more information for the maintainer. Thanks, that’s very useful information (and thanks Claude for the initial report!). > The issue could be reproduced in a qemu VM > with '-cpu host' on a Ryzen 7 1700. Yes, and I can reproduce it on my Haswell-based system too. [...] > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49001 > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61730 > https://sourceforge.net/p/mingw-w64/mailman/message/36287627/ OK, so there’s not much that can be done in the package, except documenting the issue (short of disabling AVX2 entirely...). Regards, Stephen pgpZ7FcW3EiSC.pgp Description: OpenPGP digital signature
Bug#939559: x86_64-w64-mingw32-g++-win32: program compiled with -march=native crashes on same CPU
Control: tags -1 + upstream Hello Claude Heiland-Allen, I just tried to collect some more information for the maintainer. The issue could be reproduced in a qemu VM with '-cpu host' on a Ryzen 7 1700. The resulting binary crashes on Windows at the same instruction, so I guess Wine can be ruled out. It seems that the vmovapd instruction expects a 32-byte aligned value in register $rcx. Unfortunately it is only 16-byte aligned, hence the SIGSEGV. This could be confirmed by setting a breakpoint on that instruction and setting $rcx to the next lower 32-byte-aligned value; the vmovapd could then be executed. I think the following upstream reports and messages are related, mainly 54412. There are also some workarounds mentioned. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49001 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61730 https://sourceforge.net/p/mingw-w64/mailman/message/36287627/ Kind regards, Bernhard $ wine winedbg --gdb a.exe 4 ... Wine-gdb> cont Continuing. Program received signal SIGSEGV, Segmentation fault. 0x0040166d in ?? () Wine-gdb> bt #0 0x0040166d in ?? () #1 0x004031a8 in ?? () #2 0x in ?? () Wine-gdb> print/x $rcx $1 = 0x33fa10 Wine-gdb> print/x $rcx % 32 $2 = 0x10 Wine-gdb> print/x $rcx % 16 $3 = 0x0 $ x86_64-w64-mingw32-objdump --disassemble a.exe ... 00401665 <_Z9broadcastIDv4_ddET_T0_>: 401665: c4 e2 7d 19 c9 vbroadcastsd %xmm1,%ymm1 40166a: 48 89 c8 mov %rcx,%rax 40166d: c5 fd 29 09 vmovapd %ymm1,(%rcx) 401671: c5 f8 77 vzeroupper 401674: c3 retq ... 00402e90 : ... 40319b: 48 8d 8c 24 e0 00 00 lea 0xe0(%rsp),%rcx 4031a2: 00 4031a3: e8 bd e4 ff ff callq 401665 <_Z9broadcastIDv4_ddET_T0_> 4031a8: c5 fd 28 94 24 e0 00 vmovapd 0xe0(%rsp),%ymm2 ... Demangled, that function _Z9broadcastIDv4_ddET_T0_ should look like: double __vector(4) broadcast<double __vector(4), double>(double) That should be line 53: 51 template <> 52 __attribute__ ((noinline)) 53 double4 broadcast(double x) { double4 r = { x, x, x, x }; return r; }
Bug#939559: x86_64-w64-mingw32-g++-win32: program compiled with -march=native crashes on same CPU
Package: g++-mingw-w64-x86-64 Version: 8.3.0-19+21.4 Severity: normal File: /usr/bin/x86_64-w64-mingw32-g++-win32 Dear Maintainer, I wrote this C++ source code and saved it in `bug.cpp`: #include #include #include typedef int64_t int2 __attribute__ ((vector_size (16))); typedef int64_t int4 __attribute__ ((vector_size (32))); typedef int64_t int8 __attribute__ ((vector_size (64))); typedef int64_t int16 __attribute__ ((vector_size (128))); __attribute__ ((noinline)) bool any(const int64_t ) { return i; } __attribute__ ((noinline)) bool any(const int2 ) { return (((i[0] || i[1]))); } __attribute__ ((noinline)) bool any(const int4 ) { return (((i[0] || i[1]) || (i[2] || i[3]))); } __attribute__ ((noinline)) bool any(const int8 ) { return (((i[0] || i[1]) || (i[2] || i[3])) || ((i[4] || i[5]) || (i[6] || i[7]))); } __attribute__ ((noinline)) bool any(const int16 ) { return (((i[0] || i[1]) || (i[2] || i[3])) || ((i[4] || i[5]) || (i[6] || i[7]))) || (((i[8] || i[9]) || (i[10] || i[11])) || ((i[12] || i[13]) || (i[14] || i[15]))); } typedef double double2 __attribute__ ((vector_size (16))); typedef double double4 __attribute__ ((vector_size (32))); typedef double double8 __attribute__ ((vector_size (64))); typedef double double16 __attribute__ ((vector_size (128))); template __attribute__ ((noinline)) R broadcast(T x) { return R(x); } template <> __attribute__ ((noinline)) double2 broadcast(double x) { double2 r = { x, x }; return r; } template <> __attribute__ ((noinline)) double4 broadcast(double x) { double4 r = { x, x, x, x }; return r; } template <> __attribute__ ((noinline)) double8 broadcast(double x) { double8 r = { x, x, x, x, x, x, x, x }; return r; } template <> __attribute__ ((noinline)) double16 broadcast(double x) { double16 r = { x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x }; return r; } int main(int argc, char **argv) { if (argc > 1) { int N = atoi(argv[1]); switch (N) { case 1: { double x = broadcast(1.0); return any(x != 1.0); } case 2: { double2 x 
= broadcast(1.0); return any(x != 1.0); } case 4: { double4 x = broadcast(1.0); return any(x != 1.0); } case 8: { double8 x = broadcast(1.0); return any(x != 1.0); } case 16: { double16 x = broadcast(1.0); return any(x != 1.0); } } } return 1; } I compiled it like this: $ x86_64-w64-mingw32-g++ bug.cpp -march=native -O3 bug.cpp: In function ‘R broadcast(T) [with R = __vector(8) double; T = double]’: bug.cpp:56:45: warning: AVX512F vector return without AVX512F enabled changes the ABI [-Wpsabi] double8 broadcast(double x) { double8 r = { x, x, x, x, x, x, x, x }; return r; } ^ I ran it on the same host like this, and it crashed: $ ./a.exe 4 wine: Unhandled page fault on read access to 0x at address 0x4016a8 (thread 002a), starting debugger... 002c:fixme:dbghelp:elf_search_auxv can't find symbol in module 002c:fixme:dbghelp:elf_search_auxv can't find symbol in module 002c:fixme:dbghelp:elf_search_auxv can't find symbol in module 002c:fixme:dbghelp:elf_search_auxv can't find symbol in module 002c:fixme:dbghelp:elf_search_auxv can't find symbol in module Unhandled exception: page fault on read access to 0x in 64-bit code (0x004016a8). 002c:fixme:dbghelp:elf_search_auxv can't find symbol in module 002c:fixme:dbghelp:interpret_function_table_entry PUSH_MACHFRAME 6 Register dump: rip:004016a8 rsp:0032f928 rbp:004619e0 eflags:00010203 ( R- -- I - - -C) rax:0032fa10 rbx:0032fa80 rcx:0032fa10 rdx:00405000 rsi:0002 rdi:004619e0 r8:ffd0 r9: r10:0002 r11:00461a68 r12:0018 r13:0010 r14: r15: Stack dump: 0x0032f928: 004031e8 0x0032f938: 0x0032f948: 00202020 0x0032f958: 2000 00ff 0x0032f968: 00ff 00202020 0x0032f978: 0x0032f988: 0x0032f998: 0x0032f9a8: 0x0032f9b8: 0x0032f9c8: 0x0032f9d8: