Bug#939559: x86_64-w64-mingw32-g++-win32: program compiled with -march=native crashes on same CPU

2019-09-08 Thread Claude Heiland-Allen

Hi Bernhard, Stephen,

Thanks for the links to the upstream bug reports.  Seems a long-standing 
problem.


On 08/09/2019 00:00, Bernhard Übelacker wrote:

Not considering any side effects and maybe other
instructions with alignment requirements.
I used the attached patch which is a bit more brutal (replaces more vmov 
variants, just in case) and now my real program is crash free. Inspired by
https://stackoverflow.com/questions/5983389/how-to-align-stack-at-32-byte-boundary-in-gcc#answer-6025561 
.



Maybe at least better than disabling AVX2 entirely?

Yes.  Thanks again,


Claude

diff -ru src.old/gcc/config/i386/i386.md src.new/gcc/config/i386/i386.md
--- src.old/gcc/config/i386/i386.md	2019-01-08 09:56:36.0 +
+++ src.new/gcc/config/i386/i386.md	2019-09-08 09:01:02.558639937 +0100
@@ -2055,7 +2055,7 @@
 	  || misaligned_operand (operands[1], XImode))
 	return "vmovdqu32\t{%1, %0|%0, %1}";
   else
-	return "vmovdqa32\t{%1, %0|%0, %1}";
+	return "vmovdqu32\t{%1, %0|%0, %1}";
 
 default:
   gcc_unreachable ();
@@ -2091,11 +2091,11 @@
   else
 	{
 	  if (get_attr_mode (insn) == MODE_V8SF)
-	return "vmovaps\t{%1, %0|%0, %1}";
+	return "vmovups\t{%1, %0|%0, %1}";
 	  else if (get_attr_mode (insn) == MODE_XI)
-	return "vmovdqa32\t{%1, %0|%0, %1}";
+	return "vmovdqu32\t{%1, %0|%0, %1}";
 	  else
-	return "vmovdqa\t{%1, %0|%0, %1}";
+	return "vmovdqu\t{%1, %0|%0, %1}";
 	}
 
 default:
@@ -2153,11 +2153,11 @@
   else
 	{
 	  if (get_attr_mode (insn) == MODE_V4SF)
-	return "%vmovaps\t{%1, %0|%0, %1}";
+	return "%vmovups\t{%1, %0|%0, %1}";
 	  else if (get_attr_mode (insn) == MODE_XI)
-	return "vmovdqa32\t{%1, %0|%0, %1}";
+	return "vmovdqu32\t{%1, %0|%0, %1}";
 	  else
-	return "%vmovdqa\t{%1, %0|%0, %1}";
+	return "%vmovdqu\t{%1, %0|%0, %1}";
 	}
 
 default:
@@ -2263,14 +2263,14 @@
 	  /* Handle AVX512 registers set.  */
 	  if (EXT_REX_SSE_REG_P (operands[0])
 	  || EXT_REX_SSE_REG_P (operands[1]))
-	return "vmovdqa64\t{%1, %0|%0, %1}";
-	  return "%vmovdqa\t{%1, %0|%0, %1}";
+	return "vmovdqu64\t{%1, %0|%0, %1}";
+	  return "%vmovdqu\t{%1, %0|%0, %1}";
 
 	case MODE_V2SF:
 	  gcc_assert (!TARGET_AVX);
 	  return "movlps\t{%1, %0|%0, %1}";
 	case MODE_V4SF:
-	  return "%vmovaps\t{%1, %0|%0, %1}";
+	  return "%vmovups\t{%1, %0|%0, %1}";
 
 	default:
 	  gcc_unreachable ();
@@ -2471,12 +2471,12 @@
 	case MODE_SI:
   return "%vmovd\t{%1, %0|%0, %1}";
 	case MODE_TI:
-	  return "%vmovdqa\t{%1, %0|%0, %1}";
+	  return "%vmovdqu\t{%1, %0|%0, %1}";
 	case MODE_XI:
-	  return "vmovdqa32\t{%g1, %g0|%g0, %g1}";
+	  return "vmovdqu32\t{%g1, %g0|%g0, %g1}";
 
 	case MODE_V4SF:
-	  return "%vmovaps\t{%1, %0|%0, %1}";
+	  return "%vmovups\t{%1, %0|%0, %1}";
 
 	case MODE_SF:
 	  gcc_assert (!TARGET_AVX);
@@ -3351,13 +3351,13 @@
   else
 	{
 	  if (get_attr_mode (insn) == MODE_V4SF)
-	return "%vmovaps\t{%1, %0|%0, %1}";
+	return "%vmovups\t{%1, %0|%0, %1}";
 	  else if (TARGET_AVX512VL
 		   && (EXT_REX_SSE_REG_P (operands[0])
 		   || EXT_REX_SSE_REG_P (operands[1])))
-	return "vmovdqa64\t{%1, %0|%0, %1}";
+	return "vmovdqu64\t{%1, %0|%0, %1}";
 	  else
-	return "%vmovdqa\t{%1, %0|%0, %1}";
+	return "%vmovdqu\t{%1, %0|%0, %1}";
 	}
 
 case TYPE_MULTI:
@@ -3519,11 +3519,11 @@
 	  return "%vmovsd\t{%1, %0|%0, %1}";
 
 	case MODE_V4SF:
-	  return "%vmovaps\t{%1, %0|%0, %1}";
+	  return "%vmovups\t{%1, %0|%0, %1}";
 	case MODE_V8DF:
-	  return "vmovapd\t{%g1, %g0|%g0, %g1}";
+	  return "vmovupd\t{%g1, %g0|%g0, %g1}";
 	case MODE_V2DF:
-	  return "%vmovapd\t{%1, %0|%0, %1}";
+	  return "%vmovupd\t{%1, %0|%0, %1}";
 
 	case MODE_V2SF:
 	  gcc_assert (!TARGET_AVX);
@@ -3713,9 +3713,9 @@
 	  return "%vmovss\t{%1, %0|%0, %1}";
 
 	case MODE_V16SF:
-	  return "vmovaps\t{%g1, %g0|%g0, %g1}";
+	  return "vmovups\t{%g1, %g0|%g0, %g1}";
 	case MODE_V4SF:
-	  return "%vmovaps\t{%1, %0|%0, %1}";
+	  return "%vmovups\t{%1, %0|%0, %1}";
 
 	case MODE_SI:
 	  return "%vmovd\t{%1, %0|%0, %1}";
diff -ru src.old/gcc/config/i386/mmx.md src.new/gcc/config/i386/mmx.md
--- src.old/gcc/config/i386/mmx.md	2018-01-03 10:03:58.0 +
+++ src.new/gcc/config/i386/mmx.md	2019-09-08 09:00:47.482620221 +0100
@@ -124,16 +124,16 @@
 	return "%vmovd\t{%1, %0|%0, %1}";
 	  return "%vmovq\t{%1, %0|%0, %1}";
 	case MODE_TI:
-	  return "%vmovdqa\t{%1, %0|%0, %1}";
+	  return "%vmovdqu\t{%1, %0|%0, %1}";
 	case MODE_XI:
-	  return "vmovdqa64\t{%g1, %g0|%g0, %g1}";
+	  return "vmovdqu64\t{%g1, %g0|%g0, %g1}";
 
 	case MODE_V2SF:
 	  if (TARGET_AVX && REG_P (operands[0]))
 	return "vmovlps\t{%1, %0, %0|%0, %0, %1}";
 	  return "%vmovlps\t{%1, %0|%0, %1}";
 	case MODE_V4SF:
-	  return "%vmovaps\t{%1, %0|%0, %1}";
+	  return "%vmovups\t{%1, %0|%0, %1}";
 
 	default:
 	  gcc_unreachable ();
diff -ru src.old/gcc/config/i386/sse.md src.new/gcc/config/i386/sse.md
--- src.old/gcc/config/i386/sse.md	2019-06-20 

Bug#939559: x86_64-w64-mingw32-g++-win32: program compiled with -march=native crashes on same CPU

2019-09-07 Thread Bernhard Übelacker
Hello Stephen, hello Claude,
following the previous idea of just replacing the aligned
instruction with the unaligned one, I created the hacky patch
below, which just replaces vmovapd with vmovupd.
Not considering any side effects and maybe other
instructions with alignment requirements.
At least a mingw-w64 built with it builds an executable that
would not crash in that situation.
(Currently just meant as a proof of concept.)
Maybe at least better than disabling AVX2 entirely?

Kind regards,
Bernhard
--- a/src/gcc/config/i386/i386.md	2019-09-07 15:04:12.391156632 +0200
+++ b/src/gcc/config/i386/i386.md	2019-09-07 15:07:17.316822891 +0200
@@ -3521,9 +3521,9 @@
 	case MODE_V4SF:
 	  return "%vmovaps\t{%1, %0|%0, %1}";
 	case MODE_V8DF:
-	  return "vmovapd\t{%g1, %g0|%g0, %g1}";
+	  return "vmovupd\t{%g1, %g0|%g0, %g1}";
 	case MODE_V2DF:
-	  return "%vmovapd\t{%1, %0|%0, %1}";
+	  return "%vmovupd\t{%1, %0|%0, %1}";
 
 	case MODE_V2SF:
 	  gcc_assert (!TARGET_AVX);
--- a/src/gcc/config/i386/sse.md	2019-09-07 15:04:12.399156531 +0200
+++ b/src/gcc/config/i386/sse.md	2019-09-07 15:07:09.420922575 +0200
@@ -1004,7 +1004,7 @@
 		return "vmovaps\t{%g1, %g0|%g0, %g1}";
 	  case MODE_V4DF:
 	  case MODE_V2DF:
-		return "vmovapd\t{%g1, %g0|%g0, %g1}";
+		return "vmovupd\t{%g1, %g0|%g0, %g1}";
 	  case MODE_OI:
 	  case MODE_TI:
 		return "vmovdqa64\t{%g1, %g0|%g0, %g1}";
@@ -1031,7 +1031,7 @@
 	  || misaligned_operand (operands[1], mode))
 	return "%vmovupd\t{%1, %0|%0, %1}";
 	  else
-	return "%vmovapd\t{%1, %0|%0, %1}";
+	return "%vmovupd\t{%1, %0|%0, %1}";
 
 	case MODE_OI:
 	case MODE_TI:
@@ -18952,11 +18952,11 @@
 	case MODE_V16SF:
 	  return "vmovaps\t{%1, %t0|%t0, %1}";
 	case MODE_V8DF:
-	  return "vmovapd\t{%1, %t0|%t0, %1}";
+	  return "vmovupd\t{%1, %t0|%t0, %1}";
 	case MODE_V8SF:
 	  return "vmovaps\t{%1, %x0|%x0, %1}";
 	case MODE_V4DF:
-	  return "vmovapd\t{%1, %x0|%x0, %1}";
+	  return "vmovupd\t{%1, %x0|%x0, %1}";
 	case MODE_XI:
 	  if (which_alternative == 2)
 	return "vmovdqa\t{%1, %t0|%t0, %1}";


Bug#939559: x86_64-w64-mingw32-g++-win32: program compiled with -march=native crashes on same CPU

2019-09-07 Thread Bernhard Übelacker
Hello Stephen, hello Claude,
the following discussion also seems related and raises the question:
if the variable cannot be aligned, could mingw-w64 then just emit
the unaligned instructions? They are slower than the aligned ones,
but the aligned ones crash.

https://sourceforge.net/p/mingw-w64/discussion/723797/thread/bc936130/

Kind regards,
Bernhard



Bug#939559: x86_64-w64-mingw32-g++-win32: program compiled with -march=native crashes on same CPU

2019-09-07 Thread Stephen Kitt
Hi,

On Sat, 7 Sep 2019 11:25:45 +0200, Bernhard Übelacker 
wrote:
> Hello Claude Heiland-Allen,
> I tried just to collect some more information for the maintainer.

Thanks, that’s very useful information (and thanks Claude for the initial
report!).

> The issue could be reproduced in a qemu VM
> with '-cpu host' on a Ryzen 7 1700.

Yes, and I can reproduce it on my Haswell-based system too.

[...]
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49001
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61730
> https://sourceforge.net/p/mingw-w64/mailman/message/36287627/

OK, so there’s not much that can be done in the package, except documenting
the issue (short of disabling AVX2 entirely...).

Regards,

Stephen


pgpZ7FcW3EiSC.pgp
Description: OpenPGP digital signature


Bug#939559: x86_64-w64-mingw32-g++-win32: program compiled with -march=native crashes on same CPU

2019-09-07 Thread Bernhard Übelacker
Control: tags -1 + upstream


Hello Claude Heiland-Allen,
I tried just to collect some more information for the maintainer.

The issue could be reproduced in a qemu VM
with '-cpu host' on a Ryzen 7 1700.

The resulting binary crashes on Windows at the same instruction,
so I guess Wine can be ruled out.

It seems that the vmovapd instruction expects the address in register
$rcx to be 32-byte aligned. Unfortunately it is only 16-byte aligned,
hence the SIGSEGV.
This could be confirmed by setting a breakpoint on that instruction
and setting $rcx to the next lower 32-byte aligned value; the vmovapd
could then be executed.

I think the following upstream reports and messages are related,
mostly 54412. There are also some workarounds mentioned.

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49001
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61730
https://sourceforge.net/p/mingw-w64/mailman/message/36287627/

Kind regards,
Bernhard



$ wine winedbg --gdb a.exe 4
...
Wine-gdb> cont
Continuing.

Program received signal SIGSEGV, Segmentation fault.
0x0040166d in ?? ()
Wine-gdb> bt
#0  0x0040166d in ?? ()
#1  0x004031a8 in ?? ()
#2  0x in ?? ()
Wine-gdb> print/x $rcx
$1 = 0x33fa10
Wine-gdb> print/x $rcx % 32
$2 = 0x10
Wine-gdb> print/x $rcx % 16
$3 = 0x0



$ x86_64-w64-mingw32-objdump --disassemble a.exe
...
00401665 <_Z9broadcastIDv4_ddET_T0_>:
  401665:   c4 e2 7d 19 c9  vbroadcastsd %xmm1,%ymm1
  40166a:   48 89 c8mov%rcx,%rax
  40166d:   c5 fd 29 09 vmovapd %ymm1,(%rcx)
  401671:   c5 f8 77vzeroupper 
  401674:   c3  retq   
...
00402e90 :
...
  40319b:   48 8d 8c 24 e0 00 00lea0xe0(%rsp),%rcx
  4031a2:   00 
  4031a3:   e8 bd e4 ff ff  callq  401665 
<_Z9broadcastIDv4_ddET_T0_>
  4031a8:   c5 fd 28 94 24 e0 00vmovapd 0xe0(%rsp),%ymm2
...



Demangled, the function _Z9broadcastIDv4_ddET_T0_ should look like:
double __vector(4) broadcast(double)
That should be line 53:
51 template <>
52 __attribute__ ((noinline))
53 double4  broadcast(double x) { double4 r = { x, x, x, x 
}; return r; }



Bug#939559: x86_64-w64-mingw32-g++-win32: program compiled with -march=native crashes on same CPU

2019-09-06 Thread Claude Heiland-Allen

Package: g++-mingw-w64-x86-64
Version: 8.3.0-19+21.4
Severity: normal
File: /usr/bin/x86_64-w64-mingw32-g++-win32

Dear Maintainer,

I wrote this C++ source code and saved it in `bug.cpp`:

    #include 
    #include 
    #include 

    typedef int64_t int2  __attribute__ ((vector_size (16)));
    typedef int64_t int4  __attribute__ ((vector_size (32)));
    typedef int64_t int8  __attribute__ ((vector_size (64)));
    typedef int64_t int16 __attribute__ ((vector_size (128)));

    __attribute__ ((noinline))
    bool any(const int64_t ) {
  return i;
    }

    __attribute__ ((noinline))
    bool any(const int2 ) {
  return (((i[0] || i[1])));
    }

    __attribute__ ((noinline))
    bool any(const int4 ) {
  return (((i[0] || i[1]) || (i[2] || i[3])));
    }

    __attribute__ ((noinline))
    bool any(const int8 ) {
  return (((i[0] || i[1]) || (i[2] || i[3])) ||
 ((i[4] || i[5]) || (i[6] || i[7])));
    }

    __attribute__ ((noinline))
    bool any(const int16 ) {
  return (((i[0] || i[1]) || (i[2] || i[3])) ||
 ((i[4] || i[5]) || (i[6] || i[7]))) ||
 (((i[8] || i[9]) || (i[10] || i[11])) ||
 ((i[12] || i[13]) || (i[14] || i[15])));
    }

    typedef double double2  __attribute__ ((vector_size (16)));
    typedef double double4  __attribute__ ((vector_size (32)));
    typedef double double8  __attribute__ ((vector_size (64)));
    typedef double double16 __attribute__ ((vector_size (128)));

    template <typename R, typename T>
    __attribute__ ((noinline))
    R broadcast(T x) { return R(x); }

    template <>
    __attribute__ ((noinline))
    double2  broadcast(double x) { double2 r = { x, x 
}; return r; }

    template <>
    __attribute__ ((noinline))
    double4  broadcast(double x) { double4 r = { x, x, 
x, x }; return r; }

    template <>
    __attribute__ ((noinline))
    double8  broadcast(double x) { double8 r = { x, x, 
x, x, x, x, x, x }; return r; }

    template <>
    __attribute__ ((noinline))
    double16 broadcast(double x) { double16 r = { x, 
x, x, x, x, x, x, x, x, x, x, x, x, x, x, x }; return r; }


    int main(int argc, char **argv)
    {
  if (argc > 1)
  {
    int N = atoi(argv[1]);
    switch (N)
    {
  case 1:  { double   x = broadcast(1.0); 
return any(x != 1.0); }
  case 2:  { double2  x = broadcast(1.0); 
return any(x != 1.0); }
  case 4:  { double4  x = broadcast(1.0); 
return any(x != 1.0); }
  case 8:  { double8  x = broadcast(1.0); 
return any(x != 1.0); }
  case 16: { double16 x = broadcast(1.0); 
return any(x != 1.0); }

    }
  }
  return 1;
    }

I compiled it like this:

    $ x86_64-w64-mingw32-g++ bug.cpp -march=native -O3
    bug.cpp: In function ‘R broadcast(T) [with R = __vector(8) double; 
T = double]’:
    bug.cpp:56:45: warning: AVX512F vector return without AVX512F 
enabled changes the ABI [-Wpsabi]
 double8  broadcast(double x) { double8  r = { x, 
x, x, x, x, x, x, x }; return r; }

 ^

I ran it on the same host like this, and it crashed:

    $ ./a.exe 4
    wine: Unhandled page fault on read access to 0x at 
address 0x4016a8 (thread 002a), starting debugger...

    002c:fixme:dbghelp:elf_search_auxv can't find symbol in module
    002c:fixme:dbghelp:elf_search_auxv can't find symbol in module
    002c:fixme:dbghelp:elf_search_auxv can't find symbol in module
    002c:fixme:dbghelp:elf_search_auxv can't find symbol in module
    002c:fixme:dbghelp:elf_search_auxv can't find symbol in module
    Unhandled exception: page fault on read access to 
0x in 64-bit code (0x004016a8).

    002c:fixme:dbghelp:elf_search_auxv can't find symbol in module
    002c:fixme:dbghelp:interpret_function_table_entry PUSH_MACHFRAME 6
    Register dump:
 rip:004016a8 rsp:0032f928 rbp:004619e0 
eflags:00010203 (  R- --  I   - - -C)
 rax:0032fa10 rbx:0032fa80 rcx:0032fa10 
rdx:00405000
 rsi:0002 rdi:004619e0 r8:ffd0  
r9: r10:0002
 r11:00461a68 r12:0018 r13:0010 
r14: r15:

    Stack dump:
    0x0032f928:  004031e8 
    0x0032f938:   
    0x0032f948:   00202020
    0x0032f958:  2000 00ff
    0x0032f968:  00ff 00202020
    0x0032f978:   
    0x0032f988:   
    0x0032f998:   
    0x0032f9a8:   
    0x0032f9b8:   
    0x0032f9c8:   
    0x0032f9d8: