https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83479
/*
 * --- Comment #4 from Daniel Fruzynski <bugzi...@poradnik-webmastera.com> ---
 * Rule No.1: never log bugs before morning coffee ;)
 * This does not produce warnings, compiled with
 * "-O3 -march=haswell -mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd -Wall -Werror".
 */

#include "immintrin.h"

/*
 * test - run eight chained update stages over a 9x8 matrix of doubles and
 * return one scalar.
 *
 * data: 9 rows of 8 doubles. Rows 0..7 are loaded into v1..v8 and row 8 is
 *       the "last row" of the first stage. Every row is read with
 *       _mm512_load_pd, an aligned load, so the array must start on a
 *       64-byte boundary (each row is exactly 64 bytes).
 *
 * A stage takes a "last row" vector R and, for each lane k it covers,
 * broadcasts R[k] into C and rewrites
 *
 *     v(k+1) = (v(k+1) - R * C) * sqrt(R) * sqrt(C)
 *
 * Stage 8 uses R = data[8] and updates v1..v8; stage 7 uses R = v8 and
 * updates v1..v7; and so on down to stage 1, which uses R = v2 and updates
 * only v1.
 *
 * Returns lane 0 of v1 after stage 1.
 *
 * Notes:
 *  - The arithmetic operators and the [] subscripts on __m512d values rely on
 *    the GNU C vector extensions.
 *  - The statement sequence is kept exactly as posted (fully unrolled): this
 *    is a compiler test case from the GCC bug report linked above, so
 *    restructuring it could change what it exercises.
 *  - The archived copy had the whole function collapsed onto a few physical
 *    lines, which made each "// N" marker comment out the code after it; one
 *    statement per line is restored here.
 *  - The first stage originally loaded &data[9][0], one row past the nine
 *    rows declared for `data`; it now loads row 8, the last declared row.
 */
double test(const double data[9][8])
{
    __m512d vLastRow, vLastCol, vSqrtRow, vSqrtCol;

    __m512d v1 = _mm512_load_pd (&data[0][0]);
    __m512d v2 = _mm512_load_pd (&data[1][0]);
    __m512d v3 = _mm512_load_pd (&data[2][0]);
    __m512d v4 = _mm512_load_pd (&data[3][0]);
    __m512d v5 = _mm512_load_pd (&data[4][0]);
    __m512d v6 = _mm512_load_pd (&data[5][0]);
    __m512d v7 = _mm512_load_pd (&data[6][0]);
    __m512d v8 = _mm512_load_pd (&data[7][0]);

    // 8: last row is data[8] (index 8 is the final row of a 9-row array)
    vLastRow = _mm512_load_pd (&data[8][0]);
    vSqrtRow = _mm512_sqrt_pd(vLastRow);

    vLastCol = _mm512_set1_pd(vLastRow[0]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[1]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v2 = (v2 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[2]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v3 = (v3 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[3]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v4 = (v4 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[4]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v5 = (v5 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[5]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v6 = (v6 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[6]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v7 = (v7 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[7]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v8 = (v8 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    // 7: last row is the freshly updated v8; updates v1..v7
    vLastRow = v8;
    vSqrtRow = _mm512_sqrt_pd(vLastRow);

    vLastCol = _mm512_set1_pd(vLastRow[0]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[1]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v2 = (v2 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[2]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v3 = (v3 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[3]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v4 = (v4 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[4]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v5 = (v5 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[5]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v6 = (v6 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[6]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v7 = (v7 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    // 6: last row is v7; updates v1..v6
    vLastRow = v7;
    vSqrtRow = _mm512_sqrt_pd(vLastRow);

    vLastCol = _mm512_set1_pd(vLastRow[0]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[1]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v2 = (v2 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[2]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v3 = (v3 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[3]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v4 = (v4 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[4]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v5 = (v5 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[5]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v6 = (v6 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    // 5: last row is v6; updates v1..v5
    vLastRow = v6;
    vSqrtRow = _mm512_sqrt_pd(vLastRow);

    vLastCol = _mm512_set1_pd(vLastRow[0]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[1]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v2 = (v2 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[2]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v3 = (v3 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[3]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v4 = (v4 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[4]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v5 = (v5 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    // 4: last row is v5; updates v1..v4
    vLastRow = v5;
    vSqrtRow = _mm512_sqrt_pd(vLastRow);

    vLastCol = _mm512_set1_pd(vLastRow[0]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[1]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v2 = (v2 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[2]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v3 = (v3 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[3]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v4 = (v4 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    // 3: last row is v4; updates v1..v3
    vLastRow = v4;
    vSqrtRow = _mm512_sqrt_pd(vLastRow);

    vLastCol = _mm512_set1_pd(vLastRow[0]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[1]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v2 = (v2 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[2]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v3 = (v3 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    // 2: last row is v3; updates v1..v2
    vLastRow = v3;
    vSqrtRow = _mm512_sqrt_pd(vLastRow);

    vLastCol = _mm512_set1_pd(vLastRow[0]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    vLastCol = _mm512_set1_pd(vLastRow[1]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v2 = (v2 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    // 1: last row is v2; updates v1 only
    vLastRow = v2;
    vSqrtRow = _mm512_sqrt_pd(vLastRow);

    vLastCol = _mm512_set1_pd(vLastRow[0]);
    vSqrtCol = _mm512_sqrt_pd(vLastCol);
    v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol;

    return v1[0];
}