https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83479
Bug ID: 83479 Summary: Register spilling in AVX code Product: gcc Version: 7.2.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c Assignee: unassigned at gcc dot gnu.org Reporter: bugzi...@poradnik-webmastera.com Target Milestone: --- Here is a snippet of code which performs some calculations on a matrix. It repeatedly transforms an (N * N) matrix into an (N-1 * N-1) one, and returns the final scalar value. gcc for some reason is not able to detect that intermediate values are not needed anymore, and starts spilling. The code below is from gcc 7.2; the trunk version also generates similar code. Code was compiled with "-O3 -march=haswell". BTW, clang 5 properly handles this and does not spill. [code] #include "immintrin.h" double test(const double data[9][8]) { __m256d vLastRow, vLastCol, vSqrtRow, vSqrtCol; __m256d v1 = _mm256_load_pd (&data[0][0]); __m256d v2 = _mm256_load_pd (&data[1][0]); __m256d v3 = _mm256_load_pd (&data[2][0]); __m256d v4 = _mm256_load_pd (&data[3][0]); __m256d v5 = _mm256_load_pd (&data[4][0]); __m256d v6 = _mm256_load_pd (&data[5][0]); __m256d v7 = _mm256_load_pd (&data[6][0]); __m256d v8 = _mm256_load_pd (&data[7][0]); // 8 vLastRow = _mm256_load_pd (&data[9][0]); vSqrtRow = _mm256_sqrt_pd(vLastRow); vLastCol = _mm256_set1_pd(vLastRow[0]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[1]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v2 = (v2 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[2]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v3 = (v3 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[3]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v4 = (v4 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[4]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v5 = (v5 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[5]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v6 = (v6 - vLastRow 
* vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[6]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v7 = (v7 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[7]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v8 = (v8 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; // 7 vLastRow = v8; vSqrtRow = _mm256_sqrt_pd(vLastRow); vLastCol = _mm256_set1_pd(vLastRow[0]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[1]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v2 = (v2 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[2]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v3 = (v3 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[3]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v4 = (v4 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[4]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v5 = (v5 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[5]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v6 = (v6 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[6]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v7 = (v7 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; // 6 vLastRow = v7; vSqrtRow = _mm256_sqrt_pd(vLastRow); vLastCol = _mm256_set1_pd(vLastRow[0]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[1]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v2 = (v2 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[2]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v3 = (v3 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[3]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v4 = (v4 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[4]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v5 = (v5 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; 
vLastCol = _mm256_set1_pd(vLastRow[5]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v6 = (v6 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; // 5 vLastRow = v6; vSqrtRow = _mm256_sqrt_pd(vLastRow); vLastCol = _mm256_set1_pd(vLastRow[0]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[1]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v2 = (v2 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[2]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v3 = (v3 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[3]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v4 = (v4 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[4]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v5 = (v5 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; // 4 vLastRow = v5; vSqrtRow = _mm256_sqrt_pd(vLastRow); vLastCol = _mm256_set1_pd(vLastRow[0]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[1]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v2 = (v2 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[2]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v3 = (v3 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[3]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v4 = (v4 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; // 3 vLastRow = v4; vSqrtRow = _mm256_sqrt_pd(vLastRow); vLastCol = _mm256_set1_pd(vLastRow[0]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[1]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v2 = (v2 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[2]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v3 = (v3 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; // 2 vLastRow = v3; vSqrtRow = _mm256_sqrt_pd(vLastRow); vLastCol = _mm256_set1_pd(vLastRow[0]); vSqrtCol = 
_mm256_sqrt_pd(vLastCol); v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; vLastCol = _mm256_set1_pd(vLastRow[1]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v2 = (v2 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; // 1 vLastRow = v2; vSqrtRow = _mm256_sqrt_pd(vLastRow); vLastCol = _mm256_set1_pd(vLastRow[0]); vSqrtCol = _mm256_sqrt_pd(vLastCol); v1 = (v1 - vLastRow * vLastCol) * vSqrtRow * vSqrtCol; return v1[0]; } [/code] [out] test(double const (*) [8]): lea r10, [rsp+8] and rsp, -32 push QWORD PTR [r10-8] push rbp mov rbp, rsp push r10 sub rsp, 1040 vmovapd ymm7, YMMWORD PTR [rdi+576] vbroadcastsd ymm0, QWORD PTR [rbp-16] vpermpd ymm2, ymm7, 0 vsqrtpd ymm15, ymm7 vpermpd ymm12, ymm7, 255 vsqrtpd ymm5, ymm2 vsqrtpd ymm4, ymm12 vmovapd YMMWORD PTR [rbp-560], ymm2 vsqrtpd ymm2, ymm0 vmovapd YMMWORD PTR [rbp-592], ymm5 vpermpd ymm5, ymm7, 85 vmovapd YMMWORD PTR [rbp-528], ymm5 vsqrtpd ymm6, ymm5 vbroadcastsd ymm5, QWORD PTR [rbp+8] vmovapd YMMWORD PTR [rbp-208], ymm4 vbroadcastsd ymm4, QWORD PTR [rbp+0] vmovapd ymm14, ymm5 vsqrtpd ymm9, ymm5 vfnmadd213pd ymm14, ymm7, YMMWORD PTR [rdi+448] vsqrtpd ymm8, ymm4 vmovapd YMMWORD PTR [rbp-624], ymm6 vpermpd ymm6, ymm7, 170 vmovapd YMMWORD PTR [rbp-496], ymm6 vsqrtpd ymm1, ymm6 vmulpd ymm6, ymm14, ymm15 vmovapd YMMWORD PTR [rbp-656], ymm1 vbroadcastsd ymm1, QWORD PTR [rbp-8] vsqrtpd ymm3, ymm1 vmulpd ymm6, ymm6, ymm9 vpermpd ymm13, ymm6, 0 vsqrtpd ymm14, ymm6 vsqrtpd ymm10, ymm13 vmovapd YMMWORD PTR [rbp-464], ymm13 vpermpd ymm13, ymm6, 170 vmovapd YMMWORD PTR [rbp-688], ymm10 vpermpd ymm10, ymm6, 85 vsqrtpd ymm11, ymm10 vmovapd YMMWORD PTR [rbp-432], ymm10 vmovapd YMMWORD PTR [rbp-720], ymm11 vsqrtpd ymm11, ymm13 vmulpd ymm13, ymm6, ymm13 vmovapd YMMWORD PTR [rbp-752], ymm11 vpermpd ymm11, ymm6, 255 vsqrtpd ymm9, ymm11 vmovapd YMMWORD PTR [rbp-144], ymm11 vmovapd YMMWORD PTR [rbp-784], ymm9 vmulpd ymm9, ymm6, ymm4 vfnmadd213pd ymm4, ymm7, YMMWORD PTR [rdi+384] vmulpd ymm4, ymm4, ymm15 vfmsub132pd ymm4, ymm9, ymm8 vmulpd 
ymm5, ymm4, ymm14 vmulpd ymm5, ymm5, ymm8 vpermpd ymm4, ymm5, 0 vsqrtpd ymm11, ymm5 vpermpd ymm10, ymm5, 255 vsqrtpd ymm8, ymm4 vmovapd YMMWORD PTR [rbp-400], ymm4 vmovapd YMMWORD PTR [rbp-816], ymm8 vpermpd ymm8, ymm5, 85 vsqrtpd ymm9, ymm8 vmovapd YMMWORD PTR [rbp-368], ymm8 vsqrtpd ymm8, ymm10 vmulpd ymm10, ymm10, ymm5 vmovapd YMMWORD PTR [rbp-848], ymm9 vpermpd ymm9, ymm5, 170 vsqrtpd ymm4, ymm9 vmovapd YMMWORD PTR [rbp-176], ymm9 vfnmadd213pd ymm12, ymm7, YMMWORD PTR [rdi+192] vmovapd YMMWORD PTR [rbp-912], ymm8 vmulpd ymm8, ymm1, ymm5 vmovapd YMMWORD PTR [rbp-80], ymm14 vmulpd ymm9, ymm6, ymm0 vmovapd YMMWORD PTR [rbp-880], ymm4 vmulpd ymm4, ymm6, ymm1 vfnmadd213pd ymm1, ymm7, YMMWORD PTR [rdi+320] vmulpd ymm1, ymm1, ymm15 vfmsub231pd ymm4, ymm1, ymm3 vmulpd ymm4, ymm4, ymm14 vmovapd ymm14, ymm11 vmovapd YMMWORD PTR [rbp-112], ymm14 vfmsub132pd ymm4, ymm8, ymm3 vmulpd ymm4, ymm4, ymm11 vmulpd ymm4, ymm4, ymm3 vpermpd ymm1, ymm4, 0 vpermpd ymm11, ymm4, 170 vpermpd ymm8, ymm4, 255 vsqrtpd ymm3, ymm1 vmovapd YMMWORD PTR [rbp-336], ymm1 vmovapd YMMWORD PTR [rbp-944], ymm3 vpermpd ymm3, ymm4, 85 vsqrtpd ymm1, ymm3 vmovapd YMMWORD PTR [rbp-304], ymm3 vmulpd ymm3, ymm0, ymm4 vmovapd YMMWORD PTR [rbp-976], ymm1 vsqrtpd ymm1, ymm11 vmulpd ymm11, ymm11, ymm4 vmovapd YMMWORD PTR [rbp-1008], ymm1 vsqrtpd ymm1, ymm8 vmulpd ymm8, ymm8, ymm4 vmovapd YMMWORD PTR [rbp-1040], ymm1 vmulpd ymm1, ymm0, ymm5 vfnmadd213pd ymm0, ymm7, YMMWORD PTR [rdi+256] vmulpd ymm0, ymm0, ymm15 vfmsub231pd ymm9, ymm0, ymm2 vmulpd ymm9, ymm9, YMMWORD PTR [rbp-80] vmulpd ymm0, ymm12, ymm15 vmovapd ymm12, YMMWORD PTR [rbp-1040] vfmsub231pd ymm1, ymm9, ymm2 vmulpd ymm1, ymm1, ymm14 vsqrtpd ymm14, ymm4 vfmsub132pd ymm1, ymm3, ymm2 vmulpd ymm1, ymm1, ymm14 vmulpd ymm1, ymm1, ymm2 vpermpd ymm2, ymm1, 0 vpermpd ymm9, ymm1, 85 vsqrtpd ymm3, ymm2 vmovapd YMMWORD PTR [rbp-272], ymm2 vsqrtpd ymm2, ymm9 vmovapd YMMWORD PTR [rbp-240], ymm9 vpermpd ymm9, ymm1, 170 vmovapd YMMWORD PTR [rbp-1072], ymm3 vsqrtpd 
ymm3, ymm9 vmovapd YMMWORD PTR [rbp-1104], ymm2 vmulpd ymm9, ymm9, ymm1 vmovapd YMMWORD PTR [rbp-1136], ymm3 vpermpd ymm3, ymm1, 255 vsqrtpd ymm2, ymm3 vmulpd ymm3, ymm3, ymm1 vmovapd YMMWORD PTR [rbp-1168], ymm2 vmulpd ymm2, ymm6, YMMWORD PTR [rbp-144] vfmsub132pd ymm0, ymm2, YMMWORD PTR [rbp-208] vmovapd YMMWORD PTR [rbp-144], ymm14 vmulpd ymm0, ymm0, YMMWORD PTR [rbp-80] vfmsub132pd ymm0, ymm10, YMMWORD PTR [rbp-784] vmulpd ymm0, ymm0, YMMWORD PTR [rbp-112] vfmsub132pd ymm0, ymm8, YMMWORD PTR [rbp-912] vmulpd ymm0, ymm0, ymm14 vsqrtpd ymm14, ymm1 vfmsub132pd ymm12, ymm3, ymm0 vmulpd ymm2, ymm12, ymm14 vmulpd ymm2, ymm2, YMMWORD PTR [rbp-1168] vsqrtpd ymm10, ymm2 vpermpd ymm12, ymm2, 0 vmovapd YMMWORD PTR [rbp-208], ymm12 vmovapd YMMWORD PTR [rbp-784], ymm10 vsqrtpd ymm10, ymm12 vmovapd YMMWORD PTR [rbp-912], ymm10 vpermpd ymm10, ymm2, 85 vsqrtpd ymm8, ymm10 vmulpd ymm10, ymm10, ymm2 vmovapd YMMWORD PTR [rbp-1040], ymm8 vmovapd ymm0, YMMWORD PTR [rbp-496] vpermpd ymm8, ymm2, 170 vfnmadd213pd ymm0, ymm7, YMMWORD PTR [rdi+128] vmulpd ymm3, ymm5, YMMWORD PTR [rbp-176] vsqrtpd ymm12, ymm8 vmovapd YMMWORD PTR [rbp-176], ymm14 vmulpd ymm8, ymm8, ymm2 vmulpd ymm0, ymm0, ymm15 vfmsub132pd ymm0, ymm13, YMMWORD PTR [rbp-656] vmulpd ymm0, ymm0, YMMWORD PTR [rbp-80] vfmsub132pd ymm0, ymm3, YMMWORD PTR [rbp-752] vmovapd ymm3, YMMWORD PTR [rbp-784] vmulpd ymm0, ymm0, YMMWORD PTR [rbp-112] vfmsub132pd ymm0, ymm11, YMMWORD PTR [rbp-880] vmovapd ymm11, YMMWORD PTR [rbp-1136] vmulpd ymm0, ymm0, YMMWORD PTR [rbp-144] vfmsub132pd ymm0, ymm9, YMMWORD PTR [rbp-1008] vmulpd ymm0, ymm0, ymm14 vmulpd ymm14, ymm5, YMMWORD PTR [rbp-368] vfmsub132pd ymm11, ymm8, ymm0 vmulpd ymm0, ymm4, YMMWORD PTR [rbp-304] vmovapd YMMWORD PTR [rbp-304], ymm14 vmulpd ymm3, ymm3, ymm11 vmulpd ymm3, ymm3, ymm12 vmulpd ymm12, ymm1, YMMWORD PTR [rbp-240] vmovapd YMMWORD PTR [rbp-240], ymm0 vmulpd ymm0, ymm6, YMMWORD PTR [rbp-432] vpermpd ymm8, ymm3, 0 vsqrtpd ymm13, ymm3 vpermpd ymm9, ymm3, 85 vsqrtpd ymm11, 
ymm8 vmovapd ymm14, ymm0 vmovapd ymm0, YMMWORD PTR [rbp-528] vfnmadd213pd ymm0, ymm7, YMMWORD PTR [rdi+64] vmovapd YMMWORD PTR [rbp-496], ymm11 vsqrtpd ymm11, ymm9 vmulpd ymm9, ymm9, ymm3 vmulpd ymm0, ymm0, ymm15 vfmsub132pd ymm0, ymm14, YMMWORD PTR [rbp-624] vmovapd ymm14, YMMWORD PTR [rbp-304] vmulpd ymm0, ymm0, YMMWORD PTR [rbp-80] vfmsub132pd ymm0, ymm14, YMMWORD PTR [rbp-720] vmovapd ymm14, YMMWORD PTR [rbp-240] vmulpd ymm0, ymm0, YMMWORD PTR [rbp-112] vfmsub132pd ymm0, ymm14, YMMWORD PTR [rbp-848] vsqrtpd ymm14, ymm2 vmulpd ymm0, ymm0, YMMWORD PTR [rbp-144] vfmsub132pd ymm0, ymm12, YMMWORD PTR [rbp-976] vmulpd ymm0, ymm0, YMMWORD PTR [rbp-176] vfmsub132pd ymm0, ymm10, YMMWORD PTR [rbp-1104] vmulpd ymm6, ymm6, YMMWORD PTR [rbp-464] vmulpd ymm5, ymm5, YMMWORD PTR [rbp-400] vmulpd ymm4, ymm4, YMMWORD PTR [rbp-336] vmulpd ymm1, ymm1, YMMWORD PTR [rbp-272] vmulpd ymm0, ymm0, ymm14 vfmsub132pd ymm0, ymm9, YMMWORD PTR [rbp-1040] vmulpd ymm2, ymm2, YMMWORD PTR [rbp-208] vmulpd ymm3, ymm8, ymm3 vmulpd ymm0, ymm0, ymm13 vmulpd ymm11, ymm0, ymm11 vpermpd ymm0, ymm11, 0 vsqrtpd ymm12, ymm11 vmulpd ymm11, ymm0, ymm11 vsqrtpd ymm10, ymm0 vmovapd ymm0, YMMWORD PTR [rbp-560] vfnmadd213pd ymm7, ymm0, YMMWORD PTR [rdi] vmulpd ymm7, ymm7, ymm15 vfmsub132pd ymm7, ymm6, YMMWORD PTR [rbp-592] vmulpd ymm7, ymm7, YMMWORD PTR [rbp-80] vfmsub132pd ymm7, ymm5, YMMWORD PTR [rbp-688] vmulpd ymm7, ymm7, YMMWORD PTR [rbp-112] vfmsub132pd ymm7, ymm4, YMMWORD PTR [rbp-816] vmulpd ymm0, ymm7, YMMWORD PTR [rbp-144] vfmsub132pd ymm0, ymm1, YMMWORD PTR [rbp-944] vmulpd ymm0, ymm0, YMMWORD PTR [rbp-176] vfmsub132pd ymm0, ymm2, YMMWORD PTR [rbp-1072] vmulpd ymm0, ymm0, ymm14 vfmsub132pd ymm0, ymm3, YMMWORD PTR [rbp-912] vmulpd ymm0, ymm0, ymm13 vfmsub132pd ymm0, ymm11, YMMWORD PTR [rbp-496] vmulpd ymm0, ymm0, ymm12 vmulpd ymm0, ymm0, ymm10 vzeroupper add rsp, 1040 pop r10 pop rbp lea rsp, [r10-8] ret [/out]