The following fixes an old missed basic-block vectorization issue that shows up as a regression caused by an x86 cost change lumping a lot more code into the same BB.
We are sorting DRs by constant offset, and if there are multiple refs with the same offset we break the DR group. This doesn't handle things like a[0] = ..; a[1] = ..; ... a[0] = ..; a[1] = ..; very well (read: not at all). The temporary fix for GCC 8 to solve the fma3d regression (and avoid a store-to-load forwarding (STLF) issue) is to not break groups at such a point but simply ignore the duplicates we run into during group construction, so only the first group in a BB with exact duplicates will be identified. A more elaborate fix isn't suitable at this stage IMHO (I suggested how to do it in the comment but haven't yet tried it to see how complicated it would end up). Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. Richard. 2017-12-01 Richard Biener <rguent...@suse.de> PR tree-optimization/83232 * tree-vect-data-refs.c (vect_analyze_data_ref_accesses): Fix detection of same access. Instead of breaking the group here do not consider the duplicate. Add comment explaining real fix. * gfortran.dg/vect/pr83232.f90: New testcase. Index: gcc/tree-vect-data-refs.c =================================================================== --- gcc/tree-vect-data-refs.c (revision 255300) +++ gcc/tree-vect-data-refs.c (working copy) @@ -2841,10 +2841,6 @@ vect_analyze_data_ref_accesses (vec_info if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0) break; - /* Do not place the same access in the interleaving chain twice. */ - if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0) - break; - /* Check the types are compatible. ??? We don't distinguish this during sorting. */ if (!types_compatible_p (TREE_TYPE (DR_REF (dra)), @@ -2854,7 +2850,25 @@ vect_analyze_data_ref_accesses (vec_info /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). 
*/ HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra)); HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb)); - gcc_assert (init_a <= init_b); + HOST_WIDE_INT init_prev + = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1])); + gcc_assert (init_a <= init_b + && init_a <= init_prev + && init_prev <= init_b); + + /* Do not place the same access in the interleaving chain twice. */ + if (init_b == init_prev) + { + gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1])) + < gimple_uid (DR_STMT (drb))); + /* ??? For now we simply "drop" the later reference which is + otherwise the same rather than finishing off this group. + In the end we'd want to re-process duplicates forming + multiple groups from the refs, likely by just collecting + all candidates (including duplicates and split points + below) in a vector and then process them together. */ + continue; + } /* If init_b == init_a + the size of the type * k, we have an interleaving, and DRA is accessed before DRB. */ @@ -2866,10 +2880,7 @@ vect_analyze_data_ref_accesses (vec_info /* If we have a store, the accesses are adjacent. This splits groups into chunks we support (we don't support vectorization of stores with gaps). */ - if (!DR_IS_READ (dra) - && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW - (DR_INIT (datarefs_copy[i-1])) - != type_size_a)) + if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a) break; /* If the step (if not zero or non-constant) is greater than the Index: gcc/testsuite/gfortran.dg/vect/pr83232.f90 =================================================================== --- gcc/testsuite/gfortran.dg/vect/pr83232.f90 (nonexistent) +++ gcc/testsuite/gfortran.dg/vect/pr83232.f90 (working copy) @@ -0,0 +1,33 @@ +! { dg-do compile } +! { dg-require-effective-target vect_double } +! 
{ dg-additional-options "-funroll-loops --param vect-max-peeling-for-alignment=0 -fdump-tree-slp-details" } + + SUBROUTINE MATERIAL_41_INTEGRATION ( STRESS,YLDC,EFPS, & + & DTnext,Dxx,Dyy,Dzz,Dxy,Dxz,Dyz,MatID,P1,P3 ) + REAL(KIND(0D0)), INTENT(INOUT) :: STRESS(6) + REAL(KIND(0D0)), INTENT(IN) :: DTnext + REAL(KIND(0D0)), INTENT(IN) :: Dxx,Dyy,Dzz,Dxy,Dxz,Dyz + REAL(KIND(0D0)) :: Einc(6) + REAL(KIND(0D0)) :: P1,P3 + + Einc(1) = DTnext * Dxx ! (1) + Einc(2) = DTnext * Dyy + Einc(3) = DTnext * Dzz + Einc(4) = DTnext * Dxy + Einc(5) = DTnext * Dxz + Einc(6) = DTnext * Dyz + DO i = 1,6 + STRESS(i) = STRESS(i) + P3*Einc(i) + ENDDO + STRESS(1) = STRESS(1) + (DTnext * P1 * (Dxx+Dyy+Dzz)) ! (2) + STRESS(2) = STRESS(2) + (DTnext * P1 * (Dxx+Dyy+Dzz)) + STRESS(3) = 0.0 + Einc(5) = 0.0 ! (3) + Einc(6) = 0.0 + call foo (Einc) + END SUBROUTINE + +! We should vectorize (1) and (2) +! { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "slp1" } } +! We fail to vectorize at (3), this can be fixed in the future +! { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "slp1" { xfail *-*-* } } }