This is a vectorizer patch I inherited from Ira. As with the fix for PR50969, it handles problems with excessive permutes by adjusting the cost model. In this case the permutes comes from type promotion/demotion, which is now modeled with a new vec_cvt cost.
This restores the lost performance for sphinx3, and gives wupwise an additional boost. This is a performance workaround for 4.7; for 4.8 we want to try to do a better job of modeling the real issue, which is that VSX permutes are constrained to a single processor pipe. Thus isolated permutes are less expensive per instruction than denser collections of permutes. There also remains an opportunity for versioning loops for alignment. Bootstrapped and tested on powerpc64-linux with no regressions. OK for trunk? Thanks, Bill 2012-02-08 Bill Schmidt <wschm...@linux.vnet.ibm.com> Ira Rosen <i...@il.ibm.com> PR tree-optimization/50031 * targhooks.c (default_builtin_vectorization_cost): Handle vec_cvt. * target.h (enum vect_cost_for_stmt): Add vec_cvt. * tree-vect-loop.c (vect_get_single_scalar_iteraion_cost): Handle all types of reduction and pattern statements. (vect_estimate_min_profitable_iters): Likewise. * tree-vect-stmts.c (vect_model_simple_cost): Use vec_cvt cost for type promotions and demotions. (vect_model_conversion_cost): New function. (vect_get_load_cost): Use vec_perm for permutations; add dump logic for explicit realigns. (vectorizable_conversion): Call vect_model_conversion_cost. * config/spu/spu.c (spu_builtin_vectorization_cost): Handle vec_cvt. * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise. * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Update vec_perm for VSX and handle vec_cvt. Index: gcc/targhooks.c =================================================================== --- gcc/targhooks.c (revision 183944) +++ gcc/targhooks.c (working copy) @@ -514,6 +514,7 @@ default_builtin_vectorization_cost (enum vect_cost case scalar_to_vec: case cond_branch_not_taken: case vec_perm: + case vec_cvt: return 1; case unaligned_load: Index: gcc/target.h =================================================================== --- gcc/target.h (revision 183944) +++ gcc/target.h (working copy) @@ -145,7 +145,8 @@ enum vect_cost_for_stmt scalar_to_vec, cond_branch_not_taken, cond_branch_taken, - vec_perm + vec_perm, + vec_cvt }; /* The target structure. This holds all the backend hooks. */ Index: gcc/tree-vect-loop.c =================================================================== --- gcc/tree-vect-loop.c (revision 183944) +++ gcc/tree-vect-loop.c (working copy) @@ -2417,7 +2417,8 @@ vect_get_single_scalar_iteraion_cost (loop_vec_inf if (stmt_info && !STMT_VINFO_RELEVANT_P (stmt_info) && (!STMT_VINFO_LIVE_P (stmt_info) - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) + && !STMT_VINFO_IN_PATTERN_P (stmt_info)) continue; if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))) @@ -2562,17 +2563,51 @@ vect_estimate_min_profitable_iters (loop_vec_info for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) { - gimple stmt = gsi_stmt (si); + gimple stmt = gsi_stmt (si), pattern_stmt; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); /* Skip stmts that are not vectorized inside the loop. */ if (!STMT_VINFO_RELEVANT_P (stmt_info) && (!STMT_VINFO_LIVE_P (stmt_info) - || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) - continue; + || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))) + { + if (STMT_VINFO_IN_PATTERN_P (stmt_info) + && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info)) + && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt)) + || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt)))) + { + stmt = pattern_stmt; + stmt_info = vinfo_for_stmt (stmt); + } + else + continue; + } vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor; /* FIXME: for stmts in the inner-loop in outer-loop vectorization, some of the "outside" costs are generated inside the outer-loop. */ vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info); + if (is_pattern_stmt_p (stmt_info) + && STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)) + { + gimple_stmt_iterator gsi; + + for (gsi = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)); + !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple pattern_def_stmt = gsi_stmt (gsi); + stmt_vec_info pattern_def_stmt_info + = vinfo_for_stmt (pattern_def_stmt); + if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info) + || STMT_VINFO_LIVE_P (pattern_def_stmt_info)) + { + vec_inside_cost + += STMT_VINFO_INSIDE_OF_LOOP_COST + (pattern_def_stmt_info) * factor; + vec_outside_cost + += STMT_VINFO_OUTSIDE_OF_LOOP_COST + (pattern_def_stmt_info); + } + } + } } } Index: gcc/tree-vect-stmts.c =================================================================== --- gcc/tree-vect-stmts.c (revision 183944) +++ gcc/tree-vect-stmts.c (working copy) @@ -787,13 +787,18 @@ vect_model_simple_cost (stmt_vec_info stmt_info, i { int i; int inside_cost = 0, outside_cost = 0; + enum vect_cost_for_stmt cost = vector_stmt; /* The SLP costs were already calculated during SLP tree build. */ if (PURE_SLP_STMT (stmt_info)) return; - inside_cost = ncopies * vect_get_stmt_cost (vector_stmt); + if (STMT_VINFO_TYPE (stmt_info) == type_promotion_vec_info_type + || STMT_VINFO_TYPE (stmt_info) == type_demotion_vec_info_type) + cost = vec_cvt; + inside_cost = ncopies * vect_get_stmt_cost (cost); + /* FORNOW: Assuming maximum 2 args per stmts. */ for (i = 0; i < 2; i++) { @@ -811,6 +816,43 @@ vect_model_simple_cost (stmt_vec_info stmt_info, i } +/* Model cost for type demotion and promotion operations. */ + +static void +vect_model_conversion_cost (stmt_vec_info stmt_info, + enum vect_def_type *dt, int pwr) +{ + int i, tmp; + int inside_cost = 0, outside_cost = 0, single_stmt_cost; + + /* The SLP costs were already calculated during SLP tree build. */ + if (PURE_SLP_STMT (stmt_info)) + return; + + single_stmt_cost = vect_get_stmt_cost (vec_cvt); + for (i = 0; i < pwr + 1; i++) + { + tmp = (STMT_VINFO_TYPE (stmt_info) == type_promotion_vec_info_type) ? + (i + 1) : i; + inside_cost += vect_pow2 (tmp) * single_stmt_cost; + } + + /* FORNOW: Assuming maximum 2 args per stmts. */ + for (i = 0; i < 2; i++) + { + if (dt[i] == vect_constant_def || dt[i] == vect_external_def) + outside_cost += vect_get_stmt_cost (vector_stmt); + } + + if (vect_print_dump_info (REPORT_COST)) + fprintf (vect_dump, "vect_model_conversion_cost: inside_cost = %d, " + "outside_cost = %d .", inside_cost, outside_cost); + + /* Set the costs in STMT_INFO. */ + stmt_vinfo_set_inside_of_loop_cost (stmt_info, NULL, inside_cost); + stmt_vinfo_set_outside_of_loop_cost (stmt_info, NULL, outside_cost); +} + /* Function vect_cost_strided_group_size For strided load or store, return the group_size only if it is the first @@ -887,7 +929,6 @@ vect_model_store_cost (stmt_vec_info stmt_info, in if (vect_print_dump_info (REPORT_COST)) fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .", group_size); - } /* Costs of the stores. */ @@ -1049,7 +1090,7 @@ vect_get_load_cost (struct data_reference *dr, int case dr_explicit_realign: { *inside_cost += ncopies * (2 * vect_get_stmt_cost (vector_load) - + vect_get_stmt_cost (vector_stmt)); + + vect_get_stmt_cost (vec_perm)); /* FIXME: If the misalignment remains fixed across the iterations of the containing loop, the following cost should be added to the @@ -1057,6 +1098,9 @@ vect_get_load_cost (struct data_reference *dr, int if (targetm.vectorize.builtin_mask_for_load) *inside_cost += vect_get_stmt_cost (vector_stmt); + if (vect_print_dump_info (REPORT_COST)) + fprintf (vect_dump, "vect_model_load_cost: explicit realign"); + break; } case dr_explicit_realign_optimized: @@ -1080,7 +1124,7 @@ vect_get_load_cost (struct data_reference *dr, int } *inside_cost += ncopies * (vect_get_stmt_cost (vector_load) - + vect_get_stmt_cost (vector_stmt)); + + vect_get_stmt_cost (vec_perm)); break; } @@ -2392,16 +2436,19 @@ vectorizable_conversion (gimple stmt, gimple_stmt_ if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "=== vectorizable_conversion ==="); if (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR) - STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type; + { + STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type; + vect_model_simple_cost (stmt_info, ncopies, dt, NULL); + } else if (modifier == NARROW) { STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type; - vect_model_simple_cost (stmt_info, ncopies, dt, NULL); + vect_model_conversion_cost (stmt_info, dt, multi_step_cvt); } else { STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type; - vect_model_simple_cost (stmt_info, 2 * ncopies, dt, NULL); + vect_model_conversion_cost (stmt_info, dt, multi_step_cvt); } VEC_free (tree, heap, interm_types); return true; Index: gcc/config/spu/spu.c =================================================================== --- gcc/config/spu/spu.c (revision 183944) +++ gcc/config/spu/spu.c (working copy) @@ -6920,6 +6920,7 @@ spu_builtin_vectorization_cost (enum vect_cost_for case scalar_to_vec: case cond_branch_not_taken: case vec_perm: + case vec_cvt: return 1; case scalar_store: Index: gcc/config/i386/i386.c =================================================================== --- gcc/config/i386/i386.c (revision 183944) +++ gcc/config/i386/i386.c (working copy) @@ -35336,6 +35336,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo return ix86_cost->cond_not_taken_branch_cost; case vec_perm: + case vec_cvt: return ix86_cost->vec_stmt_cost; default: Index: gcc/config/rs6000/rs6000.c =================================================================== --- gcc/config/rs6000/rs6000.c (revision 183944) +++ gcc/config/rs6000/rs6000.c (working copy) @@ -3543,10 +3543,17 @@ rs6000_builtin_vectorization_cost (enum vect_cost_ return 1; case vec_perm: - if (!TARGET_VSX) + if (TARGET_VSX) + return 4; + else return 1; - return 2; + case vec_cvt: + if (TARGET_VSX) + return 5; + else + return 1; + case cond_branch_taken: return 3;