gcc/ChangeLog:

2017-04-26  Robin Dapp  <rd...@linux.vnet.ibm.com>

	* tree-vect-data-refs.c (vect_peeling_hash_get_lowest_cost):
	Change cost model.
	(vect_peeling_hash_choose_best_peeling): Return extended peel info.
	(vect_peeling_supportable): Return peeling status.
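As context for review, the selection logic this patch introduces boils down
to the stand-alone sketch below.  It is a model, not GCC code: the struct
loosely mirrors _vect_peel_extended_info, while the helper names
(choose_best_peel, no_peeling_is_cheaper), the "valid" flag standing in for
peel_info.dr != NULL, and the sample costs in main are made up purely for
illustration.

/* Sketch of the peeling decision; compile with g++.  All names are
   illustrative only.  */
#include <cstdio>

struct peel_info_t
{
  unsigned int npeel;  /* Iterations to peel; 0 means peel vf/2 at runtime.  */
  unsigned int count;  /* Number of datarefs this peeling aligns.  */
  bool valid;          /* Stands in for peel_info.dr != NULL.  */
};

struct peel_extended_info_t
{
  peel_info_t peel_info;
  unsigned int inside_cost;   /* Cost inside the vectorized loop body.  */
  unsigned int outside_cost;  /* Prologue/epilogue cost.  */
};

/* If only one candidate is valid, take it; if both are, prefer the one
   that aligns more datarefs in total.  */
static peel_extended_info_t
choose_best_peel (peel_extended_info_t known, peel_extended_info_t unknown)
{
  if (known.peel_info.valid && !unknown.peel_info.valid)
    return known;
  if (!known.peel_info.valid && unknown.peel_info.valid)
    return unknown;
  return (unknown.peel_info.count > known.peel_info.count) ? unknown : known;
}

/* Doing no peeling wins if it is not more expensive than the best peeling:
   compare inside costs first, outside costs as the tie-breaker.  */
static bool
no_peeling_is_cheaper (unsigned int nopeel_inside, unsigned int nopeel_outside,
		       peel_extended_info_t best)
{
  return nopeel_inside < best.inside_cost
	 || (nopeel_inside == best.inside_cost
	     && nopeel_outside <= best.outside_cost);
}

int
main ()
{
  /* Made-up costs: the known-alignment candidate aligns 2 datarefs,
     the unknown-alignment candidate aligns 3.  */
  peel_extended_info_t known = { { 4, 2, true }, 40, 10 };
  peel_extended_info_t unknown = { { 0, 3, true }, 50, 12 };

  peel_extended_info_t best = choose_best_peel (known, unknown);
  printf ("best peeling aligns %u datarefs (npeel %u)\n",
	  best.peel_info.count, best.peel_info.npeel);

  if (no_peeling_is_cheaper (45, 8, best))
    printf ("no peeling is selected\n");
  else
    printf ("peeling is selected\n");
  return 0;
}

Playing with the sample costs shows the two tie-breaking rules: between the
two peeling candidates, aligning more datarefs wins; and no peeling wins
whenever it is not strictly more expensive than the best peeling.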
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 7b68582..da49e35 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -904,9 +904,9 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
 
 /* Function vect_update_misalignment_for_peel.
 
-   Sets DR's misalignment
+   Set DR's misalignment
    - to 0 if it has the same alignment as DR_PEEL,
-   - to the misalignment computed using NPEEL if DR's salignment is known,
+   - to the misalignment computed using NPEEL if DR's misalignment is known,
    - to -1 (unknown) otherwise.
 
    DR - the data reference whose misalignment is to be adjusted.
@@ -1293,7 +1293,7 @@ vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
 {
   vect_peel_info elem = *slot;
   int dummy;
-  unsigned int inside_cost = 0, outside_cost = 0, i;
+  unsigned int inside_cost = 0, outside_cost = 0;
   gimple *stmt = DR_STMT (elem->dr);
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
@@ -1342,7 +1342,7 @@ vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
    choosing an option with the lowest cost (if cost model is enabled) or the
    option that aligns as many accesses as possible.  */
 
-static struct data_reference *
+static struct _vect_peel_extended_info
 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
                                        loop_vec_info loop_vinfo,
                                        unsigned int *npeel,
@@ -1369,7 +1369,7 @@ vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_hta
 
   *npeel = res.peel_info.npeel;
   *body_cost_vec = res.body_cost_vec;
-  return res.peel_info.dr;
+  return res;
 }
 
 /* Return true if the new peeling NPEEL is supported.  */
@@ -1518,7 +1518,11 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   enum dr_alignment_support supportable_dr_alignment;
-  struct data_reference *dr0 = NULL, *first_store = NULL;
+
+  struct data_reference *most_frequent_read = NULL;
+  unsigned int dr_read_count = 0;
+  struct data_reference *most_frequent_write = NULL;
+  unsigned int dr_write_count = 0;
   struct data_reference *dr;
   unsigned int i, j;
   bool do_peeling = false;
@@ -1527,11 +1531,13 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
   gimple *stmt;
   stmt_vec_info stmt_info;
   unsigned int npeel = 0;
-  bool all_misalignments_unknown = true;
+  bool one_misalignment_known = false;
+  bool one_misalignment_unknown = false;
+  unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   unsigned possible_npeel_number = 1;
   tree vectype;
-  unsigned int nelements, mis, same_align_drs_max = 0;
+  unsigned int nelements, mis;
   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
 
   hash_table<peel_info_hasher> peeling_htab (1);
 
@@ -1652,57 +1658,67 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
               npeel_tmp += nelements;
             }
 
-          all_misalignments_unknown = false;
-          /* Data-ref that was chosen for the case that all the
-             misalignments are unknown is not relevant anymore, since we
-             have a data-ref with known alignment.  */
-          dr0 = NULL;
+          one_misalignment_known = true;
         }
       else
         {
-          /* If we don't know any misalignment values, we prefer
-             peeling for data-ref that has the maximum number of data-refs
-             with the same alignment, unless the target prefers to align
-             stores over load.  */
-          if (all_misalignments_unknown)
-            {
-              unsigned same_align_drs
-                = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
-              if (!dr0
-                  || same_align_drs_max < same_align_drs)
-                {
-                  same_align_drs_max = same_align_drs;
-                  dr0 = dr;
-                }
-              /* For data-refs with the same number of related
-                 accesses prefer the one where the misalign
-                 computation will be invariant in the outermost loop.  */
-              else if (same_align_drs_max == same_align_drs)
+          /* If we don't know any misalignment values, we prefer
+             peeling for data-ref that has the maximum number of data-refs
+             with the same alignment, unless the target prefers to align
+             stores over load.  */
+          unsigned same_align_dr_count
+            = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
+
+          /* For data-refs with the same number of related
+             accesses prefer the one where the misalign
+             computation will be invariant in the outermost loop.  */
+          struct loop *ivloop_max_read = NULL, *ivloop_max_write = NULL,
+            *ivloop_dr = NULL;
+          if (most_frequent_read)
+            ivloop_max_read = outermost_invariant_loop_for_expr
+              (loop, DR_BASE_ADDRESS (most_frequent_read));
+          if (most_frequent_write)
+            ivloop_max_write = outermost_invariant_loop_for_expr
+              (loop, DR_BASE_ADDRESS (most_frequent_write));
+          ivloop_dr = outermost_invariant_loop_for_expr
+            (loop, DR_BASE_ADDRESS (dr));
+
+          if (DR_IS_READ (dr))
+            {
+              if (!most_frequent_read
+                  || (same_align_dr_count > dr_read_count))
                 {
-                  struct loop *ivloop0, *ivloop;
-                  ivloop0 = outermost_invariant_loop_for_expr
-                    (loop, DR_BASE_ADDRESS (dr0));
-                  ivloop = outermost_invariant_loop_for_expr
-                    (loop, DR_BASE_ADDRESS (dr));
-                  if ((ivloop && !ivloop0)
-                      || (ivloop && ivloop0
-                          && flow_loop_nested_p (ivloop, ivloop0)))
-                    dr0 = dr;
+                  most_frequent_read = dr;
+                  dr_read_count = same_align_dr_count;
                 }
+              else if (same_align_dr_count == dr_read_count
+                       && ((ivloop_dr && !ivloop_max_read)
+                           || (ivloop_dr && ivloop_max_read
+                               && flow_loop_nested_p
+                                    (ivloop_dr, ivloop_max_read))))
+                {
+                  most_frequent_read = dr;
+                }
+            }
+          else if (DR_IS_WRITE (dr))
+            {
+              if (!most_frequent_write
+                  || (same_align_dr_count > dr_write_count))
+                {
+                  most_frequent_write = dr;
+                  dr_write_count = same_align_dr_count;
+                }
+              else if (same_align_dr_count == dr_write_count
+                       && ((ivloop_dr && !ivloop_max_write)
+                           || (ivloop_dr && ivloop_max_write
+                               && flow_loop_nested_p
+                                    (ivloop_dr, ivloop_max_write))))
+                {
+                  most_frequent_write = dr;
+                }
+            }
 
-              if (!first_store && DR_IS_WRITE (dr))
-                first_store = dr;
-            }
-
-          /* If there are both known and unknown misaligned accesses in the
-             loop, we choose peeling amount according to the known
-             accesses.  */
-          if (!supportable_dr_alignment)
-            {
-              dr0 = dr;
-              if (!first_store && DR_IS_WRITE (dr))
-                first_store = dr;
-            }
+          one_misalignment_unknown = true;
         }
     }
   else
@@ -1723,111 +1739,207 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
       || loop->inner)
     do_peeling = false;
 
-  if (do_peeling
-      && all_misalignments_unknown
-      && vect_supportable_dr_alignment (dr0, false))
+  struct _vect_peel_extended_info best_peel;
+  struct _vect_peel_extended_info peel_for_known_alignment;
+  peel_for_known_alignment.peel_info.dr = NULL;
+  struct _vect_peel_extended_info peel_for_unknown_alignment;
+  peel_for_unknown_alignment.peel_info.dr = NULL;
+
+  if (do_peeling && one_misalignment_known)
     {
-      /* Check if the target requires to prefer stores over loads, i.e., if
-         misaligned stores are more expensive than misaligned loads (taking
-         drs with same alignment into account).  */
-      if (first_store && DR_IS_READ (dr0))
-        {
-          unsigned int load_inside_cost = 0, load_outside_cost = 0;
-          unsigned int store_inside_cost = 0, store_outside_cost = 0;
-          unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
-          unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
+      /* Peeling is possible, but there is no data access that is not supported
+         unless aligned.  So we try to choose the best possible peeling.  */
+
+      /* Choose the best peeling from the hash table.  */
+      peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
+        (&peeling_htab, loop_vinfo, &npeel, &body_cost_vec);
+    }
+
+  if (do_peeling && one_misalignment_unknown)
+    {
+      /* Calculate the costs for aligning MOST_FREQUENT_READ, potentially
+         leaving everything else misaligned.  */
+      unsigned int align_mf_read_inside_cost = 0;
+      unsigned int align_mf_read_outside_cost = 0;
+
+      if (most_frequent_read)
+        {
           stmt_vector_for_cost dummy;
           dummy.create (2);
+          vect_get_peeling_costs_all_drs (most_frequent_read,
+                                          &align_mf_read_inside_cost,
+                                          &align_mf_read_outside_cost,
+                                          &dummy, vf / 2, vf);
+          dummy.release();
+        }
+      else
+        {
+          align_mf_read_inside_cost = UINT_MAX;
+          align_mf_read_outside_cost = UINT_MAX;
+        }
 
-          vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
-                                     &dummy);
-          vect_get_data_access_cost (first_store, &store_inside_cost,
-                                     &store_outside_cost, &dummy);
+      /* Calculate the costs for aligning MOST_FREQUENT_WRITE, potentially
+         leaving everything else misaligned.  */
+      unsigned int align_mf_write_inside_cost = 0;
+      unsigned int align_mf_write_outside_cost = 0;
+      if (most_frequent_write)
+        {
+          stmt_vector_for_cost dummy;
+          dummy.create (2);
+          vect_get_peeling_costs_all_drs (most_frequent_write,
+                                          &align_mf_write_inside_cost,
+                                          &align_mf_write_outside_cost,
+                                          &dummy, vf / 2, vf);
           dummy.release ();
+        }
+      else
+        {
+          align_mf_write_inside_cost = UINT_MAX;
+          align_mf_write_outside_cost = UINT_MAX;
+        }
 
-          /* Calculate the penalty for leaving FIRST_STORE unaligned (by
-             aligning the load DR0).  */
-          load_inside_penalty = store_inside_cost;
-          load_outside_penalty = store_outside_cost;
-          for (i = 0;
-               STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
-                                           DR_STMT (first_store))).iterate (i, &dr);
-               i++)
-            if (DR_IS_READ (dr))
-              {
-                load_inside_penalty += load_inside_cost;
-                load_outside_penalty += load_outside_cost;
-              }
-            else
-              {
-                load_inside_penalty += store_inside_cost;
-                load_outside_penalty += store_outside_cost;
-              }
-
-          /* Calculate the penalty for leaving DR0 unaligned (by
-             aligning the FIRST_STORE).  */
-          store_inside_penalty = load_inside_cost;
-          store_outside_penalty = load_outside_cost;
-          for (i = 0;
-               STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
-                                           DR_STMT (dr0))).iterate (i, &dr);
-               i++)
-            if (DR_IS_READ (dr))
-              {
-                store_inside_penalty += load_inside_cost;
-                store_outside_penalty += load_outside_cost;
-              }
-            else
-              {
-                store_inside_penalty += store_inside_cost;
-                store_outside_penalty += store_outside_cost;
-              }
-
-          if (load_inside_penalty > store_inside_penalty
-              || (load_inside_penalty == store_inside_penalty
-                  && load_outside_penalty > store_outside_penalty))
-            dr0 = first_store;
-        }
+      /* Choose best peeling according to given load and store peeling
+         costs.  */
+      if (align_mf_read_inside_cost > align_mf_write_inside_cost
+          || (align_mf_read_inside_cost == align_mf_write_inside_cost
+              && align_mf_read_outside_cost > align_mf_write_outside_cost))
+        {
+          peel_for_unknown_alignment.peel_info.dr = most_frequent_write;
+          peel_for_unknown_alignment.peel_info.count =
+            1 + STMT_VINFO_SAME_ALIGN_REFS
+                  (vinfo_for_stmt (DR_STMT (most_frequent_write))).length ();
+          peel_for_unknown_alignment.inside_cost = align_mf_write_inside_cost;
+          peel_for_unknown_alignment.outside_cost =
+            align_mf_write_outside_cost;
+        }
+      else
+        {
+          if (most_frequent_read)
+            {
+              peel_for_unknown_alignment.peel_info.dr = most_frequent_read;
+              peel_for_unknown_alignment.peel_info.count =
+                1 + STMT_VINFO_SAME_ALIGN_REFS
+                      (vinfo_for_stmt (DR_STMT (most_frequent_read))).length ();
+            }
+          else
+            {
+              peel_for_unknown_alignment.peel_info.dr = most_frequent_write;
+              peel_for_unknown_alignment.peel_info.count =
+                1 + STMT_VINFO_SAME_ALIGN_REFS
+                      (vinfo_for_stmt (DR_STMT (most_frequent_write))).length ();
+            }
+          peel_for_unknown_alignment.inside_cost = align_mf_read_inside_cost;
+          peel_for_unknown_alignment.outside_cost =
+            align_mf_read_outside_cost;
+        }
+
+      /* Add prologue and epilogue costs.  */
+      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
+      prologue_cost_vec.create (2);
+      epilogue_cost_vec.create (2);
+
+      int dummy;
+      peel_for_unknown_alignment.outside_cost
+        += vect_get_known_peeling_cost (loop_vinfo, vf / 2, &dummy,
+                                        &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
+                                        &prologue_cost_vec, &epilogue_cost_vec);
 
-      /* In case there are only loads with different unknown misalignments, use
-         peeling only if it may help to align other accesses in the loop or
-         if it may help improving load bandwith when we'd end up using
-         unaligned loads.  */
-      tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
-      if (!first_store
-          && !STMT_VINFO_SAME_ALIGN_REFS (
-                  vinfo_for_stmt (DR_STMT (dr0))).length ()
-          && (vect_supportable_dr_alignment (dr0, false)
-              != dr_unaligned_supported
-              || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
-                  == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
-        do_peeling = false;
+      prologue_cost_vec.release ();
+      epilogue_cost_vec.release ();
+
+      /* The code below expects npeel == 0 when we plan to peel vf/2
+         iterations, so do not set npeel = vf/2 here.  */
+      peel_for_unknown_alignment.peel_info.npeel = 0;
     }
 
-  if (do_peeling && !dr0)
+  /* At this point, we have to choose between peeling for the datarefs with
+     known alignment and the ones with unknown alignment.  Prefer the one
+     that aligns more datarefs in total.  */
+  struct data_reference *dr0 = NULL;
+  if (do_peeling)
     {
-      /* Peeling is possible, but there is no data access that is not supported
-         unless aligned.  So we try to choose the best possible peeling.  */
+      bool peel_for_unknown_alignment_valid =
+        peel_for_unknown_alignment.peel_info.dr != NULL;
+      bool peel_for_known_alignment_valid =
+        peel_for_known_alignment.peel_info.dr != NULL;
 
-      /* We should get here only if there are drs with known misalignment.  */
-      gcc_assert (!all_misalignments_unknown);
+      gcc_assert (peel_for_known_alignment_valid
+                  || peel_for_unknown_alignment_valid);
 
-      /* Choose the best peeling from the hash table.  */
-      dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
-                                                   loop_vinfo, &npeel,
-                                                   &body_cost_vec);
-      if (!dr0 || !npeel)
-        do_peeling = false;
+      if (peel_for_known_alignment_valid && !peel_for_unknown_alignment_valid)
+        best_peel = peel_for_known_alignment;
+
+      else if
+        (!peel_for_known_alignment_valid && peel_for_unknown_alignment_valid)
+        best_peel = peel_for_unknown_alignment;
+
+      else
+        {
+          /* Choose the best peeling for known and unknown alignment
+             according to the number of aligned datarefs.  */
+          if (peel_for_unknown_alignment.peel_info.count
+              > peel_for_known_alignment.peel_info.count)
+            best_peel = peel_for_unknown_alignment;
+          else
+            best_peel = peel_for_known_alignment;
+        }
     }
 
+  /* Calculate the penalty for no peeling, i.e. leaving everything
+     unaligned.
+     TODO: use something like an adapted vect_get_peeling_costs_all_drs.  */
+  unsigned nopeel_inside_cost = 0;
+  unsigned nopeel_outside_cost = 0;
+
+  stmt_vector_for_cost dummy;
+  dummy.create (2);
+  FOR_EACH_VEC_ELT (datarefs, i, dr)
+    {
+      vect_get_data_access_cost (dr, &nopeel_inside_cost,
+                                 &nopeel_outside_cost, &dummy);
+    }
+  dummy.release ();
+
+  /* Add epilogue costs.  As we do no peeling for alignment here, no prologue
+     costs will be recorded.  */
+  stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
+  prologue_cost_vec.create (2);
+  epilogue_cost_vec.create (2);
+
+  int dummy2;
+  nopeel_outside_cost += vect_get_known_peeling_cost
+    (loop_vinfo, 0, &dummy2,
+     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
+     &prologue_cost_vec, &epilogue_cost_vec);
+
+  prologue_cost_vec.release ();
+  epilogue_cost_vec.release ();
+
+
+  /* Check if doing no peeling is not more expensive than the best peeling we
+     have so far.  */
+  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
+      && ((nopeel_inside_cost < best_peel.inside_cost)
+          || (nopeel_inside_cost == best_peel.inside_cost
+              && nopeel_outside_cost <= best_peel.outside_cost)))
+    {
+      do_peeling = false;
+      npeel = 0;
+    }
+
   if (do_peeling)
     {
+      dr0 = best_peel.peel_info.dr;
+      npeel = best_peel.peel_info.npeel;
+
       stmt = DR_STMT (dr0);
       stmt_info = vinfo_for_stmt (stmt);
       vectype = STMT_VINFO_VECTYPE (stmt_info);
       nelements = TYPE_VECTOR_SUBPARTS (vectype);
 
+      /* Define a peeling if not already set and log it.  */
       if (known_alignment_for_access_p (dr0))
         {
           bool negative = DR_HAS_NEGATIVE_STEP (dr0);