On Thu, 17 Dec 2020, Jakub Jelinek wrote: > On Wed, Dec 16, 2020 at 09:29:31AM +0100, Richard Biener wrote: > > I think it probably makes sense to have some helper split out that > > collects & classifies vector constructor components we can use from > > both forwprop (where matching the V_C_E from integer could be done > > as well IMHO) and bswap (when a permute is involved) and store-merging. > > I've tried to add such helper, but handling over just analysis and letting > each pass handle it differently seems complicated given the limitations of > the bswap infrastructure. > > So, this patch just hooks the optimization also into store-merging so that > the original testcase from the PR can be fixed. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
OK, but ... > Note, I have no idea why the bswap code needs TODO_update_ssa if it changed > things, for the vuses it copies them from the surrounding vuses, which looks > correct to me. Perhaps because it uses force_gimple_operand_gsi* in a few > spots in bswap_replace? Confused... .. that shouldn't cause updating SSA to be necessary. Maybe it at some point did not update virtual operands appropriately. Richard. > 2020-12-17 Jakub Jelinek <ja...@redhat.com> > > PR tree-optimization/96239 > * gimple-ssa-store-merging.c (maybe_optimize_vector_constructor): New > function. > (get_status_for_store_merging): Don't return BB_INVALID for blocks > with potential bswap optimizable CONSTRUCTORs. > (pass_store_merging::execute): Optimize vector CONSTRUCTORs with bswap > if possible. > > * gcc.dg/tree-ssa/pr96239.c: New test. > > --- gcc/gimple-ssa-store-merging.c.jj 2020-12-16 13:07:51.729733816 +0100 > +++ gcc/gimple-ssa-store-merging.c 2020-12-16 16:02:06.238868137 +0100 > @@ -1255,6 +1255,75 @@ bswap_replace (gimple_stmt_iterator gsi, > return tgt; > } > > +/* Try to optimize an assignment CUR_STMT with CONSTRUCTOR on the rhs > + using bswap optimizations. CDI_DOMINATORS need to be > + computed on entry. Return true if it has been optimized and > + TODO_update_ssa is needed. */ > + > +static bool > +maybe_optimize_vector_constructor (gimple *cur_stmt) > +{ > + tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type; > + struct symbolic_number n; > + bool bswap; > + > + gcc_assert (is_gimple_assign (cur_stmt) > + && gimple_assign_rhs_code (cur_stmt) == CONSTRUCTOR); > + > + tree rhs = gimple_assign_rhs1 (cur_stmt); > + if (!VECTOR_TYPE_P (TREE_TYPE (rhs)) > + || !INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs))) > + || gimple_assign_lhs (cur_stmt) == NULL_TREE) > + return false; > + > + HOST_WIDE_INT sz = int_size_in_bytes (TREE_TYPE (rhs)) * BITS_PER_UNIT; > + switch (sz) > + { > + case 16: > + load_type = bswap_type = uint16_type_node; > + break; > + case 32: > + if (builtin_decl_explicit_p (BUILT_IN_BSWAP32) > + && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing) > + { > + load_type = uint32_type_node; > + fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32); > + bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl))); > + } > + else > + return false; > + break; > + case 64: > + if (builtin_decl_explicit_p (BUILT_IN_BSWAP64) > + && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing > + || (word_mode == SImode > + && builtin_decl_explicit_p (BUILT_IN_BSWAP32) > + && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing))) > + { > + load_type = uint64_type_node; > + fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64); > + bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl))); > + } > + else > + return false; > + break; > + default: > + return false; > + } > + > + gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap); > + if (!ins_stmt || n.range != (unsigned HOST_WIDE_INT) sz) > + return false; > + > + if (bswap && !fndecl && n.range != 16) > + return false; > + > + memset (&nop_stats, 0, sizeof (nop_stats)); > + memset (&bswap_stats, 0, sizeof (bswap_stats)); > + return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl, > + bswap_type, load_type, &n, bswap) != NULL_TREE; > +} > + > /* Find manual byte swap implementations as well as load in a given > endianness. Byte swaps are turned into a bswap builtin invokation > while endian loads are converted to bswap builtin invokation or > @@ -5126,6 +5195,7 @@ static enum basic_block_status > get_status_for_store_merging (basic_block bb) > { > unsigned int num_statements = 0; > + unsigned int num_constructors = 0; > gimple_stmt_iterator gsi; > edge e; > > @@ -5138,9 +5208,27 @@ get_status_for_store_merging (basic_bloc > > if (store_valid_for_store_merging_p (stmt) && ++num_statements >= 2) > break; > + > + if (is_gimple_assign (stmt) > + && gimple_assign_rhs_code (stmt) == CONSTRUCTOR) > + { > + tree rhs = gimple_assign_rhs1 (stmt); > + if (VECTOR_TYPE_P (TREE_TYPE (rhs)) > + && INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs))) > + && gimple_assign_lhs (stmt) != NULL_TREE) > + { > + HOST_WIDE_INT sz > + = int_size_in_bytes (TREE_TYPE (rhs)) * BITS_PER_UNIT; > + if (sz == 16 || sz == 32 || sz == 64) > + { > + num_constructors = 1; > + break; > + } > + } > + } > } > > - if (num_statements == 0) > + if (num_statements == 0 && num_constructors == 0) > return BB_INVALID; > > if (cfun->can_throw_non_call_exceptions && cfun->eh > @@ -5149,7 +5237,7 @@ get_status_for_store_merging (basic_bloc > && e->dest == bb->next_bb) > return BB_EXTENDED_VALID; > > - return num_statements >= 2 ? BB_VALID : BB_INVALID; > + return (num_statements >= 2 || num_constructors) ? BB_VALID : BB_INVALID; > } > > /* Entry point for the pass. Go over each basic block recording chains of > @@ -5163,6 +5251,7 @@ pass_store_merging::execute (function *f > basic_block bb; > hash_set<gimple *> orig_stmts; > bool changed = false, open_chains = false; > + unsigned todo = 0; > > /* If the function can throw and catch non-call exceptions, we'll be trying > to merge stores across different basic blocks so we need to first > unsplit > @@ -5189,9 +5278,10 @@ pass_store_merging::execute (function *f > if (dump_file && (dump_flags & TDF_DETAILS)) > fprintf (dump_file, "Processing basic block <%d>:\n", bb->index); > > - for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi)) > + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); ) > { > gimple *stmt = gsi_stmt (gsi); > + gsi_next (&gsi); > > if (is_gimple_debug (stmt)) > continue; > @@ -5207,6 +5297,14 @@ pass_store_merging::execute (function *f > continue; > } > > + if (is_gimple_assign (stmt) > + && gimple_assign_rhs_code (stmt) == CONSTRUCTOR > + && maybe_optimize_vector_constructor (stmt)) > + { > + todo = TODO_update_ssa; > + continue; > + } > + > if (store_valid_for_store_merging_p (stmt)) > changed |= process_store (stmt); > else > @@ -5230,10 +5328,10 @@ pass_store_merging::execute (function *f > if (cfun->can_throw_non_call_exceptions && cfun->eh && changed) > { > free_dominance_info (CDI_DOMINATORS); > - return TODO_cleanup_cfg; > + return TODO_cleanup_cfg | todo; > } > > - return 0; > + return todo; > } > > } // anon namespace > --- gcc/testsuite/gcc.dg/tree-ssa/pr96239.c.jj 2020-12-16 > 16:10:30.013256862 +0100 > +++ gcc/testsuite/gcc.dg/tree-ssa/pr96239.c 2020-12-16 16:11:08.802822537 > +0100 > @@ -0,0 +1,18 @@ > +/* PR tree-optimization/96239 */ > +/* { dg-do compile { target { ilp32 || lp64 } } } */ > +/* { dg-options "-O3 -fdump-tree-optimized" } */ > +/* { dg-final { scan-tree-dump " r>> 8;" "optimized" { target bswap } } } */ > + > +union U { unsigned char c[2]; unsigned short s; }; > + > +unsigned short > +foo (unsigned short x) > +{ > + union U u; > + u.s = x; > + unsigned char v = u.c[0]; > + unsigned char w = u.c[1]; > + u.c[0] = w; > + u.c[1] = v; > + return u.s; > +} > > > Jakub > > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg, Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)