On Tue, Dec 15, 2020 at 06:10:57PM +0800, Hongtao Liu via Gcc-patches wrote: > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -18187,21 +18187,67 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) > } > break; > > + case IX86_BUILTIN_SHUFPD512: > + case IX86_BUILTIN_SHUFPS512: > + if (n_args > 2) > + { > + /* This is masked shuffle. Only optimize if the mask is all ones. */ > + tree argl = gimple_call_arg (stmt, n_args - 1); > + arg0 = gimple_call_arg (stmt, 0); > + if (!tree_fits_uhwi_p (argl)) > + break; > + unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl); > + unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
I think it would be better not to mix the argl and arg0 stuff. So e.g. do arg0 = gimple_call_arg (stmt, 0); unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); first and then the argl stuff, or vice versa. Furthermore, you don't really care about the upper bits of argl, so why don't punt just if (TREE_CODE (argl) != INTEGER_CST) and use mask = TREE_LOW_CST (argl); ? > + if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) > + break; > + } > + /* Fall thru. */ > case IX86_BUILTIN_SHUFPD: > + case IX86_BUILTIN_SHUFPD256: > + case IX86_BUILTIN_SHUFPS: > + case IX86_BUILTIN_SHUFPS256: > arg2 = gimple_call_arg (stmt, 2); > if (TREE_CODE (arg2) == INTEGER_CST) > { > - location_t loc = gimple_location (stmt); > - unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2); > arg0 = gimple_call_arg (stmt, 0); > + unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > + machine_mode imode = GET_MODE_INNER (TYPE_MODE (TREE_TYPE (arg0))); > + unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2); > + > + /* Check valid imm, refer to gcc.target/i386/testimm-10.c. */ > + if (imask > 255 > + || (imask >= HOST_WIDE_INT_1U << elems > + && imode == E_DFmode)) > + return false; Why is this extra checking done only for DFmode and not for SFmode? > + tree itype = imode == E_DFmode > + ? long_long_integer_type_node : integer_type_node; Formatting. Should be e.g. tree itype = (imode == E_DFmode ? long_long_integer_type_node : integer_type_node); or tree itype = (imode == E_DFmode ? long_long_integer_type_node : integer_type_node); but the ? which is part of the imode == E_DFmode ... subexpression can't just be below something unrelated. > + if (imode == E_DFmode) > + sel_idx = (i & 1) * elems > + + (i >> 1 << 1) + ((imask & 1 << i) >> i); Again, formatting. Plus, i >> 1 << 1 looks too ugly/unreadable, if you mean i & ~1, write it like that, it is up to the compiler to emit it like i >> 1 << 1 if that is the best implementation. > + else > + { > + /* Imm[7:0](if VL > 128, also use Imm[7:0]) provide 4 select > + controls for each element of the destination. */ > + unsigned j = i % 4; > + sel_idx = ((i & 2) >> 1) * elems > + + (i >> 2 << 2) + ((imask & 3 << j << j) >> j >> j); Ditto. Jakub