On Sat, May 18, 2024 at 4:10 AM Roger Sayle <ro...@nextmovesoftware.com> wrote: > > > Hi Hongtao, > Many thanks for the review, bug fixes and suggestions for improvements. > This revised version of the patch, implements all of your corrections. In > theory > the "ternlog idx" should guarantee that some operands are non-null, but I > agree > that it's better defensive programming to check invariants not easily proved. > Instead of calling ix86_expand_vector_move, I use ix86_broadcast_from_constant > to achieve the same effect of using a broadcast when possible, but has the > benefit > of still using a memory operand (instead of a vector load) when broadcasting > isn't > possible. There are other places that could benefit from the same trick, but > I can > address these in a follow-up patch (it may even be preferrable to keep these > as > CONST_VECTOR during early RTL passes and lower to broadcast or constant pool > using splitters). > > This revised patch has been tested on x86_64-pc-linux-gnu with make bootstrap > and make -k check, both with and without --target_board=unix{-m32} > with no new failures. Ok for mainline? 1 file changed, 41 insertions(+) gcc/config/i386/i386-expand.cc | 41 +++++++++++++++++++++++++++++++++++++++++
modified gcc/config/i386/i386-expand.cc @@ -25579,14 +25579,22 @@ ix86_gen_bcst_mem (machine_mode mode, rtx x) && !CONST_DOUBLE_P (cst) && !CONST_FIXED_P (cst)) return NULL_RTX; + /* I think VALID_BCST_MODE_P should be sufficient to + make sure cst is CONST_INT or CONST_DOUBLE. */ int n_elts = GET_MODE_NUNITS (mode); if (CONST_VECTOR_NUNITS (x) != n_elts) return NULL_RTX; + /* Do we need this? I saw from caller side there's already + if (GET_MODE (op2) != mode) + op2 = gen_lowpart (mode, op2); + tmp2 = ix86_gen_bcst_mem (mode, op2); */ + for (int i = 1; i < n_elts; i++) if (!rtx_equal_p (cst, CONST_VECTOR_ELT (x, i))) return NULL_RTX; + /* CONST_VECTOR_DUPLICATE_P (op)? */ rtx mem = force_const_mem (GET_MODE_INNER (mode), cst); return gen_rtx_VEC_DUPLICATE (mode, validize_mem (mem)); @@ -25709,6 +25717,21 @@ ix86_ternlog_idx (rtx op, rtx *args) || ix86_ternlog_idx (XVECEXP (op, 0, 2), args) != 0xaa) return -1; return INTVAL (XVECEXP (op, 0, 3)); + /* I think we can add some testcase for this. + .i.e + #include <immintrin.h> + + __m256i + foo (__m256i a, __m256i b, __m256i c) + { + return (a & _mm256_ternarylogic_epi64 (a, b, c, 0xe4)); + } + + __m256i + foo1 (__m256i a, __m256i b, __m256i c) + { + return (b & _mm256_ternarylogic_epi64 (a, b, c, 0xe4)); + } */ default: return -1; @@ -25778,6 +25801,8 @@ ix86_ternlog_operand_p (rtx op) if (ix86_ternlog_leaf_p (XEXP (op, 0), mode) && (ix86_ternlog_leaf_p (op1, mode) || vector_all_ones_operand (op1, mode))) + /* There's CONST_VECTOR check in x86_ternlog_leaf_p, + so vector_all_ones_operand is not needed. */ return false; break; @@ -25862,6 +25887,10 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, if ((!op0 || !side_effects_p (op0)) && (!op1 || !side_effects_p (op1)) && (!op2 || !side_effects_p (op2))) + /* I think only op2 needs to check side_effects_p, op0 + and op1 must be register operand when it exists, no need for side_effects_p? + Similar for all below side_effects_p (op0/op1) + the check is redundant. */ { emit_move_insn (target, CONST0_RTX (mode)); return target; @@ -25872,6 +25901,9 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, if ((!op1 || !side_effects_p (op1)) && op0 && register_operand (op0, mode) && op2 && register_operand (op2, mode)) + /* op0/op1 must be register_operand when it exists, + so register_operand (op0/op1, mode) is not needed. + similar for all below register_operand (op0/op1, mode). */ return ix86_expand_ternlog_andnot (mode, op0, op2, target); break; @@ -25879,6 +25911,7 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, if ((!op2 || !side_effects_p (op2)) && op0 && register_operand (op0, mode) && op1 && register_operand (op1, mode)) + /* op0 && op1? */ return ix86_expand_ternlog_andnot (mode, op0, op1, target); break; @@ -25948,6 +25981,7 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, if ((!op0 || !side_effects_p (op0)) && (!op1 || !side_effects_p (op1)) && op2) + /* if (op2). */ { if (GET_MODE (op2) != mode) op2 = gen_lowpart (mode, op2); @@ -25961,18 +25995,21 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, case 0x5a: /* a^c */ if (op0 && op2 && (!op1 || !side_effects_p (op1))) + /* if (op0 && op2). */ return ix86_expand_ternlog_binop (XOR, mode, op0, op2, target); break; case 0x66: /* b^c */ if ((!op0 || !side_effects_p (op0)) && op1 && op2) + /* if (op1 && op2). */ return ix86_expand_ternlog_binop (XOR, mode, op1, op2, target); break; case 0x88: /* b&c */ if ((!op0 || !side_effects_p (op0)) && op1 && op2) + /* if (op1 && op2). */ return ix86_expand_ternlog_binop (AND, mode, op1, op2, target); break; @@ -26054,6 +26091,9 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, } tmp0 = register_operand (op0, mode) ? op0 : force_reg (mode, op0); + /* Do you observe there're cases of op0 not register_operand?. + if it's from <avx512>_vternlog<mode>_mask, it must be register_operand. + if it's from ix86_ternlog_idx, it must REG_P. */ if (GET_MODE (tmp0) != mode) tmp0 = gen_lowpart (mode, tmp0); @@ -26061,6 +26101,7 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, tmp1 = copy_rtx (tmp0); else if (!register_operand (op1, mode)) tmp1 = force_reg (mode, op1); + /* Ditto. */ else tmp1 = op1; if (GET_MODE (tmp1) != mode) -- BR, Hongtao