On Sat, May 18, 2024 at 4:10 AM Roger Sayle <ro...@nextmovesoftware.com> wrote:
>
>
> Hi Hongtao,
> Many thanks for the review, bug fixes and suggestions for improvements.
> This revised version of the patch, implements all of your corrections.  In 
> theory
> the "ternlog idx" should guarantee that some operands are non-null, but I 
> agree
> that it's better defensive programming to check invariants not easily proved.
> Instead of calling ix86_expand_vector_move, I use ix86_broadcast_from_constant
> to achieve the same effect of using a broadcast when possible, but has the 
> benefit
> of still using a memory operand (instead of a vector load) when broadcasting 
> isn't
> possible.  There are other places that could benefit from the same trick, but 
> I can
> address these in a follow-up patch (it may even be preferrable to keep these 
> as
> CONST_VECTOR during early RTL passes and lower to broadcast or constant pool
> using splitters).
>
> This revised patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures.  Ok for mainline?
1 file changed, 41 insertions(+)
gcc/config/i386/i386-expand.cc | 41 +++++++++++++++++++++++++++++++++++++++++

modified   gcc/config/i386/i386-expand.cc
@@ -25579,14 +25579,22 @@ ix86_gen_bcst_mem (machine_mode mode, rtx x)
       && !CONST_DOUBLE_P (cst)
       && !CONST_FIXED_P (cst))
     return NULL_RTX;
+  /* I think VALID_BCST_MODE_P should be sufficient to
+     make sure cst is CONST_INT or CONST_DOUBLE.  */

   int n_elts = GET_MODE_NUNITS (mode);
   if (CONST_VECTOR_NUNITS (x) != n_elts)
     return NULL_RTX;
+  /* Do we need this? I saw from caller side there's already
+       if (GET_MODE (op2) != mode)
+ op2 = gen_lowpart (mode, op2);
+ tmp2 = ix86_gen_bcst_mem (mode, op2);  */
+

   for (int i = 1; i < n_elts; i++)
     if (!rtx_equal_p (cst, CONST_VECTOR_ELT (x, i)))
       return NULL_RTX;
+  /* CONST_VECTOR_DUPLICATE_P (op)? */

   rtx mem = force_const_mem (GET_MODE_INNER (mode), cst);
   return gen_rtx_VEC_DUPLICATE (mode, validize_mem (mem));
@@ -25709,6 +25717,21 @@ ix86_ternlog_idx (rtx op, rtx *args)
    || ix86_ternlog_idx (XVECEXP (op, 0, 2), args) != 0xaa)
  return -1;
       return INTVAL (XVECEXP (op, 0, 3));
+      /* I think we can add some testcase for this.
+ .i.e
+ #include <immintrin.h>
+
+ __m256i
+ foo (__m256i a, __m256i b, __m256i c)
+ {
+ return (a & _mm256_ternarylogic_epi64 (a, b, c, 0xe4));
+ }
+
+ __m256i
+ foo1 (__m256i a, __m256i b, __m256i c)
+ {
+ return (b & _mm256_ternarylogic_epi64 (a, b, c, 0xe4));
+ }  */

     default:
       return -1;
@@ -25778,6 +25801,8 @@ ix86_ternlog_operand_p (rtx op)
       if (ix86_ternlog_leaf_p (XEXP (op, 0), mode)
    && (ix86_ternlog_leaf_p (op1, mode)
        || vector_all_ones_operand (op1, mode)))
+ /* There's CONST_VECTOR check in x86_ternlog_leaf_p,
+    so vector_all_ones_operand is not needed.  */
  return false;
       break;

@@ -25862,6 +25887,10 @@ ix86_expand_ternlog (machine_mode mode, rtx
op0, rtx op1, rtx op2, int idx,
       if ((!op0 || !side_effects_p (op0))
           && (!op1 || !side_effects_p (op1))
           && (!op2 || !side_effects_p (op2)))
+ /* I think only op2 needs to check side_effects_p, op0
+    and op1 must be register operand when it exists, no need for
side_effects_p?
+    Similar for all below side_effects_p (op0/op1)
+    the check is redundant.  */
         {
    emit_move_insn (target, CONST0_RTX (mode));
    return target;
@@ -25872,6 +25901,9 @@ ix86_expand_ternlog (machine_mode mode, rtx
op0, rtx op1, rtx op2, int idx,
       if ((!op1 || !side_effects_p (op1))
    && op0 && register_operand (op0, mode)
    && op2 && register_operand (op2, mode))
+ /* op0/op1 must be register_operand when it exists,
+    so register_operand (op0/op1, mode) is not needed.
+    similar for all below register_operand (op0/op1, mode).  */
  return ix86_expand_ternlog_andnot (mode, op0, op2, target);
       break;

@@ -25879,6 +25911,7 @@ ix86_expand_ternlog (machine_mode mode, rtx
op0, rtx op1, rtx op2, int idx,
       if ((!op2 || !side_effects_p (op2))
    && op0 && register_operand (op0, mode)
    && op1 && register_operand (op1, mode))
+ /* op0 && op1? */
  return ix86_expand_ternlog_andnot (mode, op0, op1, target);
       break;

@@ -25948,6 +25981,7 @@ ix86_expand_ternlog (machine_mode mode, rtx
op0, rtx op1, rtx op2, int idx,
       if ((!op0 || !side_effects_p (op0))
    && (!op1 || !side_effects_p (op1))
           && op2)
+ /* if (op2).  */
  {
    if (GET_MODE (op2) != mode)
      op2 = gen_lowpart (mode, op2);
@@ -25961,18 +25995,21 @@ ix86_expand_ternlog (machine_mode mode, rtx
op0, rtx op1, rtx op2, int idx,
     case 0x5a:  /* a^c */
       if (op0 && op2
           && (!op1 || !side_effects_p (op1)))
+ /* if (op0 && op2).  */
  return ix86_expand_ternlog_binop (XOR, mode, op0, op2, target);
       break;

     case 0x66:  /* b^c */
       if ((!op0 || !side_effects_p (op0))
           && op1 && op2)
+ /* if (op1 && op2).  */
  return ix86_expand_ternlog_binop (XOR, mode, op1, op2, target);
       break;

     case 0x88:  /* b&c */
       if ((!op0 || !side_effects_p (op0))
           && op1 && op2)
+ /* if (op1 && op2).  */
  return ix86_expand_ternlog_binop (AND, mode, op1, op2, target);
       break;

@@ -26054,6 +26091,9 @@ ix86_expand_ternlog (machine_mode mode, rtx
op0, rtx op1, rtx op2, int idx,
     }

   tmp0 = register_operand (op0, mode) ? op0 : force_reg (mode, op0);
+  /* Do you observe there're cases of op0 not register_operand?.
+     if it's from <avx512>_vternlog<mode>_mask, it must be register_operand.
+     if it's from ix86_ternlog_idx, it must REG_P.  */
   if (GET_MODE (tmp0) != mode)
     tmp0 = gen_lowpart (mode, tmp0);

@@ -26061,6 +26101,7 @@ ix86_expand_ternlog (machine_mode mode, rtx
op0, rtx op1, rtx op2, int idx,
     tmp1 = copy_rtx (tmp0);
   else if (!register_operand (op1, mode))
     tmp1 = force_reg (mode, op1);
+  /* Ditto.  */
   else
     tmp1 = op1;
   if (GET_MODE (tmp1) != mode)




-- 
BR,
Hongtao

Reply via email to