On Tue, Aug 3, 2021 at 6:56 AM H.J. Lu <hjl.to...@gmail.com> wrote:
>
> 1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit
> move is enabled since x86 uses vec_duplicate, which is enabled only when
> inter-unit move is enabled, to implement store_by_pieces.
> 2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to
> STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for
> compare_by_pieces.
>
> gcc/
>
>         PR target/101742
>         * expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to
>         STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES
>         for compare_by_pieces.
>         * config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode
>         only if TARGET_INTER_UNIT_MOVES_TO_VEC is true.
>
> gcc/testsuite/
>
>         PR target/101742
>         * gcc.target/i386/pr101742a.c: New test.
>         * gcc.target/i386/pr101742b.c: Likewise.
> ---
>  gcc/config/i386/i386.h                    | 20 +++++++++++---------
>  gcc/expr.c                                |  6 +++++-
>  gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr101742b.c |  4 ++++
>  4 files changed, 36 insertions(+), 10 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c
>
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index bed9cd9da18..9b416abd5f4 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -1783,15 +1783,17 @@ typedef struct ix86_args {
>  /* STORE_MAX_PIECES is the number of bytes at a time that we can
>     store efficiently.  */
>  #define STORE_MAX_PIECES \
> -  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> -   ? 64 \
> -   : ((TARGET_AVX \
> -       && !TARGET_PREFER_AVX128 \
> -       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> -      ? 32 \
> -      : ((TARGET_SSE2 \
> -         && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> -        ? 16 : UNITS_PER_WORD)))
> +  (TARGET_INTER_UNIT_MOVES_TO_VEC \
> +   ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> +      ? 64 \
> +      : ((TARGET_AVX \
> +         && !TARGET_PREFER_AVX128 \
> +         && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> +         ? 32 \
> +         : ((TARGET_SSE2 \
> +             && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> +             ? 16 : UNITS_PER_WORD))) \
> +   : UNITS_PER_WORD)
>
>  /* If a memory-to-memory move would take MOVE_RATIO or more simple
>     move-instruction pairs, we will do a cpymem or libcall instead.
> diff --git a/gcc/expr.c b/gcc/expr.c
> index b65cfcfdcd1..2964b38b9a5 100644
> --- a/gcc/expr.c
> +++ b/gcc/expr.c
> @@ -1131,7 +1131,11 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
>                                 bool qi_vector_mode)
>    : m_to (to, to_load, NULL, NULL),
>      m_from (from, from_load, from_cfn, from_cfn_data),
> -    m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
> +    m_len (len),
> +    m_max_size (((!to_load && from == nullptr)
> +                ? STORE_MAX_PIECES
> +                : (from_cfn != nullptr
> +                   ? COMPARE_MAX_PIECES : MOVE_MAX_PIECES)) + 1),
>      m_push (push), m_qi_vector_mode (qi_vector_mode)
>  {
>    int toi = m_to.get_addr_inc ();

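The larger expr.c patch attached below instead has each by-pieces class pass the proper MAX_PIECES value (MOVE_MAX_PIECES, STORE_MAX_PIECES or COMPARE_MAX_PIECES) directly to the op_by_pieces_d constructor.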

> diff --git a/gcc/testsuite/gcc.target/i386/pr101742a.c b/gcc/testsuite/gcc.target/i386/pr101742a.c
> new file mode 100644
> index 00000000000..67ea40587dd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101742a.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -mtune=nano-x2" } */
> +
> +int n2;
> +
> +__attribute__ ((simd)) char
> +w7 (void)
> +{
> +  short int xb = n2;
> +  int qp;
> +
> +  for (qp = 0; qp < 2; ++qp)
> +    xb = xb < 1;
> +
> +  return xb;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101742b.c b/gcc/testsuite/gcc.target/i386/pr101742b.c
> new file mode 100644
> index 00000000000..ba19064077b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101742b.c
> @@ -0,0 +1,4 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -mtune=nano-x2 -mtune-ctrl=sse_unaligned_store_optimal" } */
> +
> +#include "pr101742a.c"
> --
> 2.31.1
>


-- 
H.J.
diff --git a/gcc/expr.c b/gcc/expr.c
index b65cfcfdcd1..66ac1986f02 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -1110,8 +1110,8 @@ class op_by_pieces_d
   }
 
  public:
-  op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
-		  unsigned HOST_WIDE_INT, unsigned int, bool,
+  op_by_pieces_d (unsigned int, rtx, bool, rtx, bool, by_pieces_constfn,
+		  void *, unsigned HOST_WIDE_INT, unsigned int, bool,
 		  bool = false);
   void run ();
 };
@@ -1122,8 +1122,8 @@ class op_by_pieces_d
    and its associated FROM_CFN_DATA can be used to replace loads with
    constant values.  LEN describes the length of the operation.  */
 
-op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
-				rtx from, bool from_load,
+op_by_pieces_d::op_by_pieces_d (unsigned int max_pieces, rtx to,
+				bool to_load, rtx from, bool from_load,
 				by_pieces_constfn from_cfn,
 				void *from_cfn_data,
 				unsigned HOST_WIDE_INT len,
@@ -1131,7 +1131,7 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
 				bool qi_vector_mode)
   : m_to (to, to_load, NULL, NULL),
     m_from (from, from_load, from_cfn, from_cfn_data),
-    m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
+    m_len (len), m_max_size (max_pieces + 1),
     m_push (push), m_qi_vector_mode (qi_vector_mode)
 {
   int toi = m_to.get_addr_inc ();
@@ -1324,8 +1324,8 @@ class move_by_pieces_d : public op_by_pieces_d
  public:
   move_by_pieces_d (rtx to, rtx from, unsigned HOST_WIDE_INT len,
 		    unsigned int align)
-    : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align,
-		      PUSHG_P (to))
+    : op_by_pieces_d (MOVE_MAX_PIECES, to, false, from, true, NULL,
+		      NULL, len, align, PUSHG_P (to))
   {
   }
   rtx finish_retmode (memop_ret);
@@ -1421,8 +1421,8 @@ class store_by_pieces_d : public op_by_pieces_d
   store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
 		     unsigned HOST_WIDE_INT len, unsigned int align,
 		     bool qi_vector_mode)
-    : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,
-		      align, false, qi_vector_mode)
+    : op_by_pieces_d (STORE_MAX_PIECES, to, false, NULL_RTX, true, cfn,
+		      cfn_data, len, align, false, qi_vector_mode)
   {
   }
   rtx finish_retmode (memop_ret);
@@ -1618,8 +1618,8 @@ class compare_by_pieces_d : public op_by_pieces_d
   compare_by_pieces_d (rtx op0, rtx op1, by_pieces_constfn op1_cfn,
 		       void *op1_cfn_data, HOST_WIDE_INT len, int align,
 		       rtx_code_label *fail_label)
-    : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len,
-		      align, false)
+    : op_by_pieces_d (COMPARE_MAX_PIECES, op0, true, op1, true, op1_cfn,
+		      op1_cfn_data, len, align, false)
   {
     m_fail_label = fail_label;
   }

Reply via email to