Re: [PATCH] [i386] Add a separate function to calculate cost for WIDEN_MULT_EXPR.

2021-07-28 Thread Hongtao Liu via Gcc-patches
ump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 5 
> > "vect" } } */
> > +#include <stdint.h>
> > +void
> > +vec_widen_smul8 (int16_t* __restrict v3, int8_t *v1, int8_t *v2, int order)
> > +{
> > +  while (order--)
> > +*v3++ = (int16_t) *v1++ * *v2++;
> > +}
> > +
> > +void
> > +vec_widen_umul8(uint16_t* __restrict v3, uint8_t *v1, uint8_t *v2, int 
> > order)
> > +{
> > +  while (order--)
> > +*v3++ = (uint16_t) *v1++ * *v2++;
> > +}
> > +
> > +void
> > +vec_widen_smul16(int32_t* __restrict v3, int16_t *v1, int16_t *v2, int 
> > order)
> > +{
> > +  while (order--)
> > +*v3++ = (int32_t) *v1++ * *v2++;
> > +}
> > +
> > +void
> > +vec_widen_umul16(uint32_t* __restrict v3, uint16_t *v1, uint16_t *v2, int 
> > order)
> > +{
> > +  while (order--)
> > +*v3++ = (uint32_t) *v1++ * *v2++;
> > +}
> > +
> > +void
> > +vec_widen_smul32(int64_t* __restrict v3, int32_t *v1, int32_t *v2, int 
> > order)
> > +{
> > +  while (order--)
> > +*v3++ = (int64_t) *v1++ * *v2++;
> > +}
> > +
> > +void
> > +vec_widen_umul32(uint64_t* __restrict v3, uint32_t *v1, uint32_t *v2, int 
> > order)
> > +{
> > +  while (order--)
> > +*v3++ = (uint64_t) *v1++ * *v2++;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/sse4-pr39821.c 
> > b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c
> > new file mode 100644
> > index 00000000000..4456c31e43e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c
> > @@ -0,0 +1,4 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-msse4.1 -O3 -fdump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 6 
> > "vect"} } */
> > +#include "sse2-pr39821.c"
> > --
> > 2.27.0
> >



-- 
BR,
Hongtao
From 0bae4cdd18a1645b4f70248e8986f2c83a814e8f Mon Sep 17 00:00:00 2001
From: liuhongt 
Date: Wed, 28 Jul 2021 16:24:52 +0800
Subject: [PATCH] [i386] Add a separate function to calculate cost for
 WIDEN_MULT_EXPR.

gcc/ChangeLog:

	PR target/39821
	* config/i386/i386.c (ix86_widen_mult_cost): New function.
	(ix86_add_stmt_cost): Use ix86_widen_mult_cost for
	WIDEN_MULT_EXPR.

gcc/testsuite/ChangeLog:

	PR target/39821
	* gcc.target/i386/sse2-pr39821.c: New test.
	* gcc.target/i386/sse4-pr39821.c: New test.
---
 gcc/config/i386/i386.c   | 48 +++-
 gcc/testsuite/gcc.target/i386/sse2-pr39821.c | 45 ++
 gcc/testsuite/gcc.target/i386/sse4-pr39821.c |  4 ++
 3 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr39821.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse4-pr39821.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 876a19f4c1f..5d49e0b45db 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19757,6 +19757,44 @@ ix86_vec_cost (machine_mode mode, int cost)
   return cost;
 }
 
+/* Return cost of vec_widen_<s>mult_hi/lo_<mode>,
+   vec_widen_<s>mul_hi/lo_<mode> is only available for VI124_AVX2.  */
+static int
+ix86_widen_mult_cost (const struct processor_costs *cost,
+		  enum machine_mode mode, bool uns_p)
+{
+  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
+  int extra_cost = 0;
+  int basic_cost = 0;
+  switch (mode)
+{
+case V8HImode:
+case V16HImode:
+  if (!uns_p || mode == V16HImode)
+	extra_cost = cost->sse_op * 2;
+  basic_cost = cost->mulss * 2 + cost->sse_op * 4;
+  break;
+case V4SImode:
+case V8SImode:
+  /* pmulhw/pmullw can be used.  */
+  basic_cost = cost->mulss * 2 + cost->sse_op * 2;
+  break;
+case V2DImode:
+  /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend,
+	 require extra 4 mul, 4 add, 4 cmp and 2 shift.  */
+  if (!TARGET_SSE4_1 && !uns_p)
+	extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4
+		  + cost->sse_op * 2;
+  /* Fallthru.  */
+case V4DImode:
+  basic_cost = cost->mulss * 2 + cost->sse_op * 4;
+  break;
+default:
+  gcc_unreachable();
+}
+  return ix86_vec_cost (mode, basic_cost + extra_cost);
+}
+
 /* Return cost of multiplication in MODE.  */
 
 static int
@@ -22483,10 +22521,18 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count,
 	  break;
 
 	case MULT_EXPR:
-	case WIDEN_MULT_EXPR:
+	  /* For MULT_HIGHPART_EXPR, x86 only supports pmulhw,
+	 take it as MULT_EXPR.  */
 	case MULT_HIGHPART_EXPR:
 	  stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
 	  break;
+	  /* There's no direct

Re: [PATCH] [i386] Add a separate function to calculate cost for WIDEN_MULT_EXPR.

2021-07-28 Thread Richard Biener via Gcc-patches
On Wed, Jul 28, 2021 at 10:35 AM liuhongt  wrote:
>
> Hi:
>   As described in PR 39821, WIDEN_MULT_EXPR should use a different cost
> model from MULT_EXPR, this patch add ix86_widen_mult_cost for that.
> Reference basis for the cost model is https://godbolt.org/z/EMjaz4Knn.
>
>   Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
>
> gcc/ChangeLog:

can you reference PR target/39821 please?

> * config/i386/i386.c (ix86_widen_mult_cost): New function.
> (ix86_add_stmt_cost): Use ix86_widen_mult_cost for
> WIDEN_MULT_EXPR.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/sse2-pr39821.c: New test.
> * gcc.target/i386/sse4-pr39821.c: New test.
> ---
>  gcc/config/i386/i386.c   | 48 +++-
>  gcc/testsuite/gcc.target/i386/sse2-pr39821.c | 45 ++
>  gcc/testsuite/gcc.target/i386/sse4-pr39821.c |  4 ++
>  3 files changed, 96 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr39821.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/sse4-pr39821.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 876a19f4c1f..281b5fe2706 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -19757,6 +19757,44 @@ ix86_vec_cost (machine_mode mode, int cost)
>return cost;
>  }
>
> +/* Return cost of vec_widen_<s>mult_hi/lo_<mode>,
> +   vec_widen_<s>mul_hi/lo_<mode> is only available for VI124_AVX2.  */
> +static int
> +ix86_widen_mult_cost (const struct processor_costs *cost,
> + enum machine_mode mode, bool uns_p)
> +{
> +  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
> +  int extra_cost = 0;
> +  int basic_cost = 0;
> +  switch (mode)
> +{
> +case V8HImode:
> +case V16HImode:
> +  if (!uns_p || mode == V16HImode)
> +   extra_cost = cost->sse_op * 2;
> +  basic_cost = cost->mulss * 2 + cost->sse_op * 4;
> +  break;
> +case V4SImode:
> +case V8SImode:
> +  /* pmulhw/pmullw can be used.  */
> +  basic_cost = cost->mulss * 2 + cost->sse_op * 2;
> +  break;
> +case V2DImode:
> +  /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend,
> +require extra 4 mul, 4 add, 4 cmp and 2 shift.  */
> +  if (!TARGET_SSE4_1 && !uns_p)
> +   extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4
> + + cost->sse_op * 2;
> +  /* Fallthru.  */
> +case V4DImode:
> +  basic_cost = cost->mulss * 2 + cost->sse_op * 4;
> +  break;
> +default:
> +  gcc_unreachable();
> +}
> +  return ix86_vec_cost (mode, basic_cost + extra_cost);
> +}
> +
>  /* Return cost of multiplication in MODE.  */
>
>  static int
> @@ -22483,10 +22521,18 @@ ix86_add_stmt_cost (class vec_info *vinfo, void 
> *data, int count,
>   break;
>
> case MULT_EXPR:
> -   case WIDEN_MULT_EXPR:
> + /*For MULT_HIGHPART_EXPR, x86 only supports pmulhw,

Space after /*

otherwise OK.

> +   take it as MULT_EXPR.  */
> case MULT_HIGHPART_EXPR:
>   stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
>   break;
> + /* There's no direct instruction for WIDEN_MULT_EXPR,
> +take emulation into account.  */
> +   case WIDEN_MULT_EXPR:
> + stmt_cost = ix86_widen_mult_cost (ix86_cost, mode,
> +   TYPE_UNSIGNED (vectype));
> + break;
> +
> case NEGATE_EXPR:
>   if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
> stmt_cost = ix86_cost->sse_op;
> diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr39821.c 
> b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c
> new file mode 100644
> index 00000000000..bcd4b772c98
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c
> @@ -0,0 +1,45 @@
> +/* { dg-do compile } */
> +/* { dg-options "-msse2 -mno-sse4.1 -O3 -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 5 
> "vect" } } */
> +#include <stdint.h>
> +void
> +vec_widen_smul8 (int16_t* __restrict v3, int8_t *v1, int8_t *v2, int order)
> +{
> +  while (order--)
> +*v3++ = (int16_t) *v1++ * *v2++;
> +}
> +
> +void
> +vec_widen_umul8(uint16_t* __restrict v3, uint8_t *v1, uint8_t *v2, int order)
> +{
> +  while (order--)
> +*v3++ = (uint16_t) *v1++ * *v2++;
> +}
> +
> +void
> +vec_widen_smul16(int32_t* __restrict v3, int16_t *v1, int16_t *v2, int order)
> +{
> +  while (order--)
> +*v3++ = (int32_t) *v1++ * *v2++;
> +}
> +
> +void
> +vec_widen_umul16(uint32_t* __restrict v3, uint16_t *v1, uint16_t *v2, int 
> order)
> +{
> +  while (order--)
> +*v3++ = (uint32_t) *v1++ * *v2++;
> +}
> +
> +void
> +vec_widen_smul32(int64_t* __restrict v3, int32_t *v1, int32_t *v2, int order)
> +{
> +  while (order--)
> +*v3++ = (int64_t) *v1++ * *v2++;
> +}
> +
> +void
> +vec_widen_umul32(uint64_t* __restrict v3, uint32_t *v1, uint32_t *v2, int 
> order)
> +{
> +  while (order--)
> +  

[PATCH] [i386] Add a separate function to calculate cost for WIDEN_MULT_EXPR.

2021-07-28 Thread liuhongt via Gcc-patches
Hi:
  As described in PR 39821, WIDEN_MULT_EXPR should use a different cost
model from MULT_EXPR; this patch adds ix86_widen_mult_cost for that.
The reference basis for the cost model is https://godbolt.org/z/EMjaz4Knn.

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.

gcc/ChangeLog:

* config/i386/i386.c (ix86_widen_mult_cost): New function.
(ix86_add_stmt_cost): Use ix86_widen_mult_cost for
WIDEN_MULT_EXPR.

gcc/testsuite/ChangeLog:

* gcc.target/i386/sse2-pr39821.c: New test.
* gcc.target/i386/sse4-pr39821.c: New test.
---
 gcc/config/i386/i386.c   | 48 +++-
 gcc/testsuite/gcc.target/i386/sse2-pr39821.c | 45 ++
 gcc/testsuite/gcc.target/i386/sse4-pr39821.c |  4 ++
 3 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-pr39821.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse4-pr39821.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 876a19f4c1f..281b5fe2706 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19757,6 +19757,44 @@ ix86_vec_cost (machine_mode mode, int cost)
   return cost;
 }
 
+/* Return cost of vec_widen_<s>mult_hi/lo_<mode>,
+   vec_widen_<s>mul_hi/lo_<mode> is only available for VI124_AVX2.  */
+static int
+ix86_widen_mult_cost (const struct processor_costs *cost,
+ enum machine_mode mode, bool uns_p)
+{
+  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
+  int extra_cost = 0;
+  int basic_cost = 0;
+  switch (mode)
+{
+case V8HImode:
+case V16HImode:
+  if (!uns_p || mode == V16HImode)
+   extra_cost = cost->sse_op * 2;
+  basic_cost = cost->mulss * 2 + cost->sse_op * 4;
+  break;
+case V4SImode:
+case V8SImode:
+  /* pmulhw/pmullw can be used.  */
+  basic_cost = cost->mulss * 2 + cost->sse_op * 2;
+  break;
+case V2DImode:
+  /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend,
+require extra 4 mul, 4 add, 4 cmp and 2 shift.  */
+  if (!TARGET_SSE4_1 && !uns_p)
+   extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4
+ + cost->sse_op * 2;
+  /* Fallthru.  */
+case V4DImode:
+  basic_cost = cost->mulss * 2 + cost->sse_op * 4;
+  break;
+default:
+  gcc_unreachable();
+}
+  return ix86_vec_cost (mode, basic_cost + extra_cost);
+}
+
 /* Return cost of multiplication in MODE.  */
 
 static int
@@ -22483,10 +22521,18 @@ ix86_add_stmt_cost (class vec_info *vinfo, void 
*data, int count,
  break;
 
case MULT_EXPR:
-   case WIDEN_MULT_EXPR:
+ /*For MULT_HIGHPART_EXPR, x86 only supports pmulhw,
+   take it as MULT_EXPR.  */
case MULT_HIGHPART_EXPR:
  stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
  break;
+ /* There's no direct instruction for WIDEN_MULT_EXPR,
+take emulation into account.  */
+   case WIDEN_MULT_EXPR:
+ stmt_cost = ix86_widen_mult_cost (ix86_cost, mode,
+   TYPE_UNSIGNED (vectype));
+ break;
+
case NEGATE_EXPR:
  if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
stmt_cost = ix86_cost->sse_op;
diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr39821.c 
b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c
new file mode 100644
index 00000000000..bcd4b772c98
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -mno-sse4.1 -O3 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 5 "vect" 
} } */
+#include <stdint.h>
+void
+vec_widen_smul8 (int16_t* __restrict v3, int8_t *v1, int8_t *v2, int order)
+{
+  while (order--)
+*v3++ = (int16_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_umul8(uint16_t* __restrict v3, uint8_t *v1, uint8_t *v2, int order)
+{
+  while (order--)
+*v3++ = (uint16_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_smul16(int32_t* __restrict v3, int16_t *v1, int16_t *v2, int order)
+{
+  while (order--)
+*v3++ = (int32_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_umul16(uint32_t* __restrict v3, uint16_t *v1, uint16_t *v2, int 
order)
+{
+  while (order--)
+*v3++ = (uint32_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_smul32(int64_t* __restrict v3, int32_t *v1, int32_t *v2, int order)
+{
+  while (order--)
+*v3++ = (int64_t) *v1++ * *v2++;
+}
+
+void
+vec_widen_umul32(uint64_t* __restrict v3, uint32_t *v1, uint32_t *v2, int 
order)
+{
+  while (order--)
+*v3++ = (uint64_t) *v1++ * *v2++;
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse4-pr39821.c 
b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c
new file mode 100644
index 00000000000..4456c31e43e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c
@@ -0,0 +1,4 @@
+/* { dg-do compile } */
+/* { dg-options "-msse4.1 -O3 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectoriz