Hi Richard,

I've revised the patch, sorry about sloppy formatting in the previous one.

Full bootstrap/regression tests are still running, but the changes are pretty 
trivial.

Ok for trunk assuming tests finish clean?

>Joel Hutton <joel.hut...@arm.com> writes:
>> @@ -277,6 +277,81 @@ optab_for_tree_code (enum tree_code code, const_tree 
>> type,
>>      }
>>  }
>>
>> +/* Function supportable_half_widening_operation
>> +
>
>I realise existing (related) functions do have a “Function foo” line,
>but it's not generally the GCC style, so I think it would be better
>to leave it out.
Done.

>> +   Check whether an operation represented by code is a 'half' widening 
>> operation
>
>Nit: CODE should be in caps since it's a parameter name.
Done.

>> +   using only half of the full vector width for the inputs. This allows the
>
>I think this could be misread as saying that we only look at half the
>input vector.  Maybe it would be clearer to say something like “in which
>the input vectors contain half as many bits as the output vectors”
>instead of “using…”.
Done.

>> +   output to be stored in a single vector, as opposed to using the full 
>> width
>> +   for input, where half the elements must be processed at a time into
>> +   respective full-width vector outputs i.e. a 'hi'/'lo' pair using codes 
>> such
>> +   as VEC_WIDEN_MINUS_HI/LO. This is handled by widening the inputs and 
>> using
>> +   non-widening instructions. RTL fusing converts these to the widening 
>> hardware
>> +   instructions if supported.
>
>And here the contrast is with cases in which the input vectors have
>the same number of bits as the output vectors, meaning that we need
>a lo/hi pair to generate twice the amount of output data.
Done.

> Might be clearer to make “This is handled…” a new paragraph.
Done.

>> +
>> +   Supported widening operations:
>> +    WIDEN_MINUS_EXPR
>> +    WIDEN_PLUS_EXPR
>> +    WIDEN_MULT_EXPR
>> +    WIDEN_LSHIFT_EXPR
>> +
>> +   Output:
>> +   - CODE1 - The non-widened code, which will be used after the inputs are
>> +     converted to the wide type.
>> +   */
>
>Formatting nit, */ should be on the previous line, i.e. “.” + two spaces
>+ “*/”.  (Sorry for all the formatting comments.)
Done (sorry for the formatting errors)

>> +bool
>> +supportable_half_widening_operation (enum tree_code code,
>> +                            tree vectype_out, tree vectype_in,
>> +                            enum tree_code *code1)
>> +{
>> +  machine_mode m1,m2;
>> +  bool truncp;
>> +  enum tree_code dummy_code;
>> +  optab op;
>> +
>> +  gcc_assert (VECTOR_TYPE_P (vectype_out) && VECTOR_TYPE_P (vectype_in));
>> +
>> +  m1 = TYPE_MODE (vectype_out);
>> +  m2 = TYPE_MODE (vectype_in);
>> +
>> +  if (!VECTOR_MODE_P (m1) || !VECTOR_MODE_P (m2))
>> +    return false;
>> +
>> +  if(!known_eq (TYPE_VECTOR_SUBPARTS (vectype_in),
>> +               TYPE_VECTOR_SUBPARTS (vectype_out)))
>> +      return false;
>
>Formatting nit: should be a space between “if” and “(”.  IMO
>maybe_ne is more readable than !known_eq.  The return statement
>should only be indented by 4 columns.
Done.

>> +  if(!(code == WIDEN_MINUS_EXPR
>> +       || code == WIDEN_PLUS_EXPR
>> +       || code == WIDEN_MULT_EXPR
>> +       || code == WIDEN_LSHIFT_EXPR))
>> +    return false;
>> +
>> +  switch (code)
>> +  {
>> +    case WIDEN_LSHIFT_EXPR:
>> +      *code1 = LSHIFT_EXPR;
>> +      break;
>> +    case WIDEN_MINUS_EXPR:
>> +      *code1 = MINUS_EXPR;
>> +      break;
>> +    case WIDEN_PLUS_EXPR:
>> +      *code1 = PLUS_EXPR;
>> +      break;
>> +    case WIDEN_MULT_EXPR:
>> +      *code1 = MULT_EXPR;
>> +      break;
>> +    default:
>> +      gcc_unreachable ();
>
>I think it would be better to return false in the default case and remove
>the “if” above, so that we only have one copy of the list of codes.
Done.

>> +  }
>
>Formatting nit: the { and } lines should be indented two spaces more.
>(The contents of the switch are OK as-is.)
Done.

>> +
>> +  if (!supportable_convert_operation(NOP_EXPR, vectype_out, vectype_in,
>> +                                  &dummy_code))
>
>Nit: should be a space between the function name and ”(”.
Done.

>> +    return false;
>> +
>> +  op = optab_for_tree_code (*code1, vectype_out, optab_vector);
>> +  return (optab_handler (op, TYPE_MODE (vectype_out)) != CODE_FOR_nothing);
>> +}
>> +
>>  /* Function supportable_convert_operation
>>
>>     Check whether an operation represented by the code CODE is a
>
>> […]
>> @@ -4697,7 +4755,13 @@ vectorizable_conversion (vec_info *vinfo,
>>    nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
>>    nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
>>    if (known_eq (nunits_out, nunits_in))
>> -    modifier = NONE;
>> +    if (code == WIDEN_MINUS_EXPR
>> +     || code == WIDEN_PLUS_EXPR
>> +     || code == WIDEN_LSHIFT_EXPR
>> +     || code == WIDEN_MULT_EXPR)
>> +     modifier = WIDEN;
>
>The “modifier = WIDEN;” line is indented by a tab (i.e. 8 columns),
>but it should only be indented by 6 spaces.
Done.

>> +    else
>> +      modifier = NONE;
>>    else if (multiple_p (nunits_out, nunits_in))
>>      modifier = NARROW;
>>    else
>> @@ -4743,9 +4807,16 @@ vectorizable_conversion (vec_info *vinfo,
>>        return false;
>>
>>      case WIDEN:
>> -      if (supportable_widening_operation (vinfo, code, stmt_info, 
>> vectype_out,
>> -                                       vectype_in, &code1, &code2,
>> -                                       &multi_step_cvt, &interm_types))
>> +      if (supportable_half_widening_operation (code, vectype_out, 
>> vectype_in,
>> +                                         &code1))
>> +     {
>> +       gcc_assert (!(multi_step_cvt && op_type == binary_op));
>> +       break;
>> +     }
>
>I think here we should keep the known_eq test from earlier and not
>go into the code below for that case.  I.e. something like:
>
>       if (known_eq (…))
>         {
>           if (!supportable_half_widening_operation (code, vectype_out,
>                                                     vectype_in, &code1))
>             goto unsupported;
>
>           gcc_assert (!(multi_step_cvt && op_type == binary_op));
>           break;
>         }
>
>This is because the fallback code later in the case statement doesn't
>really apply here.  It also makes it easier to correlate this test
>with the transform code below.
Done.

>> +      else if (supportable_widening_operation (vinfo, code, stmt_info,
>> +                                            vectype_out, vectype_in, &code1,
>> +                                            &code2, &multi_step_cvt,
>> +                                            &interm_types))
>
>After the change above, this would stay as a plain “if”.
Done.

>>        {
>>          /* Binary widening operation can only be supported directly by the
>>             architecture.  */
>> @@ -4981,10 +5052,16 @@ vectorizable_conversion (vec_info *vinfo,
>>              c1 = codecvt1;
>>              c2 = codecvt2;
>>            }
>> -       vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
>> -                                               &vec_oprnds1, stmt_info,
>> -                                               this_dest, gsi,
>> -                                               c1, c2, op_type);
>> +       if (known_eq(nunits_out, nunits_in))
>
>Should be a space between “known_eq” and “(”.
>
>> +         vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
>> +                                                 &vec_oprnds1, stmt_info,
>> +                                                 this_dest, gsi,
>> +                                                 c1, op_type);
>
>Maybe it would be clearer to call this vect_create_half_widening_stmts,
>to match the new query function.
Done.
________________________________
From: Richard Sandiford <richard.sandif...@arm.com>
Sent: 10 February 2021 19:46
To: Joel Hutton <joel.hut...@arm.com>
Cc: GCC Patches <gcc-patches@gcc.gnu.org>; Richard Biener <rguent...@suse.de>
Subject: Re: [aarch64][vect] Support V8QI->V8HI WIDEN_ patterns

Joel Hutton <joel.hut...@arm.com> writes:
> @@ -277,6 +277,81 @@ optab_for_tree_code (enum tree_code code, const_tree 
> type,
>      }
>  }
>
> +/* Function supportable_half_widening_operation
> +

I realise existing (related) functions do have a “Function foo” line,
but it's not generally the GCC style, so I think it would be better
to leave it out.

> +   Check whether an operation represented by code is a 'half' widening 
> operation

Nit: CODE should be in caps since it's a parameter name.

> +   using only half of the full vector width for the inputs. This allows the

I think this could be misread as saying that we only look at half the
input vector.  Maybe it would be clearer to say something like “in which
the input vectors contain half as many bits as the output vectors”
instead of “using…”.

> +   output to be stored in a single vector, as opposed to using the full width
> +   for input, where half the elements must be processed at a time into
> +   respective full-width vector outputs i.e. a 'hi'/'lo' pair using codes 
> such
> +   as VEC_WIDEN_MINUS_HI/LO. This is handled by widening the inputs and using
> +   non-widening instructions. RTL fusing converts these to the widening 
> hardware
> +   instructions if supported.

And here the contrast is with cases in which the input vectors have
the same number of bits as the output vectors, meaning that we need
a lo/hi pair to generate twice the amount of output data.

Might be clearer to make “This is handled…” a new paragraph.

> +
> +   Supported widening operations:
> +    WIDEN_MINUS_EXPR
> +    WIDEN_PLUS_EXPR
> +    WIDEN_MULT_EXPR
> +    WIDEN_LSHIFT_EXPR
> +
> +   Output:
> +   - CODE1 - The non-widened code, which will be used after the inputs are
> +     converted to the wide type.
> +   */

Formatting nit, */ should be on the previous line, i.e. “.” + two spaces
+ “*/”.  (Sorry for all the formatting comments.)

> +bool
> +supportable_half_widening_operation (enum tree_code code,
> +                            tree vectype_out, tree vectype_in,
> +                            enum tree_code *code1)
> +{
> +  machine_mode m1,m2;
> +  bool truncp;
> +  enum tree_code dummy_code;
> +  optab op;
> +
> +  gcc_assert (VECTOR_TYPE_P (vectype_out) && VECTOR_TYPE_P (vectype_in));
> +
> +  m1 = TYPE_MODE (vectype_out);
> +  m2 = TYPE_MODE (vectype_in);
> +
> +  if (!VECTOR_MODE_P (m1) || !VECTOR_MODE_P (m2))
> +    return false;
> +
> +  if(!known_eq (TYPE_VECTOR_SUBPARTS (vectype_in),
> +               TYPE_VECTOR_SUBPARTS (vectype_out)))
> +      return false;

Formatting nit: should be a space between “if” and “(”.  IMO
maybe_ne is more readable than !known_eq.  The return statement
should only be indented by 4 columns.

> +  if(!(code == WIDEN_MINUS_EXPR
> +       || code == WIDEN_PLUS_EXPR
> +       || code == WIDEN_MULT_EXPR
> +       || code == WIDEN_LSHIFT_EXPR))
> +    return false;
> +
> +  switch (code)
> +  {
> +    case WIDEN_LSHIFT_EXPR:
> +      *code1 = LSHIFT_EXPR;
> +      break;
> +    case WIDEN_MINUS_EXPR:
> +      *code1 = MINUS_EXPR;
> +      break;
> +    case WIDEN_PLUS_EXPR:
> +      *code1 = PLUS_EXPR;
> +      break;
> +    case WIDEN_MULT_EXPR:
> +      *code1 = MULT_EXPR;
> +      break;
> +    default:
> +      gcc_unreachable ();

I think it would be better to return false in the default case and remove
the “if” above, so that we only have one copy of the list of codes.

> +  }

Formatting nit: the { and } lines should be indented two spaces more.
(The contents of the switch are OK as-is.)

> +
> +  if (!supportable_convert_operation(NOP_EXPR, vectype_out, vectype_in,
> +                                  &dummy_code))

Nit: should be a space between the function name and ”(”.

> +    return false;
> +
> +  op = optab_for_tree_code (*code1, vectype_out, optab_vector);
> +  return (optab_handler (op, TYPE_MODE (vectype_out)) != CODE_FOR_nothing);
> +}
> +
>  /* Function supportable_convert_operation
>
>     Check whether an operation represented by the code CODE is a

> […]
> @@ -4697,7 +4755,13 @@ vectorizable_conversion (vec_info *vinfo,
>    nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
>    nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
>    if (known_eq (nunits_out, nunits_in))
> -    modifier = NONE;
> +    if (code == WIDEN_MINUS_EXPR
> +     || code == WIDEN_PLUS_EXPR
> +     || code == WIDEN_LSHIFT_EXPR
> +     || code == WIDEN_MULT_EXPR)
> +     modifier = WIDEN;

The “modifier = WIDEN;” line is indented by a tab (i.e. 8 columns),
but it should only be indented by 6 spaces.

> +    else
> +      modifier = NONE;
>    else if (multiple_p (nunits_out, nunits_in))
>      modifier = NARROW;
>    else
> @@ -4743,9 +4807,16 @@ vectorizable_conversion (vec_info *vinfo,
>        return false;
>
>      case WIDEN:
> -      if (supportable_widening_operation (vinfo, code, stmt_info, 
> vectype_out,
> -                                       vectype_in, &code1, &code2,
> -                                       &multi_step_cvt, &interm_types))
> +      if (supportable_half_widening_operation (code, vectype_out, vectype_in,
> +                                         &code1))
> +     {
> +       gcc_assert (!(multi_step_cvt && op_type == binary_op));
> +       break;
> +     }

I think here we should keep the known_eq test from earlier and not
go into the code below for that case.  I.e. something like:

       if (known_eq (…))
         {
           if (!supportable_half_widening_operation (code, vectype_out,
                                                     vectype_in, &code1))
             goto unsupported;

           gcc_assert (!(multi_step_cvt && op_type == binary_op));
           break;
         }

This is because the fallback code later in the case statement doesn't
really apply here.  It also makes it easier to correlate this test
with the transform code below.

> +      else if (supportable_widening_operation (vinfo, code, stmt_info,
> +                                            vectype_out, vectype_in, &code1,
> +                                            &code2, &multi_step_cvt,
> +                                            &interm_types))

After the change above, this would stay as a plain “if”.

>        {
>          /* Binary widening operation can only be supported directly by the
>             architecture.  */
> @@ -4981,10 +5052,16 @@ vectorizable_conversion (vec_info *vinfo,
>              c1 = codecvt1;
>              c2 = codecvt2;
>            }
> -       vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
> -                                               &vec_oprnds1, stmt_info,
> -                                               this_dest, gsi,
> -                                               c1, c2, op_type);
> +       if (known_eq(nunits_out, nunits_in))

Should be a space between “known_eq” and “(”.

> +         vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
> +                                                 &vec_oprnds1, stmt_info,
> +                                                 this_dest, gsi,
> +                                                 c1, op_type);

Maybe it would be clearer to call this vect_create_half_widening_stmts,
to match the new query function.

Thanks,
Richard

> +       else
> +         vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
> +                                                 &vec_oprnds1, stmt_info,
> +                                                 this_dest, gsi,
> +                                                 c1, c2, op_type);
>        }
>
>        FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
From b80f8164e58ce470754d063b560b79a10be8e724 Mon Sep 17 00:00:00 2001
From: Joel Hutton <joel.hut...@arm.com>
Date: Fri, 5 Feb 2021 12:52:12 +0000
Subject: [PATCH] [aarch64][vect] Support V8QI->V8HI WIDEN_ patterns

In the case where 8 out of every 16 elements are widened using a
widening pattern and the next 8 are skipped, the patterns are not
recognized. This is because they are normally used in a pair, such  as
VEC_WIDEN_MINUS_HI/LO, to achieve a v16qi->v16hi conversion for example.
This patch adds support for V8QI->V8HI patterns.

gcc/ChangeLog:

    PR tree-optimization/98772
    * optabs-tree.c (supportable_half_widening_operation): New function
    to check for supportable V8QI->V8HI widening patterns.
    * optabs-tree.h (supportable_half_widening_operation): New function.
    * tree-vect-stmts.c (vect_create_half_widening_stmts): New function
    to create promotion stmts for V8QI->V8HI widening patterns.
    (vectorizable_conversion): Add case for V8QI->V8HI.

gcc/testsuite/ChangeLog:

    PR tree-optimization/98772
    * gcc.target/aarch64/pr98772.c: New test.
---
 gcc/optabs-tree.c                          |  70 ++++++++++
 gcc/optabs-tree.h                          |   3 +
 gcc/testsuite/gcc.target/aarch64/pr98772.c | 155 +++++++++++++++++++++
 gcc/tree-vect-stmts.c                      |  95 +++++++++++--
 4 files changed, 315 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr98772.c

diff --git a/gcc/optabs-tree.c b/gcc/optabs-tree.c
index c94073e3ed9..02ed139c180 100644
--- a/gcc/optabs-tree.c
+++ b/gcc/optabs-tree.c
@@ -277,6 +277,76 @@ optab_for_tree_code (enum tree_code code, const_tree type,
     }
 }
 
+/* Check whether an operation represented by CODE is a 'half' widening operation
+   in which the input vector type has half the number of bits of the output
+   vector type e.g. V8QI->V8HI.
+
+   This is handled by widening the inputs using NOP_EXPRs then using a
+   non-widening stmt e.g. MINUS_EXPR.  RTL fusing converts these to the widening
+   hardware instructions if supported.
+
+   The more typical case (handled in supportable_widening_operation) is where
+   the input vector type has the same number of bits as the output vector type.
+   In this case half the elements of the input vectors must be processed at a
+   time into respective vector outputs with elements twice as wide i.e. a
+   'hi'/'lo' pair using codes such as VEC_WIDEN_MINUS_HI/LO.
+
+   Supported widening operations:
+    WIDEN_MINUS_EXPR
+    WIDEN_PLUS_EXPR
+    WIDEN_MULT_EXPR
+    WIDEN_LSHIFT_EXPR
+
+   Output:
+   - CODE1 - The non-widened code, which will be used after the inputs are
+     converted to the wide type.  */
+bool
+supportable_half_widening_operation (enum tree_code code,
+				     tree vectype_out, tree vectype_in,
+				     enum tree_code *code1)
+{
+  machine_mode m1, m2;
+  enum tree_code dummy_code;
+  optab op;
+
+  gcc_assert (VECTOR_TYPE_P (vectype_out) && VECTOR_TYPE_P (vectype_in));
+
+  m1 = TYPE_MODE (vectype_out);
+  m2 = TYPE_MODE (vectype_in);
+
+  if (!VECTOR_MODE_P (m1) || !VECTOR_MODE_P (m2))
+    return false;
+
+  if (maybe_ne (TYPE_VECTOR_SUBPARTS (vectype_in),
+		TYPE_VECTOR_SUBPARTS (vectype_out)))
+    return false;
+
+  switch (code)
+    {
+    case WIDEN_LSHIFT_EXPR:
+      *code1 = LSHIFT_EXPR;
+      break;
+    case WIDEN_MINUS_EXPR:
+      *code1 = MINUS_EXPR;
+      break;
+    case WIDEN_PLUS_EXPR:
+      *code1 = PLUS_EXPR;
+      break;
+    case WIDEN_MULT_EXPR:
+      *code1 = MULT_EXPR;
+      break;
+    default:
+      return false;
+    }
+
+  if (!supportable_convert_operation (NOP_EXPR, vectype_out, vectype_in,
+				     &dummy_code))
+    return false;
+
+  op = optab_for_tree_code (*code1, vectype_out, optab_vector);
+  return (optab_handler (op, TYPE_MODE (vectype_out)) != CODE_FOR_nothing);
+}
+
 /* Function supportable_convert_operation
 
    Check whether an operation represented by the code CODE is a
diff --git a/gcc/optabs-tree.h b/gcc/optabs-tree.h
index 173da0d3bd2..c3aaa1a4169 100644
--- a/gcc/optabs-tree.h
+++ b/gcc/optabs-tree.h
@@ -36,6 +36,9 @@ enum optab_subtype
    the second argument.  The third argument distinguishes between the types of
    vector shifts and rotates.  */
 optab optab_for_tree_code (enum tree_code, const_tree, enum optab_subtype);
+bool
+supportable_half_widening_operation (enum tree_code, tree, tree,
+				    enum tree_code *);
 bool supportable_convert_operation (enum tree_code, tree, tree,
 				    enum tree_code *);
 bool expand_vec_cmp_expr_p (tree, tree, enum tree_code);
diff --git a/gcc/testsuite/gcc.target/aarch64/pr98772.c b/gcc/testsuite/gcc.target/aarch64/pr98772.c
new file mode 100644
index 00000000000..663221514e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr98772.c
@@ -0,0 +1,155 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps" } */
+#include <stdint.h>
+#include <string.h>
+
+#define DSIZE 16
+#define PIXSIZE 64
+
+extern void
+wplus (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for (int y = 0; y < 4; y++ )
+    {
+	for (int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] + pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+extern void __attribute__((optimize (0)))
+wplus_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for (int y = 0; y < 4; y++ )
+    {
+	for (int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] + pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+
+extern void
+wminus (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for (int y = 0; y < 4; y++ )
+    {
+	for (int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] - pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+extern void __attribute__((optimize (0)))
+wminus_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for (int y = 0; y < 4; y++ )
+    {
+	for (int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] - pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+
+extern void
+wmult (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for (int y = 0; y < 4; y++ )
+    {
+	for (int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] * pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+extern void __attribute__((optimize (0)))
+wmult_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
+{
+    for (int y = 0; y < 4; y++ )
+    {
+	for (int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] * pix2[x];
+	pix1 += 16;
+	pix2 += 16;
+    }
+}
+
+extern void
+wlshift (uint16_t *d, uint8_t *restrict pix1)
+
+{
+    for (int y = 0; y < 4; y++ )
+    {
+	for (int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] << 8;
+	pix1 += 16;
+    }
+}
+extern void __attribute__((optimize (0)))
+wlshift_no_opt (uint16_t *d, uint8_t *restrict pix1)
+
+{
+    for (int y = 0; y < 4; y++ )
+    {
+	for (int x = 0; x < 4; x++ )
+	    d[x + y*4] = pix1[x] << 8;
+	pix1 += 16;
+    }
+}
+
+void __attribute__((optimize (0)))
+init_arrays (uint16_t *d_a, uint16_t *d_b, uint8_t *pix1, uint8_t *pix2)
+{
+  for (int i = 0; i < DSIZE; i++)
+  {
+    d_a[i] = (1074 * i)%17;
+    d_b[i] = (1074 * i)%17;
+  }
+  for (int i = 0; i < PIXSIZE; i++)
+  {
+    pix1[i] = (1024 * i)%17;
+    pix2[i] = (1024 * i)%17;
+  }
+}
+
+/* Don't optimize main so we don't get confused over where the vector
+   instructions are generated. */
+__attribute__((optimize (0)))
+int main ()
+{
+  uint16_t d_a[DSIZE];
+  uint16_t d_b[DSIZE];
+  uint8_t pix1[PIXSIZE];
+  uint8_t pix2[PIXSIZE];
+
+  init_arrays (d_a, d_b, pix1, pix2);
+  wplus (d_a, pix1, pix2);
+  wplus_no_opt (d_b, pix1, pix2);
+  if (memcmp (d_a, d_b, DSIZE * sizeof (uint16_t)) != 0)
+    return 1;
+
+  init_arrays (d_a, d_b, pix1, pix2);
+  wminus (d_a, pix1, pix2);
+  wminus_no_opt (d_b, pix1, pix2);
+  if (memcmp (d_a, d_b, DSIZE * sizeof (uint16_t)) != 0)
+    return 2;
+
+  init_arrays (d_a, d_b, pix1, pix2);
+  wmult (d_a, pix1, pix2);
+  wmult_no_opt (d_b, pix1, pix2);
+  if (memcmp (d_a, d_b, DSIZE * sizeof (uint16_t)) != 0)
+    return 3;
+
+  init_arrays (d_a, d_b, pix1, pix2);
+  wlshift (d_a, pix1);
+  wlshift_no_opt (d_b, pix1);
+  if (memcmp (d_a, d_b, DSIZE * sizeof (uint16_t)) != 0)
+    return 4;
+
+}
+
+/* { dg-final { scan-assembler-times "uaddl\\tv" 2 } } */
+/* { dg-final { scan-assembler-times "usubl\\tv" 2 } } */
+/* { dg-final { scan-assembler-times "umull\\tv" 2 } } */
+/* { dg-final { scan-assembler-times "shl\\tv" 2 } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index f180ced3124..bf23499f7e3 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -4545,6 +4545,64 @@ vect_create_vectorized_promotion_stmts (vec_info *vinfo,
   *vec_oprnds0 = vec_tmp;
 }
 
+/* Create vectorized promotion stmts for widening stmts using only half the
+   potential vector size for input.  */
+static void
+vect_create_half_widening_stmts (vec_info *vinfo,
+					vec<tree> *vec_oprnds0,
+					vec<tree> *vec_oprnds1,
+					stmt_vec_info stmt_info, tree vec_dest,
+					gimple_stmt_iterator *gsi,
+					enum tree_code code1,
+					int op_type)
+{
+  int i;
+  tree vop0, vop1;
+  gimple *new_stmt1;
+  gimple *new_stmt2;
+  gimple *new_stmt3;
+  vec<tree> vec_tmp = vNULL;
+
+  vec_tmp.create (vec_oprnds0->length ());
+  FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
+    {
+      tree new_tmp1, new_tmp2, new_tmp3, out_type;
+
+      gcc_assert (op_type == binary_op);
+      vop1 = (*vec_oprnds1)[i];
+
+      /* Widen the first vector input.  */
+      out_type = TREE_TYPE (vec_dest);
+      new_tmp1 = make_ssa_name (out_type);
+      new_stmt1 = gimple_build_assign (new_tmp1, NOP_EXPR, vop0);
+      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt1, gsi);
+      if (VECTOR_TYPE_P (TREE_TYPE (vop1)))
+	{
+	  /* Widen the second vector input.  */
+	  new_tmp2 = make_ssa_name (out_type);
+	  new_stmt2 = gimple_build_assign (new_tmp2, NOP_EXPR, vop1);
+	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt2, gsi);
+	  /* Perform the operation.  With both vector inputs widened.  */
+	  new_stmt3 = gimple_build_assign (vec_dest, code1, new_tmp1, new_tmp2);
+	}
+      else
+	{
+	  /* Perform the operation.  With the single vector input widened.  */
+	  new_stmt3 = gimple_build_assign (vec_dest, code1, new_tmp1, vop1);
+	}
+
+      new_tmp3 = make_ssa_name (vec_dest, new_stmt3);
+      gimple_assign_set_lhs (new_stmt3, new_tmp3);
+      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt3, gsi);
+
+      /* Store the results for the next step.  */
+      vec_tmp.quick_push (new_tmp3);
+    }
+
+  vec_oprnds0->release ();
+  *vec_oprnds0 = vec_tmp;
+}
+
 
 /* Check if STMT_INFO performs a conversion operation that can be vectorized.
    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
@@ -4697,7 +4755,13 @@ vectorizable_conversion (vec_info *vinfo,
   nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
   nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
   if (known_eq (nunits_out, nunits_in))
-    modifier = NONE;
+    if (code == WIDEN_MINUS_EXPR
+	|| code == WIDEN_PLUS_EXPR
+	|| code == WIDEN_LSHIFT_EXPR
+	|| code == WIDEN_MULT_EXPR)
+      modifier = WIDEN;
+    else
+      modifier = NONE;
   else if (multiple_p (nunits_out, nunits_in))
     modifier = NARROW;
   else
@@ -4743,9 +4807,18 @@ vectorizable_conversion (vec_info *vinfo,
       return false;
 
     case WIDEN:
-      if (supportable_widening_operation (vinfo, code, stmt_info, vectype_out,
-					  vectype_in, &code1, &code2,
-					  &multi_step_cvt, &interm_types))
+      if (known_eq (nunits_in, nunits_out))
+	{
+	  if (!supportable_half_widening_operation (code, vectype_out,
+						   vectype_in, &code1))
+	    goto unsupported;
+	  gcc_assert (!(multi_step_cvt && op_type == binary_op));
+	  break;
+	}
+      if (supportable_widening_operation (vinfo, code, stmt_info,
+					  vectype_out, vectype_in, &code1,
+					  &code2, &multi_step_cvt,
+					  &interm_types))
 	{
 	  /* Binary widening operation can only be supported directly by the
 	     architecture.  */
@@ -4981,10 +5054,16 @@ vectorizable_conversion (vec_info *vinfo,
 	      c1 = codecvt1;
 	      c2 = codecvt2;
 	    }
-	  vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
-						  &vec_oprnds1, stmt_info,
-						  this_dest, gsi,
-						  c1, c2, op_type);
+	  if (known_eq (nunits_out, nunits_in))
+	    vect_create_half_widening_stmts (vinfo, &vec_oprnds0,
+					     &vec_oprnds1, stmt_info,
+					     this_dest, gsi,
+					     c1, op_type);
+	  else
+	    vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
+						    &vec_oprnds1, stmt_info,
+						    this_dest, gsi,
+						    c1, c2, op_type);
 	}
 
       FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
-- 
2.17.1

Reply via email to