Re: [PATCH] [AArch64] PR target/71663 Improve Vector Initializtion

2017-06-14 Thread James Greenhalgh
On Wed, Jun 14, 2017 at 08:53:27AM +, Hurugalawadi, Naveen wrote:
> Hi James,
> 
> >> Could you make the testcase a bit more comprehensive? 
> 
> Modified the testcase considering all the possible cases.
> Split up the test based on different scenarios.
> 
> Please review the patch and let us know if its okay?

OK with an appropriate ChangeLog. Thanks for the patch!

Thanks,
James




Re: [PATCH] [AArch64] PR target/71663 Improve Vector Initializtion

2017-06-14 Thread Hurugalawadi, Naveen
Hi James,

>> Could you make the testcase a bit more comprehensive? 

Modified the testcase considering all the possible cases.
Split up the test based on different scenarios.

Please review the patch and let us know if its okay?

Thanks,
Naveendiff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index bce490f..239ba72 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11707,6 +11707,57 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   return;
 }
 
+  enum insn_code icode = optab_handler (vec_set_optab, mode);
+  gcc_assert (icode != CODE_FOR_nothing);
+
+  /* If there are only variable elements, try to optimize
+ the insertion using dup for the most common element
+ followed by insertions.  */
+
+  /* The algorithm will fill matches[*][0] with the earliest matching element,
+ and matches[X][1] with the count of duplicate elements (if X is the
+ earliest element which has duplicates).  */
+
+  if (n_var == n_elts && n_elts <= 16)
+{
+  int matches[16][2] = {0};
+  for (int i = 0; i < n_elts; i++)
+	{
+	  for (int j = 0; j <= i; j++)
+	{
+	  if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
+		{
+		  matches[i][0] = j;
+		  matches[j][1]++;
+		  break;
+		}
+	}
+	}
+  int maxelement = 0;
+  int maxv = 0;
+  for (int i = 0; i < n_elts; i++)
+	if (matches[i][1] > maxv)
+	  {
+	maxelement = i;
+	maxv = matches[i][1];
+	  }
+
+  /* Create a duplicate of the most common element.  */
+  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+  aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
+
+  /* Insert the rest.  */
+  for (int i = 0; i < n_elts; i++)
+	{
+	  rtx x = XVECEXP (vals, 0, i);
+	  if (matches[i][0] == maxelement)
+	continue;
+	  x = copy_to_mode_reg (inner_mode, x);
+	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+	}
+  return;
+}
+
   /* Initialise a vector which is part-variable.  We want to first try
  to build those lanes which are constant in the most efficient way we
  can.  */
@@ -11740,10 +11791,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 }
 
   /* Insert the variable lanes directly.  */
-
-  enum insn_code icode = optab_handler (vec_set_optab, mode);
-  gcc_assert (icode != CODE_FOR_nothing);
-
   for (int i = 0; i < n_elts; i++)
 {
   rtx x = XVECEXP (vals, 0, i);
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-init-1.c b/gcc/testsuite/gcc.target/aarch64/vect-init-1.c
new file mode 100644
index 000..90ba3ae
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-init-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b, float c, float d)
+{
+  return (vector float) { a, b, c, d };
+}
+
+/* { dg-final { scan-assembler-not "movi\t" } } */
+/* { dg-final { scan-assembler-not "orr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-init-2.c b/gcc/testsuite/gcc.target/aarch64/vect-init-2.c
new file mode 100644
index 000..0444675
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-init-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b, float d)
+{
+  return (vector float) { a, b, a, d };
+}
+
+/* { dg-final { scan-assembler-not "movi\t" } } */
+/* { dg-final { scan-assembler-not "orr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-init-3.c b/gcc/testsuite/gcc.target/aarch64/vect-init-3.c
new file mode 100644
index 000..b5822b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-init-3.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b)
+{
+  return (vector float) { a, b, a, b };
+}
+
+/* { dg-final { scan-assembler-not "movi\t" } } */
+/* { dg-final { scan-assembler-not "orr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-init-4.c b/gcc/testsuite/gcc.target/aarch64/vect-init-4.c
new file mode 100644
index 000..09a0095
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-init-4.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b)
+{
+  return (vector float) { a, b, b, a };
+}
+
+/* { dg-final { scan-assembler-not "movi\t" } } */
+/* { dg-final { scan-assembler-not "orr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-init-5.c b/gcc/testsuite/gcc.target/aarch64/vect-init-5.c
new file mode 100644
index 000..76d5502
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-init-5.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine 

Re: [PATCH] [AArch64] PR target/71663 Improve Vector Initializtion

2017-06-13 Thread James Greenhalgh
On Tue, Jun 13, 2017 at 10:24:59AM +, Hurugalawadi, Naveen wrote:
> Hi James,
> 
> Thanks for your review and useful comments.
> 
> >> If you could try to keep one reply chain for each patch series
> Will keep that in mind for sure :-)
> 
> >> Very minor, but what is wrong with:
> >> int matches[16][2] = {0};
> Done.
> 
> >> nummatches is unused.
> Removed.
> 
> >> This search algorithm is tough to follow
> Updated as per your comments.
> 
> >> Put braces round this and write it as two statements
> Done.
> 
> >> Move your new code above the part-variable case.
> Done.
> 
> >> c is unused.
> Removed.
> 
> Bootstrapped and Regression tested on aarch64-thunder-linux.
> 
> Please review the patch and let us know if any comments or suggestions.

Almost OK. Could you make the testcase a bit more comprehensive? Test
that the right thing happens with multiple duplicates, that the right thing
happens when there are no duplicates, etc. At the moment the test does not
provide good coverage of the cases your code handles.

With a fuller testcase this will likely be OK, but please repost the patch
for another review.

Thanks,
James

> diff --git a/gcc/testsuite/gcc.target/aarch64/pr71663.c 
> b/gcc/testsuite/gcc.target/aarch64/pr71663.c
> new file mode 100644
> index 000..65f368d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr71663.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#define vector __attribute__((vector_size(16)))
> +
> +vector float combine (float a, float b, float d)
> +{
> +  return (vector float) { a, b, a, d };
> +}
> +
> +/* { dg-final { scan-assembler-not "movi\t" } } */
> +/* { dg-final { scan-assembler-not "orr\t" } } */
> +/* { dg-final { scan-assembler-times "ins\t" 2 } } */
> +/* { dg-final { scan-assembler-times "dup\t" 1 } } */



Re: [PATCH] [AArch64] PR target/71663 Improve Vector Initializtion

2017-06-13 Thread Hurugalawadi, Naveen
Hi James,

Thanks for your review and useful comments.

>> If you could try to keep one reply chain for each patch series
Will keep that in mind for sure :-)

>> Very minor, but what is wrong with:
>> int matches[16][2] = {0};
Done.

>> nummatches is unused.
Removed.

>> This search algorithm is tough to follow
Updated as per your comments.

>> Put braces round this and write it as two statements
Done.

>> Move your new code above the part-variable case.
Done.

>> c is unused.
Removed.

Bootstrapped and Regression tested on aarch64-thunder-linux.

Please review the patch and let us know if any comments or suggestions.

Thanks,
Naveen
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index bce490f..239ba72 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11707,6 +11707,57 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   return;
 }
 
+  enum insn_code icode = optab_handler (vec_set_optab, mode);
+  gcc_assert (icode != CODE_FOR_nothing);
+
+  /* If there are only variable elements, try to optimize
+ the insertion using dup for the most common element
+ followed by insertions.  */
+
+  /* The algorithm will fill matches[*][0] with the earliest matching element,
+ and matches[X][1] with the count of duplicate elements (if X is the
+ earliest element which has duplicates).  */
+
+  if (n_var == n_elts && n_elts <= 16)
+{
+  int matches[16][2] = {0};
+  for (int i = 0; i < n_elts; i++)
+	{
+	  for (int j = 0; j <= i; j++)
+	{
+	  if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
+		{
+		  matches[i][0] = j;
+		  matches[j][1]++;
+		  break;
+		}
+	}
+	}
+  int maxelement = 0;
+  int maxv = 0;
+  for (int i = 0; i < n_elts; i++)
+	if (matches[i][1] > maxv)
+	  {
+	maxelement = i;
+	maxv = matches[i][1];
+	  }
+
+  /* Create a duplicate of the most common element.  */
+  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+  aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
+
+  /* Insert the rest.  */
+  for (int i = 0; i < n_elts; i++)
+	{
+	  rtx x = XVECEXP (vals, 0, i);
+	  if (matches[i][0] == maxelement)
+	continue;
+	  x = copy_to_mode_reg (inner_mode, x);
+	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+	}
+  return;
+}
+
   /* Initialise a vector which is part-variable.  We want to first try
  to build those lanes which are constant in the most efficient way we
  can.  */
@@ -11740,10 +11791,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 }
 
   /* Insert the variable lanes directly.  */
-
-  enum insn_code icode = optab_handler (vec_set_optab, mode);
-  gcc_assert (icode != CODE_FOR_nothing);
-
   for (int i = 0; i < n_elts; i++)
 {
   rtx x = XVECEXP (vals, 0, i);
diff --git a/gcc/testsuite/gcc.target/aarch64/pr71663.c b/gcc/testsuite/gcc.target/aarch64/pr71663.c
new file mode 100644
index 000..65f368d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr71663.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b, float d)
+{
+  return (vector float) { a, b, a, d };
+}
+
+/* { dg-final { scan-assembler-not "movi\t" } } */
+/* { dg-final { scan-assembler-not "orr\t" } } */
+/* { dg-final { scan-assembler-times "ins\t" 2 } } */
+/* { dg-final { scan-assembler-times "dup\t" 1 } } */


Re: [PATCH] [AArch64] PR target/71663 Improve Vector Initializtion

2017-06-09 Thread James Greenhalgh
On Wed, Apr 26, 2017 at 06:34:36AM +, Hurugalawadi, Naveen wrote:
> Hi Kyrill,
> 
> Thanks for the review and your comments.
> 
> >> It would be useful if you expanded a bit on the approach used to
> >> generate the improved codegen
> 
> The patch creates a duplicate of most common element and tries to optimize
> the insertion using dup for the element followed by insertions.
> 
> Current code:
> 
> moviv2.4s, 0
> ins v2.s[0], v0.s[0]
> ins v2.s[1], v1.s[0]
> ins v2.s[2], v0.s[0]
> orr v0.16b, v2.16b, v2.16b
> ins v0.s[3], v3.s[0]
> ret
> 
> 
> Code after the patch:
> 
> dup v0.4s, v0.s[0]
> ins v0.s[1], v1.s[0]
> ins v0.s[3], v3.s[0]
> ret
> 
> 
> >> Some typos
> 
> Modified as required
> 
> >> worth adding a testcase where one of the vector elements appears more than
> >> the others?
> 
> Modified the testcase as required using common element.
> 
> Please review the patch and let us know if its okay?
> Bootstrapped and Regression tested on aarch64-thunder-linux.

This patch fell through the cracks as it shows up in a reply chain with a
few others. If you could try to keep one reply chain for each patch series
you're submitting, that would make tracking the submissions much
easier :-).


> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 2e385c4..8747a23 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -11671,11 +11671,54 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>aarch64_expand_vector_init (target, copy);
>  }
>  
> -  /* Insert the variable lanes directly.  */
> -
>enum insn_code icode = optab_handler (vec_set_optab, mode);
>gcc_assert (icode != CODE_FOR_nothing);
>  
> +  /* If there are only variable elements, try to optimize
> + the insertion using dup for the most common element
> + followed by insertions.  */
> +  if (n_var == n_elts && n_elts <= 16)
> +{
> +  int matches[16][2];
> +  int nummatches = 0;
> +  memset (matches, 0, sizeof(matches));

Very minor, but what is wrong with:

  int matches[16][2] = {0};

Rather than the explicit memset?

> +  for(int i = 0; i < n_elts; i++)
> + {
> +   for (int j = 0; j <= i; j++)
> + {
> +   if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
> + {
> +   matches[i][0] = j;
> +   matches[j][1]++;
> +   if (i != j)
> + nummatches++;

nummatches is unused.

This search algorithm is tough to follow. A comment explaining that you will
fill matches[*][0] with the earliest matching element, and matches[X][1]
with the count of duplicate elements (if X is the earliest element which has
duplicates). Would be useful to understand exactly what you're aiming for.
Certainly it took me a while to understand that for:

  { a, b, c, b, b, d, c, e }

matches would look like:

  { { 0, 1 },
{ 1, 3 },
{ 2. 2 },
{ 1, 0 },
{ 1, 0 },
{ 5, 1 },
{ 2, 0 },
{ 7, 1 } }

> + }
> + }
> + }
> +  int maxelement = 0;
> +  int maxv = 0;
> +  for (int i = 0; i < n_elts; i++)
> + if (matches[i][1] > maxv)
> +   maxelement = i, maxv = matches[i][1];

Put braces round this and write it as two statements, or eliminate the use of
maxv and use matches[i][1] > matches[maxelement][1], but don't use the comma
operator like this please.

> +  /* Create a duplicate of the most common element.  */
> +  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
> +  aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
> +
> +  /* Insert the rest.  */
> +  for (int i = 0; i < n_elts; i++)
> + {
> +   rtx x = XVECEXP (vals, 0, i);
> +   if (matches[i][0] == maxelement)
> + continue;
> +   x = copy_to_mode_reg (inner_mode, x);
> +   emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
> + }
> +  return;
> +}
> +
> +  /* Insert the variable lanes directly.  */

This code would read better if you rearranged the cases. As it stands your
code is added in the middle of a logical operation (deal with constant lanes,
then deal with variable lanes). Move your new code above the part-variable
case.

>for (int i = 0; i < n_elts; i++)
>  {
>rtx x = XVECEXP (vals, 0, i);
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr71663.c 
> b/gcc/testsuite/gcc.target/aarch64/pr71663.c
> new file mode 100644
> index 000..a043a21
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr71663.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#define vector __attribute__((vector_size(16)))
> +
> +vector 

Re: [PING 2] [PATCH] [AArch64] PR target/71663 Improve Vector Initializtion

2017-05-26 Thread Hurugalawadi, Naveen
Hi,  

Please consider this as a personal reminder to review the patch
at following link and let me know your comments on the same.  

https://gcc.gnu.org/ml/gcc-patches/2017-04/msg01260.html

Thanks,
Naveen





Re: [PING] [PATCH] [AArch64] PR target/71663 Improve Vector Initializtion

2017-05-10 Thread Hurugalawadi, Naveen
Hi,  

Please consider this as a personal reminder to review the patch
at following link and let me know your comments on the same.  

https://gcc.gnu.org/ml/gcc-patches/2017-04/msg01260.html

Thanks,
Naveen





Re: [PATCH] [AArch64] PR target/71663 Improve Vector Initializtion

2017-04-26 Thread Hurugalawadi, Naveen
Hi Kyrill,

Thanks for the review and your comments.

>> It would be useful if you expanded a bit on the approach used to
>> generate the improved codegen

The patch creates a duplicate of most common element and tries to optimize
the insertion using dup for the element followed by insertions.

Current code:

moviv2.4s, 0
ins v2.s[0], v0.s[0]
ins v2.s[1], v1.s[0]
ins v2.s[2], v0.s[0]
orr v0.16b, v2.16b, v2.16b
ins v0.s[3], v3.s[0]
ret


Code after the patch:

dup v0.4s, v0.s[0]
ins v0.s[1], v1.s[0]
ins v0.s[3], v3.s[0]
ret


>> Some typos

Modified as required

>> worth adding a testcase where one of the vector elements appears more than
>> the others?

Modified the testcase as required using common element.

Please review the patch and let us know if its okay?
Bootstrapped and Regression tested on aarch64-thunder-linux.

Thanks,
Naveendiff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2e385c4..8747a23 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11671,11 +11671,54 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   aarch64_expand_vector_init (target, copy);
 }
 
-  /* Insert the variable lanes directly.  */
-
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
+  /* If there are only variable elements, try to optimize
+ the insertion using dup for the most common element
+ followed by insertions.  */
+  if (n_var == n_elts && n_elts <= 16)
+{
+  int matches[16][2];
+  int nummatches = 0;
+  memset (matches, 0, sizeof(matches));
+  for(int i = 0; i < n_elts; i++)
+	{
+	  for (int j = 0; j <= i; j++)
+	{
+	  if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
+		{
+		  matches[i][0] = j;
+		  matches[j][1]++;
+		  if (i != j)
+		nummatches++;
+		  break;
+		}
+	}
+	}
+  int maxelement = 0;
+  int maxv = 0;
+  for (int i = 0; i < n_elts; i++)
+	if (matches[i][1] > maxv)
+	  maxelement = i, maxv = matches[i][1];
+
+  /* Create a duplicate of the most common element.  */
+  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+  aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
+
+  /* Insert the rest.  */
+  for (int i = 0; i < n_elts; i++)
+	{
+	  rtx x = XVECEXP (vals, 0, i);
+	  if (matches[i][0] == maxelement)
+	continue;
+	  x = copy_to_mode_reg (inner_mode, x);
+	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+	}
+  return;
+}
+
+  /* Insert the variable lanes directly.  */
   for (int i = 0; i < n_elts; i++)
 {
   rtx x = XVECEXP (vals, 0, i);
diff --git a/gcc/testsuite/gcc.target/aarch64/pr71663.c b/gcc/testsuite/gcc.target/aarch64/pr71663.c
new file mode 100644
index 000..a043a21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr71663.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b, float c, float d)
+{
+  return (vector float) { a, b, a, d };
+}
+
+/* { dg-final { scan-assembler-not "movi\t" } } */
+/* { dg-final { scan-assembler-not "orr\t" } } */
+/* { dg-final { scan-assembler-times "ins\t" 2 } } */
+/* { dg-final { scan-assembler-times "dup\t" 1 } } */


Re: [PATCH] [AArch64] PR target/71663 Improve Vector Initializtion

2017-04-25 Thread Kyrill Tkachov

Hi Naveen,

On 09/12/16 07:02, Hurugalawadi, Naveen wrote:

Hi,

Sorry. Missed out the testcase in patch submission.
Added the missing testcase along with the ChangeLog.
Please review the same and let us know if thats okay?


It would be useful if you expanded a bit on the approach used to
generate the improved codegen, or at least show for the testcase
what code was generated before this patch and what is generated
after this patch.


2016-12-09  Andrew PInski  

gcc
 * config/aarch64/aarch64.c (aarch64_expand_vector_init):
 Improve vector initialization code gen.
gcc/testsuite
 * gcc.target/aarch64/pr71663.c: New Testcase.


+  /* If there is only varables, try to optimize
+ the inseration using dup for the most common element
+ followed by insertations. */

Some typos: s/is only varables/are only variable elements/, 
s/inseration/insertion/,
s/insertations/insertions/.

 +  if (n_var == n_elts && n_elts <= 16)


diff --git a/gcc/testsuite/gcc.target/aarch64/pr71663.c 
b/gcc/testsuite/gcc.target/aarch64/pr71663.c
new file mode 100644
index 000..c8df847
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr71663.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b, float c, float d)
+{
+  return (vector float) { a, b, c, d };
+}

A large part of the aarch64.c hunk of your patch deals with finding the most 
commonly-occuring
element in the vector of variables, yet in your testcase all variables appear 
exactly once.
Perhaps worth adding a testcase where one of the vector elements appears more 
than the others?
I'd guess the codegen then would be better with this patch?

Cheers,
Kyrill



[PING] [PATCH] [AArch64] PR target/71663 Improve Vector Initializtion

2017-04-25 Thread Hurugalawadi, Naveen
Hi,

Please consider this as a personal reminder to review the patch
at following link and let me know your comments on the same.

https://gcc.gnu.org/ml/gcc-patches/2016-12/msg00718.html

Thanks,
Naveen






Re: [PATCH] [AArch64] PR target/71663 Improve Vector Initializtion

2017-02-05 Thread Hurugalawadi, Naveen
Hi,

Please consider this as a personal reminder to review the patch
at following link and let me know your comments on the same.

https://gcc.gnu.org/ml/gcc-patches/2016-12/msg00718.html

Thanks,
Naveen






Re: [PATCH] [AArch64] PR target/71663 Improve Vector Initializtion

2016-12-08 Thread Hurugalawadi, Naveen
Hi,

Sorry. Missed out the testcase in patch submission.
Added the missing testcase along with the ChangeLog.
Please review the same and let us know if thats okay?

2016-12-09  Andrew PInski  

gcc
    * config/aarch64/aarch64.c (aarch64_expand_vector_init):
    Improve vector initialization code gen.
gcc/testsuite
* gcc.target/aarch64/pr71663.c: New Testcase.diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e87831f..da5b6fa 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11609,11 +11609,54 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   aarch64_expand_vector_init (target, copy);
 }
 
-  /* Insert the variable lanes directly.  */
-
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
+  /* If there is only varables, try to optimize
+ the inseration using dup for the most common element
+ followed by insertations. */
+  if (n_var == n_elts && n_elts <= 16)
+{
+  int matches[16][2];
+  int nummatches = 0;
+  memset (matches, 0, sizeof(matches));
+  for(int i = 0; i < n_elts; i++)
+	{
+	  for (int j = 0; j <= i; j++)
+	{
+	  if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
+		{
+		  matches[i][0] = j;
+		  matches[j][1]++;
+		  if (i != j)
+		nummatches++;
+		  break;
+		}
+	}
+	}
+  int maxelement = 0;
+  int maxv = 0;
+  for (int i = 0; i < n_elts; i++)
+	if (matches[i][1] > maxv)
+	  maxelement = i, maxv = matches[i][1];
+
+  /* Create a duplicate of the most common element. */
+  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+  aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
+  /* Insert the rest. */
+  for (int i = 0; i < n_elts; i++)
+	{
+	  rtx x = XVECEXP (vals, 0, i);
+	  if (matches[i][0] == maxelement)
+	continue;
+	  x = copy_to_mode_reg (inner_mode, x);
+	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+	}
+  return;
+}
+
+  /* Insert the variable lanes directly.  */
+
   for (int i = 0; i < n_elts; i++)
 {
   rtx x = XVECEXP (vals, 0, i);
diff --git a/gcc/testsuite/gcc.target/aarch64/pr71663.c b/gcc/testsuite/gcc.target/aarch64/pr71663.c
new file mode 100644
index 000..c8df847
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr71663.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b, float c, float d)
+{
+  return (vector float) { a, b, c, d };
+}
+
+/* { dg-final { scan-assembler-not "movi\t" } } */
+/* { dg-final { scan-assembler-not "orr\t" } } */
+/* { dg-final { scan-assembler-times "ins\t" 3 } } */
+/* { dg-final { scan-assembler-times "dup\t" 1 } } */


[PATCH] [AArch64] PR target/71663 Improve Vector Initializtion

2016-12-08 Thread Hurugalawadi, Naveen
Hi,

The AArch64 vector initialization sequence can be optimized to generate
better code. The attached patch handles for the case where the vector
contains only variables. It checks for the common elements in the vector
and inserts the values in optimized way.

Bootstrapped and Regression tested on aarch64-thunder-linux.
Please review the patch and let us know if its okay?

2016-12-09  Andrew PInski  

gcc
* config/aarch64/aarch64.c (aarch64_expand_vector_init):
Improve vector initialization code gen.diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e87831f..da5b6fa 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11609,11 +11609,54 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   aarch64_expand_vector_init (target, copy);
 }
 
-  /* Insert the variable lanes directly.  */
-
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
+  /* If there is only varables, try to optimize
+ the inseration using dup for the most common element
+ followed by insertations. */
+  if (n_var == n_elts && n_elts <= 16)
+{
+  int matches[16][2];
+  int nummatches = 0;
+  memset (matches, 0, sizeof(matches));
+  for(int i = 0; i < n_elts; i++)
+	{
+	  for (int j = 0; j <= i; j++)
+	{
+	  if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
+		{
+		  matches[i][0] = j;
+		  matches[j][1]++;
+		  if (i != j)
+		nummatches++;
+		  break;
+		}
+	}
+	}
+  int maxelement = 0;
+  int maxv = 0;
+  for (int i = 0; i < n_elts; i++)
+	if (matches[i][1] > maxv)
+	  maxelement = i, maxv = matches[i][1];
+
+  /* Create a duplicate of the most common element. */
+  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+  aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
+  /* Insert the rest. */
+  for (int i = 0; i < n_elts; i++)
+	{
+	  rtx x = XVECEXP (vals, 0, i);
+	  if (matches[i][0] == maxelement)
+	continue;
+	  x = copy_to_mode_reg (inner_mode, x);
+	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+	}
+  return;
+}
+
+  /* Insert the variable lanes directly.  */
+
   for (int i = 0; i < n_elts; i++)
 {
   rtx x = XVECEXP (vals, 0, i);