On Thu, Jul 27, 2017 at 06:49:01PM +0100, James Greenhalgh wrote:
>
> On Mon, Jun 12, 2017 at 02:44:40PM +0100, James Greenhalgh wrote:
> > [Sorry for the re-send. I spotted that the attributes were not right for the
> >  new pattern I was adding. The change between this and the first version 
> > was:
> >
> >   +  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
> >   +   (set_attr "length" "4,4,4,12")]
> > ]
> >
> > ---
> >
> > Hi,
> >
> > In this testcase, all argument registers and the return register
> > will be general purpose registers:
> >
> >   long long
> >   foo (long long a, long long b, long long c)
> >   {
> >     return ((a ^ b) & c) ^ b;
> >   }
> >
> > However, due to the implementation of aarch64_simd_bsl<mode>_internal
> > we'll match that pattern and emit a BSL, necessitating moving all those
> > arguments and results to the Advanced SIMD registers:
> >
> >     fmov    d2, x0
> >     fmov    d0, x2
> >     fmov    d1, x1
> >     bsl     v0.8b, v2.8b, v1.8b
> >     fmov    x0, d0
> >
> > To fix this, we turn aarch64_simd_bsldi_internal into an insn_and_split that
> > knows to split back to integer operations if the register allocation
> > falls that way.
> >
> > We could have used an unspec, but then we lose some of the nice
> > simplifications that can be made from explicitly spelling out the semantics
> > of BSL.
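> >
> > (As an illustration - assuming the split takes the general-purpose path -
> > the testcase above would then be expected to compile down to plain integer
> > logic along the lines of:
> >
> >     eor     x0, x0, x1
> >     and     x0, x0, x2
> >     eor     x0, x0, x1
> >     ret
> >
> > rather than the fmov/bsl sequence shown earlier.)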
>
> Off list, Richard and I found considerable issues with this patch, from
> idioms failing to match in the testcase, to subtle register allocation bugs,
> to the potential for suboptimal code generation.
>
> That's not good!
>
> This spin of the patch corrects those issues by adding a second split
> pattern, allocating a temporary register if we're permitted to, or
> properly using tied output operands if we're not, and by generally playing
> things a bit safer around the possibility of register overlaps.
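>
> (Concretely - this is just a sketch of the logic in the split bodies of the
> patch below, not anything new - the scratch register is chosen like so:
>
>   rtx scratch;
>   if (reload_completed)
>     scratch = operands[0];            /* Early-clobber destination.  */
>   else if (can_create_pseudo_p ())
>     scratch = gen_reg_rtx (DImode);   /* Fresh temporary before RA.  */
>   else
>     FAIL;                             /* Can't yet prove operands[0] is safe.  */
>
> so operands[0] is only reused once its early-clobber constraint has
> guaranteed that it doesn't overlap the inputs.)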
>
> The testcase is expanded to an execute test, giving a little more assurance
> that we're doing the right thing - it now tests BSL idiom generation in both
> general-purpose and floating-point registers and compares the results.
> Hopefully we're now a bit more robust!
>
> Bootstrapped and tested on aarch64-none-linux-gnu and cross-tested on
> aarch64-none-elf with no issues.
>
> OK for trunk?

I'd like to ping this patch for review (I would like an explicit review, as an
earlier revision had bugs). Attached is a rebase of this patch on trunk.

OK? If so, please commit it on my behalf, as I will be out of the office for a
few weeks taking some time off.

Thanks,
James

---
gcc/

2017-10-04  James Greenhalgh  <james.greenha...@arm.com>

        * config/aarch64/aarch64-simd.md
        (aarch64_simd_bsl<mode>_internal): Remove DImode.
        (*aarch64_simd_bsl<mode>_alt): Likewise.
        (aarch64_simd_bsldi_internal): New.
        (aarch64_simd_bsldi_alt): Likewise.

gcc/testsuite/

2017-10-04  James Greenhalgh  <james.greenha...@arm.com>

        * gcc.target/aarch64/bsl-idiom.c: New.
        * gcc.target/aarch64/copysign-bsl.c: New.

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 70e9339..1888d2f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2322,13 +2322,13 @@
 ;; in *aarch64_simd_bsl<mode>_alt.
 
 (define_insn "aarch64_simd_bsl<mode>_internal"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
 	       (match_operand:<V_INT_EQUIV> 3 "register_operand" "w,0,w")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
-	     (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
+	       (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
+	     (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
 	  (match_dup:<V_INT_EQUIV> 3)
 	))]
   "TARGET_SIMD"
@@ -2346,14 +2346,14 @@
 ;; permutations of commutative operations, we have to have a separate pattern.
 
 (define_insn "*aarch64_simd_bsl<mode>_alt"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
-	       (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
-	      (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
-	  (match_dup:VSDQ_I_DI 2)))]
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
+	       (match_operand:VDQ_I 3 "register_operand" "w,w,0")
+	       (match_operand:<V_INT_EQUIV> 2 "register_operand" "w,0,w"))
+	      (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
+	  (match_dup:<V_INT_EQUIV> 2)))]
   "TARGET_SIMD"
   "@
   bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
@@ -2362,6 +2362,100 @@
   [(set_attr "type" "neon_bsl<q>")]
 )
 
+;; DImode is special, we want to avoid computing operations which are
+;; more naturally computed in general purpose registers in the vector
+;; registers.  If we do that, we need to move all three operands from general
+;; purpose registers to vector registers, then back again.  However, we
+;; don't want to make this pattern an UNSPEC as we'd lose scope for
+;; optimizations based on the component operations of a BSL.
+;;
+;; That means we need a splitter back to the individual operations, if they
+;; would be better calculated on the integer side.
+
+(define_insn_and_split "aarch64_simd_bsldi_internal"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,0,w,r")
+	       (match_operand:DI 2 "register_operand" "w,w,0,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	  (match_dup:DI 3)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %2.8b, %3.8b
+  bit\\t%0.8b, %2.8b, %1.8b
+  bif\\t%0.8b, %3.8b, %1.8b
+  #"
+  "&& GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 1) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  If we're before reload, and
+     able to create a temporary register, do so.  If we're after reload,
+     we've got an early-clobber destination register, so use that.
+     Otherwise, we can't create pseudos and we can't yet guarantee that
+     operands[0] is safe to write, so FAIL to split.  */
+
+  rtx scratch;
+  if (reload_completed)
+    scratch = operands[0];
+  else if (can_create_pseudo_p ())
+    scratch = gen_reg_rtx (DImode);
+  else
+    FAIL;
+
+  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+  emit_insn (gen_xordi3 (operands[0], scratch, operands[3]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
+(define_insn_and_split "aarch64_simd_bsldi_alt"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,w,0,r")
+	       (match_operand:DI 2 "register_operand" "w,0,w,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	  (match_dup:DI 2)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %3.8b, %2.8b
+  bit\\t%0.8b, %3.8b, %1.8b
+  bif\\t%0.8b, %2.8b, %1.8b
+  #"
+  "&& GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  If we're before reload, and
+     able to create a temporary register, do so.  If we're after reload,
+     we've got an early-clobber destination register, so use that.
+     Otherwise, we can't create pseudos and we can't yet guarantee that
+     operands[0] is safe to write, so FAIL to split.  */
+
+  rtx scratch;
+  if (reload_completed)
+    scratch = operands[0];
+  else if (can_create_pseudo_p ())
+    scratch = gen_reg_rtx (DImode);
+  else
+    FAIL;
+
+  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+  emit_insn (gen_xordi3 (operands[0], scratch, operands[2]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
 (define_expand "aarch64_simd_bsl<mode>"
   [(match_operand:VALLDIF 0 "register_operand")
    (match_operand:<V_INT_EQUIV> 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c b/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c
new file mode 100644
index 0000000..8151387
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c
@@ -0,0 +1,88 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-rtl-combine --save-temps" } */
+
+/* Test that we don't generate BSL when in DImode with values in integer
+   registers, and do generate it where we have values in floating-point
+   registers.  This is useful, as it allows us to avoid register moves
+   in the general case.
+
+   We want:
+	eor	x0, x0, x1
+	and	x0, x0, x2
+	eor	x0, x0, x1
+	ret
+
+   Rather than:
+	fmov	d2, x0
+	fmov	d0, x2
+	fmov	d1, x1
+	bsl	v0.8b, v2.8b, v1.8b
+	fmov	x0, d0
+	ret  */
+
+extern void abort (void);
+
+unsigned long long __attribute__ ((noinline))
+foo (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  return ((a ^ b) & c) ^ b;
+}
+
+unsigned long long __attribute__ ((noinline))
+foo2 (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  return ((a ^ b) & c) ^ a;
+}
+
+#define force_simd(V1)   asm volatile ("mov %d0, %1.d[0]"	\
+	   : "=w"(V1)						\
+	   : "w"(V1)						\
+	   : /* No clobbers */);
+
+unsigned long long __attribute__ ((noinline))
+bar (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  force_simd (a);
+  force_simd (b);
+  force_simd (c);
+  c = ((a ^ b) & c) ^ b;
+  force_simd (c);
+  return c;
+}
+
+unsigned long long __attribute__ ((noinline))
+bar2 (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  force_simd (a);
+  force_simd (b);
+  force_simd (c);
+  c = ((a ^ b) & c) ^ a;
+  force_simd (c);
+  return c;
+}
+
+int
+main (int argc, char** argv)
+{
+  unsigned long long a = 0x0123456789abcdefULL;
+  unsigned long long b = 0xfedcba9876543210ULL;
+  unsigned long long c = 0xaabbccddeeff7777ULL;
+  if (foo (a, b, c) != bar (a, b, c))
+    abort ();
+  if (foo2 (a, b, c) != bar2 (a, b, c))
+    abort ();
+  return 0;
+}
+
+/* 2 BSL, 6 FMOV (to floating-point registers), and 2 FMOV (to general
+purpose registers) for the "bar" tests, which should still use BSL.  */
+/* { dg-final { scan-assembler-times "bsl\tv\[0-9\]" 2 } } */
+/* { dg-final { scan-assembler-times "fmov\td\[0-9\]" 6 } } */
+/* { dg-final { scan-assembler-times "fmov\tx\[0-9\]" 2 } } */
+
+/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */
+
+/* We always match the idiom during combine.  */
+/* { dg-final { scan-rtl-dump-times "aarch64_simd_bsldi_internal" 2 "combine" } } */
+/* { dg-final { scan-rtl-dump-times "aarch64_simd_bsldi_alt" 2 "combine" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c b/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c
new file mode 100644
index 0000000..4e63511
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* Test that we can generate DImode BSL when we are using
+   copysign.  */
+
+double
+foo (double a, double b)
+{
+  return __builtin_copysign (a, b);
+}
+
+/* { dg-final { scan-assembler "bsl\tv\[0-9\]" } } */
