RE: [patch 1/2][aarch64]: redefine aes patterns

2019-07-08 Thread Sylvia Taylor
Hi James,

I forgot to mention that. Yes, please do commit it on my behalf.

Cheers,
Syl


[patch 2/2][arm]: redefine aes patterns

2019-07-05 Thread Sylvia Taylor
Greetings,

This patch removes the arch-common aese/aesmc and aesd/aesimc fusions
(i.e. aes fusion) implemented in the scheduling phase through the
aarch_crypto_can_dual_issue function. The reason is that undesired
behaviour has been observed in cases such as:
- when register allocation goes bad (e.g. extra movs)
- aes operations with xor and zeroed keys among interleaved operations

A more stable approach is to instead do the aes fusion during the combine
pass. As such, new combine patterns have been added to enable this.

The second change is that the aese and aesd patterns have been rewritten
to encapsulate an xor operation. The purpose is to avoid the need for
additional combine patterns for cases like the ones below:

For AESE (though it also applies to AESD as both have an xor operation):

data = data ^ key;
data = vaeseq_u8(data, zero);
---
veor    q1, q0, q1
aese.8  q1, q9

This should be equivalent to, and generate the same code as:

data = vaeseq_u8(data, key);
---
aese.8  q1, q0

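For illustration, a minimal standalone example of this case (a sketch only,
not taken from the new testcases; foo_key_first is a made-up name), where
the veor is expected to fold into the aese now that the pattern contains
the xor:

#include <arm_neon.h>

/* Sketch: build with e.g. -O2 -mfpu=crypto-neon-fp-armv8 -mfloat-abi=hard.
   The explicit veor of data and key should be absorbed into the aese.8,
   since the aese pattern now encapsulates the xor.  */
uint8x16_t
foo_key_first (uint8x16_t data, uint8x16_t key)
{
  uint8x16_t zero = vdupq_n_u8 (0);
  data = veorq_u8 (data, key);   /* data = data ^ key  */
  return vaeseq_u8 (data, zero); /* aese with a zeroed key  */
}
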
Bootstrapped and tested on arm-none-linux-gnueabihf.

Cheers,
Syl

gcc/ChangeLog:

2019-07-05  Sylvia Taylor  

* config/arm/crypto.md:
(crypto_): Redefine aese/aesd pattern with xor.
(crypto_): Remove attribute enabled for aesmc.
(crypto_): Split CRYPTO_BINARY into 2 patterns.
(*aarch32_crypto_aese_fused, *aarch32_crypto_aesd_fused): New.
* config/arm/arm.c
(aarch_macro_fusion_pair_p): Remove aes/aesmc fusion check.
* config/arm/aarch-common-protos.h
(aarch_crypto_can_dual_issue): Remove.
* config/arm/aarch-common.c 
(aarch_crypto_can_dual_issue): Likewise.
* config/arm/exynos-m1.md: Remove aese/aesmc fusion.
* config/arm/cortex-a53.md: Likewise.
* config/arm/cortex-a57.md: Likewise.
* config/arm/iterators.md:
(CRYPTO_BINARY): Redefine.
(CRYPTO_UNARY): Removed.
(CRYPTO_AES, CRYPTO_AESMC): New.

gcc/testsuite/ChangeLog:

2019-07-05  Sylvia Taylor  

* gcc.target/arm/aes-fuse-1.c: New.
* gcc.target/arm/aes-fuse-2.c: New.
* gcc.target/arm/aes_xor_combine.c: New.
diff --git a/gcc/config/arm/aarch-common-protos.h 
b/gcc/config/arm/aarch-common-protos.h
index 
11cd5145bbc77ab35e7874a75a93ec0e7bb0ea28..3bf38a104f6941eec1ce88db7d6b6ceb7da0af92
 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -24,7 +24,6 @@
 #define GCC_AARCH_COMMON_PROTOS_H
 
 extern int aarch_accumulator_forwarding (rtx_insn *, rtx_insn *);
-extern int aarch_crypto_can_dual_issue (rtx_insn *, rtx_insn *);
 extern bool aarch_rev16_p (rtx);
 extern bool aarch_rev16_shleft_mask_imm_p (rtx, machine_mode);
 extern bool aarch_rev16_shright_mask_imm_p (rtx, machine_mode);
diff --git a/gcc/config/arm/aarch-common.c b/gcc/config/arm/aarch-common.c
index 
c7af12d4cd1714c70ebc6d6c7d4454606d15f864..965a07a43e3129dd1743d4a79813a597feca0b71
 100644
--- a/gcc/config/arm/aarch-common.c
+++ b/gcc/config/arm/aarch-common.c
@@ -31,46 +31,6 @@
 #include "rtl-iter.h"
 #include "memmodel.h"
 
-/* In ARMv8-A there's a general expectation that AESE/AESMC
-   and AESD/AESIMC sequences of the form:
-
-   AESE Vn, _
-   AESMC Vn, Vn
-
-   will issue both instructions in a single cycle on super-scalar
-   implementations.  This function identifies such pairs.  */
-
-int
-aarch_crypto_can_dual_issue (rtx_insn *producer_insn, rtx_insn *consumer_insn)
-{
-  rtx producer_set, consumer_set;
-  rtx producer_src, consumer_src;
-
-  producer_set = single_set (producer_insn);
-  consumer_set = single_set (consumer_insn);
-
-  producer_src = producer_set ? SET_SRC (producer_set) : NULL;
-  consumer_src = consumer_set ? SET_SRC (consumer_set) : NULL;
-
-  if (producer_src && consumer_src
-  && GET_CODE (producer_src) == UNSPEC && GET_CODE (consumer_src) == UNSPEC
-  && ((XINT (producer_src, 1) == UNSPEC_AESE
-   && XINT (consumer_src, 1) == UNSPEC_AESMC)
-  || (XINT (producer_src, 1) == UNSPEC_AESD
-  && XINT (consumer_src, 1) == UNSPEC_AESIMC)))
-  {
-unsigned int regno = REGNO (SET_DEST (producer_set));
-
-/* Before reload the registers are virtual, so the destination of
-   consumer_set doesn't need to match.  */
-
-return (REGNO (SET_DEST (consumer_set)) == regno || !reload_completed)
-   && REGNO (XVECEXP (consumer_src, 0, 0)) == regno;
-  }
-
-  return 0;
-}
-
 /* Return TRUE if X is either an arithmetic shift left, or
is a multiplication by a power of two.  */
 bool
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 
e9aba65c70563f23ba3049702072a59cf555b9ce..5c5129a8e52adb07bb431eb51c6f6239b9b0c941
 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -30565,10 +30565,6 @@ aarch_macro_fusion_pair_p (rtx_insn* prev, rtx_insn* 
curr)
   if (!arm_macro_fusion_p (

[patch 1/2][aarch64]: redefine aes patterns

2019-07-05 Thread Sylvia Taylor
Greetings,

This first patch removes aarch64 usage of the aese/aesmc and aesd/aesimc
fusions (i.e. aes fusion) implemented in the scheduler due to unpredictable
behaviour observed in cases such as:
- when register allocation goes bad (e.g. extra movs)
- aes operations with xor and zeroed keys among interleaved operations

A more stable approach is to instead do the aes fusion during the combine
pass. Since the aese and aesd patterns have been rewritten to encapsulate
an xor operation, the existing combine fusion patterns have also been
updated. The purpose is to avoid the need for additional combine patterns
for cases like the ones below:

For AESE (though it also applies to AESD as both have an xor operation):

data = data ^ key;
data = vaeseq_u8(data, zero);
---
eor v1.16b, v0.16b, v1.16b
aese    v1.16b, v2.16b

This should be equivalent to, and generate the same code as:

data = vaeseq_u8(data, key);
---
aese    v1.16b, v0.16b

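For illustration, a minimal sketch (not taken from the new testcases;
aes_round is a made-up name) of the aese/aesmc pair that the updated
combine-based fusion is meant to keep back to back:

#include <arm_neon.h>

/* Sketch: build with e.g. -O2 -march=armv8-a+crypto.  The aese result
   feeds aesmc directly, so the two instructions should stay adjacent
   with matching registers, as expected by cores that fuse them.  */
uint8x16_t
aes_round (uint8x16_t data, uint8x16_t key)
{
  data = vaeseq_u8 (data, key);  /* aese  vD.16b, vK.16b  */
  return vaesmcq_u8 (data);      /* aesmc vD.16b, vD.16b  */
}
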
Bootstrapped and tested on aarch64-none-linux-gnu.

Cheers,
Syl

gcc/ChangeLog:

2019-07-05  Sylvia Taylor  

* config/aarch64/aarch64-simd.md
(aarch64_crypto_aesv16qi): Redefine pattern with xor.
(aarch64_crypto_aesv16qi): Remove attribute enabled.
(*aarch64_crypto_aesv16qi_xor_combine): Remove both.
(*aarch64_crypto_aese_fused,
*aarch64_crypto_aesd_fused): Update to new definition.
* config/aarch64/aarch64.c
(aarch_macro_fusion_pair_p): Remove aese/aesmc fusion check.

gcc/testsuite/ChangeLog:

2019-07-05  Sylvia Taylor  

* gcc.target/aarch64/crypto-fuse-1.c: Remove.
* gcc.target/aarch64/crypto-fuse-2.c: Remove.
* gcc.target/aarch64/aes-fuse-1.c: New testcase.
* gcc.target/aarch64/aes-fuse-2.c: New testcase.
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
83f5c1fc2c27b265d528e9d5a02c05cc7fe5001f..1bcc50081f50a89f4951b15d7c465e03d8d9fb81
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -6053,56 +6053,23 @@
 
 (define_insn "aarch64_crypto_aesv16qi"
   [(set (match_operand:V16QI 0 "register_operand" "=w")
-   (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "%0")
-  (match_operand:V16QI 2 "register_operand" "w")]
+   (unspec:V16QI
+   [(xor:V16QI
+(match_operand:V16QI 1 "register_operand" "%0")
+(match_operand:V16QI 2 "register_operand" "w"))]
  CRYPTO_AES))]
   "TARGET_SIMD && TARGET_AES"
   "aes\\t%0.16b, %2.16b"
   [(set_attr "type" "crypto_aese")]
 )
 
-(define_insn "*aarch64_crypto_aesv16qi_xor_combine"
-  [(set (match_operand:V16QI 0 "register_operand" "=w")
-   (unspec:V16QI [(xor:V16QI
-   (match_operand:V16QI 1 "register_operand" "%0")
-   (match_operand:V16QI 2 "register_operand" "w"))
-  (match_operand:V16QI 3 "aarch64_simd_imm_zero" "")]
-  CRYPTO_AES))]
-  "TARGET_SIMD && TARGET_AES"
-  "aes\\t%0.16b, %2.16b"
-  [(set_attr "type" "crypto_aese")]
-)
-
-(define_insn "*aarch64_crypto_aesv16qi_xor_combine"
-  [(set (match_operand:V16QI 0 "register_operand" "=w")
-   (unspec:V16QI [(match_operand:V16QI 3 "aarch64_simd_imm_zero" "")
-   (xor:V16QI (match_operand:V16QI 1 "register_operand" "%0")
-  (match_operand:V16QI 2 "register_operand" "w"))]
-   CRYPTO_AES))]
-  "TARGET_SIMD && TARGET_AES"
-  "aes\\t%0.16b, %2.16b"
-  [(set_attr "type" "crypto_aese")]
-)
-
-;; When AES/AESMC fusion is enabled we want the register allocation to
-;; look like:
-;;AESE Vn, _
-;;AESMC Vn, Vn
-;; So prefer to tie operand 1 to operand 0 when fusing.
-
 (define_insn "aarch64_crypto_aesv16qi"
-  [(set (match_operand:V16QI 0 "register_operand" "=w,w")
-   (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0,w")]
+  [(set (match_operand:V16QI 0 "register_operand" "=w")
+   (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "w")]
 CRYPTO_AESMC))]
   "TARGET_SIMD && TARGET_AES"
   "aes\\t%0.16b, %1.16b"
-  [(set_attr "type" "crypto_aesmc")
-   (set_attr_alternative "enabled"
- [(if_then_else (match_test
-  "aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)")
-(const_string "yes" 

[patch2/2][arm]: remove builtin expand for sha1

2019-07-03 Thread Sylvia Taylor
Greetings,

This patch removes the builtin expand handling for sha1h/c/m/p and
replaces it with expand patterns. This should make it more consistent
with how we handle intrinsic implementations, and it cleans up the custom
sha1 code in arm_expand_unop_builtin and arm_expand_ternop_builtin.

Bootstrapped and tested on arm-none-linux-gnueabihf.

Cheers,
Syl

gcc/ChangeLog:

2019-07-03  Sylvia Taylor  

* config/arm/arm-builtins.c
(arm_expand_ternop_builtin): Remove builtin_sha1cpm.
(arm_expand_unop_builtin): Remove builtin_sha1h.
* config/arm/crypto.md
(crypto_sha1h): New expand pattern.
(crypto_sha1c): Likewise.
(crypto_sha1m): Likewise.
(crypto_sha1p): Likewise.
(crypto_sha1h_lb): Modify.
(crypto_sha1c_lb): Likewise.
(crypto_sha1m_lb): Likewise.
(crypto_sha1p_lb): Likewise.
diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 
f646ab537fcdac54a3eaf0f1fa403698e29ef005..4702a4078d1f9fd766a5efccbfdc58e2b927133c
 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -1993,25 +1993,12 @@ arm_expand_ternop_builtin (enum insn_code icode,
   rtx op0 = expand_normal (arg0);
   rtx op1 = expand_normal (arg1);
   rtx op2 = expand_normal (arg2);
-  rtx op3 = NULL_RTX;
 
-  /* The sha1c, sha1p, sha1m crypto builtins require a different vec_select
- lane operand depending on endianness.  */
-  bool builtin_sha1cpm_p = false;
-
-  if (insn_data[icode].n_operands == 5)
-{
-  gcc_assert (icode == CODE_FOR_crypto_sha1c
-  || icode == CODE_FOR_crypto_sha1p
-  || icode == CODE_FOR_crypto_sha1m);
-  builtin_sha1cpm_p = true;
-}
   machine_mode tmode = insn_data[icode].operand[0].mode;
   machine_mode mode0 = insn_data[icode].operand[1].mode;
   machine_mode mode1 = insn_data[icode].operand[2].mode;
   machine_mode mode2 = insn_data[icode].operand[3].mode;
 
-
   if (VECTOR_MODE_P (mode0))
 op0 = safe_vector_operand (op0, mode0);
   if (VECTOR_MODE_P (mode1))
@@ -2034,13 +2021,8 @@ arm_expand_ternop_builtin (enum insn_code icode,
 op1 = copy_to_mode_reg (mode1, op1);
   if (! (*insn_data[icode].operand[3].predicate) (op2, mode2))
 op2 = copy_to_mode_reg (mode2, op2);
-  if (builtin_sha1cpm_p)
-op3 = GEN_INT (TARGET_BIG_END ? 1 : 0);
 
-  if (builtin_sha1cpm_p)
-pat = GEN_FCN (icode) (target, op0, op1, op2, op3);
-  else
-pat = GEN_FCN (icode) (target, op0, op1, op2);
+  pat = GEN_FCN (icode) (target, op0, op1, op2);
   if (! pat)
 return 0;
   emit_insn (pat);
@@ -2096,16 +2078,8 @@ arm_expand_unop_builtin (enum insn_code icode,
   rtx pat;
   tree arg0 = CALL_EXPR_ARG (exp, 0);
   rtx op0 = expand_normal (arg0);
-  rtx op1 = NULL_RTX;
   machine_mode tmode = insn_data[icode].operand[0].mode;
   machine_mode mode0 = insn_data[icode].operand[1].mode;
-  bool builtin_sha1h_p = false;
-
-  if (insn_data[icode].n_operands == 3)
-{
-  gcc_assert (icode == CODE_FOR_crypto_sha1h);
-  builtin_sha1h_p = true;
-}
 
   if (! target
   || GET_MODE (target) != tmode
@@ -2121,13 +2095,9 @@ arm_expand_unop_builtin (enum insn_code icode,
   if (! (*insn_data[icode].operand[1].predicate) (op0, mode0))
op0 = copy_to_mode_reg (mode0, op0);
 }
-  if (builtin_sha1h_p)
-op1 = GEN_INT (TARGET_BIG_END ? 1 : 0);
 
-  if (builtin_sha1h_p)
-pat = GEN_FCN (icode) (target, op0, op1);
-  else
-pat = GEN_FCN (icode) (target, op0);
+  pat = GEN_FCN (icode) (target, op0);
+
   if (! pat)
 return 0;
   emit_insn (pat);
diff --git a/gcc/config/arm/crypto.md b/gcc/config/arm/crypto.md
index 
d1ae76800d94a5a9e06e109dc8dc0328166dcfdc..fc43a7862f9886f4249235f5836006c51fce7340
 100644
--- a/gcc/config/arm/crypto.md
+++ b/gcc/config/arm/crypto.md
@@ -66,13 +66,23 @@
of the V4SI, adjusted for endianness. Required due to neon_vget_lane and
neon_set_lane that change the element ordering in memory for big-endian.  */
 
-(define_insn "crypto_sha1h"
+(define_expand "crypto_sha1h"
+  [(set (match_operand:V4SI 0 "register_operand")
+   (match_operand:V4SI 1 "register_operand"))]
+  "TARGET_CRYPTO"
+{
+  rtx op2 = GEN_INT (NEON_ENDIAN_LANE_N (V2SImode, 0));
+  emit_insn (gen_crypto_sha1h_lb (operands[0], operands[1], op2));
+  DONE;
+})
+
+(define_insn "crypto_sha1h_lb"
   [(set (match_operand:V4SI 0 "register_operand" "=w")
-  (unspec:V4SI
- [(vec_select:SI
-   (match_operand:V4SI 1 "register_operand" "w")
-   (parallel [(match_operand:SI 2 "immediate_operand" "i")]))]
-  UNSPEC_SHA1H))]
+   (unspec:V4SI
+ [(vec_select:SI
+  (match_operand:V4SI 1 "register_operand" "w")
+  (parallel [(match_operand:SI 2 "immediate_operand" "i")]))]
+   UNSPEC_SHA1H))]
  

[patch1/2][arm][PR90317]: fix sha1 patterns

2019-07-03 Thread Sylvia Taylor
Greetings,

This patch fixes:

1) An ICE thrown when using the crypto_sha1h intrinsic, due to an
incompatible mode used for zero_extend. The zero_extend has been removed,
as it is not a good choice for vector modes, and using an equivalent
single-integer mode like TI (128 bits) instead of V4SI produces extra
instructions, making it inefficient.

This affects gcc version 8 and above.

2) Incorrect combine optimizations due to the vec_select usage
in the sha1 patterns on arm. The patterns should only combine
a vec_select into a sha1h instruction when the lane is 0.

This affects gcc version 5 and above.

- Fixed by explicitly specifying the valid const_int for such
combines. When the lane is not 0, the vector lane selection
now happens in a separate instruction (e.g. a vmov) prior
to the sha1h.

- Updated the sha1h testcases on arm to check for additional
cases with custom vector lane selection.

The intrinsic functions for the sha1 patterns have also been
simplified, which seems to eliminate extra vmovs like:
- vmov.i32 q8, #0.

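To illustrate point 2) (a sketch only; sha1h_lane2 is a made-up name, not
one of the updated testcases): when the hash word comes from a non-zero
lane, the lane extraction must now be done before the sha1h rather than
being folded into it.

#include <arm_neon.h>

/* Sketch: with the fix, the vec_select is only combined into sha1h for
   lane 0; for lane 2 the extraction happens separately (e.g. via vmov)
   before the sha1h instruction.  */
uint32_t
sha1h_lane2 (uint32x4_t v)
{
  return vsha1h_u32 (vgetq_lane_u32 (v, 2));
}
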
Bootstrapped and tested on arm-none-linux-gnueabihf.

Cheers,
Syl

gcc/ChangeLog:

2019-07-03  Sylvia Taylor  

PR target/90317
* config/arm/arm_neon.h
(vsha1h_u32): Refactor.
(vsha1cq_u32): Likewise.
(vsha1pq_u32): Likewise.
(vsha1mq_u32): Likewise.
* config/arm/crypto.md:
(crypto_sha1h): Remove zero extend, correct vec select.
(crypto_sha1c): Correct vec select.
(crypto_sha1m): Likewise.
(crypto_sha1p): Likewise.

gcc/testsuite/ChangeLog:

2019-07-03  Sylvia Taylor  

PR target/90317
* gcc.target/arm/crypto-vsha1cq_u32.c (foo): Change.
(GET_LANE, TEST_SHA1C_VEC_SELECT): New.
* gcc.target/arm/crypto-vsha1h_u32.c (foo): Change.
(GET_LANE, TEST_SHA1H_VEC_SELECT): New.
* gcc.target/arm/crypto-vsha1mq_u32.c (foo): Change.
(GET_LANE, TEST_SHA1M_VEC_SELECT): New.
* gcc.target/arm/crypto-vsha1pq_u32.c (foo): Change.
(GET_LANE, TEST_SHA1P_VEC_SELECT): New.
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 
6b982392ece69bb245ffd3bdc34d09c6f01745eb..1f200d491d1de3993bc3a682d586da137958ff6b
 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -16938,37 +16938,32 @@ __extension__ extern __inline uint32_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vsha1h_u32 (uint32_t __hash_e)
 {
-  uint32x4_t __t = vdupq_n_u32 (0);
-  __t = vsetq_lane_u32 (__hash_e, __t, 0);
-  __t = __builtin_arm_crypto_sha1h (__t);
-  return vgetq_lane_u32 (__t, 0);
+  return vgetq_lane_u32 (__builtin_arm_crypto_sha1h (vdupq_n_u32 (__hash_e)),
+0);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
 {
-  uint32x4_t __t = vdupq_n_u32 (0);
-  __t = vsetq_lane_u32 (__hash_e, __t, 0);
-  return __builtin_arm_crypto_sha1c (__hash_abcd, __t, __wk);
+  return __builtin_arm_crypto_sha1c (__hash_abcd, vdupq_n_u32 (__hash_e),
+__wk);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
 {
-  uint32x4_t __t = vdupq_n_u32 (0);
-  __t = vsetq_lane_u32 (__hash_e, __t, 0);
-  return __builtin_arm_crypto_sha1p (__hash_abcd, __t, __wk);
+  return __builtin_arm_crypto_sha1p (__hash_abcd, vdupq_n_u32 (__hash_e),
+__wk);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
 {
-  uint32x4_t __t = vdupq_n_u32 (0);
-  __t = vsetq_lane_u32 (__hash_e, __t, 0);
-  return __builtin_arm_crypto_sha1m (__hash_abcd, __t, __wk);
+  return __builtin_arm_crypto_sha1m (__hash_abcd,  vdupq_n_u32 (__hash_e),
+__wk);
 }
 
 __extension__ extern __inline uint32x4_t
diff --git a/gcc/config/arm/crypto.md b/gcc/config/arm/crypto.md
index 
63d9d9ffa424fa51b05ebee5138b2c7c0f304745..30ab1dbeb1205129c532a1a7f1763cf140440595
 100644
--- a/gcc/config/arm/crypto.md
+++ b/gcc/config/arm/crypto.md
@@ -62,14 +62,18 @@
   [(set_attr "type" "")]
 )
 
+/* The vec_select operation always selects index 0 from the lower V2SI subreg
+   of the V4SI, adjusted for endianness. Required due to neon_vget_lane and
+   neon_set_lane that change the element ordering in memory for big-endian.  */
+
 (define_insn "crypto_sha1h"
   [(set (match_operand:V4SI 0 "register_operand" "=w")
-(zero_extend:V4SI
-  (unspec:SI [(vec_select:SI
-(match_operand:V4SI 1 "register_operand" "w")
-(parallel [(match_operand:SI 2 "

[patch][aarch64]: fix frame pointer setup before tlsdesc call

2019-06-25 Thread Sylvia Taylor
Greetings,

This patch fixes a bug with TLS in which the frame pointer is not
established until after the tlsdesc call, thus not conforming to
the aarch64 procedure call standard.

Changed the tlsdesc instruction patterns to set a dependency on the
x29 frame pointer. This helps the instruction scheduler to arrange
the tlsdesc call after the frame pointer is set.

Example of frame pointer (x29) set incorrectly after tlsdesc call:

stp x29, x30, [sp, -16]!
adrp x0, :tlsdesc:.LANCHOR0
ldr x2, [x0, #:tlsdesc_lo12:.LANCHOR0]
add x0, x0, :tlsdesc_lo12:.LANCHOR0
.tlsdesccall .LANCHOR0
blr x2
...
mov x29, sp
...

After introducing dependency on x29, the scheduler does the frame
pointer setup before tlsdesc:

stp x29, x30, [sp, -16]!
mov x29, sp
adrp x0, :tlsdesc:.LANCHOR0
ldr x2, [x0, #:tlsdesc_lo12:.LANCHOR0]
add x0, x0, :tlsdesc_lo12:.LANCHOR0
.tlsdesccall .LANCHOR0
blr x2
...

Testcase used with -O2 -fpic:

void foo()
{
  static __thread int x = 0;
  bar (&x);
}

I am not sure what would classify as an effective check for this
testcase. The only idea I received so far would be to write a regexp
inside a scan-assembler-not that would potentially look for this pattern:


.tlsdesccall 
blr 

[mov x29, sp] OR [add x29, sp, 0]


(similar to what was attempted in gcc/testsuite/gcc.target/arm/pr85434.c)
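
As a rough sketch of that idea (untested; the regexp is only meant to show
the shape of the check and would certainly need refining):

/* { dg-do compile } */
/* { dg-options "-O2 -fpic" } */

void bar (int *);

void
foo (void)
{
  static __thread int x = 0;
  bar (&x);
}

/* Sketch: fail if a frame-pointer setup appears anywhere after the
   tlsdesc call sequence.  */
/* { dg-final { scan-assembler-not {\.tlsdesccall[^\n]*\n\tblr[^\n]*\n(.*\n)*\t(mov\tx29, sp|add\tx29, sp, 0)} } } */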

I would like maintainers' input on whether such a testcase should be added
and if there are better ways of checking for the instruction order.

Bootstrapped and tested on aarch64-none-linux-gnu.

Ok for trunk? If yes, I don't have any commit rights, so can someone please
commit it on my behalf.

Cheers,
Syl

gcc/ChangeLog:

2019-06-25  Sylvia Taylor  

* config/aarch64/aarch64.md
(tlsdesc_small_advsimd_): Update.
(tlsdesc_small_sve_): Likewise.
(FP_REGNUM): New.
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 
ff83974aeb0b1bf46415c29ba47ada74a79d7586..099cad54336ccaf2b658fbe9fd7a4a84b3abc6e0
 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -120,6 +120,7 @@
 ;; Scratch registers used in frame layout.
 (IP0_REGNUM 16)
 (IP1_REGNUM 17)
+(FP_REGNUM 29)
 (LR_REGNUM  30)
   ]
 )
@@ -6617,7 +6618,8 @@
UNSPEC_TLSDESC))
(clobber (reg:DI LR_REGNUM))
(clobber (reg:CC CC_REGNUM))
-   (clobber (match_scratch:DI 1 "=r"))]
+   (clobber (match_scratch:DI 1 "=r"))
+   (use (reg:DI FP_REGNUM))]
   "TARGET_TLS_DESC && !TARGET_SVE"
   "adrp\\tx0, %A0\;ldr\\t%1, [x0, #%L0]\;add\\t0, 0, 
%L0\;.tlsdesccall\\t%0\;blr\\t%1"
   [(set_attr "type" "call")
@@ -6680,7 +6682,8 @@
(clobber (reg:VNx2BI P13_REGNUM))
(clobber (reg:VNx2BI P14_REGNUM))
(clobber (reg:VNx2BI P15_REGNUM))
-   (clobber (match_scratch:DI 1 "=r"))]
+   (clobber (match_scratch:DI 1 "=r"))
+   (use (reg:DI FP_REGNUM))]
   "TARGET_TLS_DESC && TARGET_SVE"
   "adrp\\tx0, %A0\;ldr\\t%1, [x0, #%L0]\;add\\t0, 0, 
%L0\;.tlsdesccall\\t%0\;blr\\t%1"
   [(set_attr "type" "call")


RE: [patch][aarch64]: fix unrecognizable insn for ldr got in ilp32 tiny

2019-06-18 Thread Sylvia Taylor
Hi Wilco,

Combined them into one pattern. Updated the diff; the changelog is now:

gcc/ChangeLog:

2019-06-18  Sylvia Taylor  

* config/aarch64/aarch64.c
(aarch64_load_symref_appropriately): Change SYMBOL_TINY_GOT.
* config/aarch64/aarch64.md
(ldr_got_tiny_): New pattern.
(ldr_got_tiny_sidi): New pattern.

Cheers,
Syl

-Original Message-
From: Wilco Dijkstra  
Sent: 13 June 2019 18:42
To: Sylvia Taylor 
Cc: nd ; GCC Patches ; Richard Earnshaw 
; James Greenhalgh 
Subject: Re: [patch][aarch64]: fix unrecognizable insn for ldr got in ilp32 tiny

Hi Sylvia,

-(define_insn "ldr_got_tiny"
+(define_insn "ldr_got_tiny_di"
   [(set (match_operand:DI 0 "register_operand" "=r")
-   (unspec:DI [(match_operand:DI 1 "aarch64_valid_symref" "S")]
-  UNSPEC_GOTTINYPIC))]
+   (unspec:DI
+ [(match_operand:DI 1 "aarch64_valid_symref" "S")]
+   UNSPEC_GOTTINYPIC))]
   ""
   "ldr\\t%0, %L1"
   [(set_attr "type" "load_8")]
 )
 
+(define_insn "ldr_got_tiny_si"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+   (unspec:SI
+ [(match_operand:SI 1 "aarch64_valid_symref" "S")]
+   UNSPEC_GOTTINYPIC))]
+  "TARGET_ILP32"
+  "ldr\\t%0, %L1"
+  [(set_attr "type" "load_4")]
+)

These can be easily combined like the related ldr_got_small_.

Wilco

-Original Message-
From: Sylvia Taylor 
Sent: 11 June 2019 14:25
To: Richard Earnshaw ; James Greenhalgh 
; Marcus Shawcroft ; 
gcc-patches@gcc.gnu.org
Cc: nd 
Subject: [patch][aarch64]: fix unrecognizable insn for ldr got in ilp32 tiny

Greetings,

This patch addresses a bug in ldr GOT for mcmodel=tiny in which this 
instruction is not generated for ilp32 modes.

Defined 2 new patterns for ldr_got_tiny. Added additional checks to use the 
appropriate rtl pattern for any of the modes.

Examples of previously unrecognized instructions:
ldr x1, :got:_ZTIi    // [c=4 l=4]  ldr_got_tiny_si
ldr x0, :got:global   // [c=4 l=4]  ldr_got_tiny_sidi

Bootstrapped and tested on aarch64-none-linux-gnu.
Bug fix tested with aarch64-none-elf-g++ -mabi=ilp32 -mcmodel=tiny -fpic.

The existing test now fixed is: testsuite/g++.dg/torture/stackalign/throw-1.C
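
For illustration (a sketch, not the existing test; global and
addr_of_global are made-up names), any GOT-indirected access under these
options triggers the ldr :got: sequence that previously hit the
unrecognizable insn:

/* Sketch: compile with -mabi=ilp32 -mcmodel=tiny -fpic.  Taking the
   address of an external global forces a GOT load (ldr ..., :got:global),
   which previously was not matched by any pattern.  */
extern int global;

int *
addr_of_global (void)
{
  return &global;
}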

Ok for trunk? If yes, I don't have any commit rights, so can someone please 
commit it on my behalf.

Cheers,
Syl

gcc/ChangeLog:

2019-06-11  Sylvia Taylor  

* config/aarch64/aarch64.c
(aarch64_load_symref_appropriately): Change SYMBOL_TINY_GOT.
* config/aarch64/aarch64.md
(ldr_got_tiny): Change to ldr_got_tiny_di.
(ldr_got_tiny_si): New pattern.
(ldr_got_tiny_sidi): New pattern.
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
b38505b0872688634b2d3f625ab8d313e89cfca0..26a8f91b4af53eb2301f27f82a164174c6ef7774
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2251,8 +2251,26 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
   }
 
 case SYMBOL_TINY_GOT:
-  emit_insn (gen_ldr_got_tiny (dest, imm));
-  return;
+  {
+   machine_mode mode = GET_MODE (dest);
+
+   if (mode == ptr_mode)
+ {
+   if (mode == DImode)
+ emit_insn (gen_ldr_got_tiny_di (dest, imm));
+   else
+ /* TARGET_ILP32.  */
+ emit_insn (gen_ldr_got_tiny_si (dest, imm));
+ }
+   else
+ {
+   /* TARGET_ILP32.  */
+   gcc_assert (mode == Pmode);
+   emit_insn (gen_ldr_got_tiny_sidi (dest, imm));
+ }
+
+   return;
+  }
 
 case SYMBOL_TINY_TLSIE:
   {
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 
ff83974aeb0b1bf46415c29ba47ada74a79d7586..34a1c52777ed2533dc7f08491f5852138c0e1d00
 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -6469,13 +6469,25 @@
   [(set_attr "type" "load_4")]
 )
 
-(define_insn "ldr_got_tiny"
-  [(set (match_operand:DI 0 "register_operand" "=r")
-   (unspec:DI [(match_operand:DI 1 "aarch64_valid_symref" "S")]
-  UNSPEC_GOTTINYPIC))]
+(define_insn "ldr_got_tiny_"
+  [(set (match_operand:PTR 0 "register_operand" "=r")
+   (unspec:PTR
+ [(match_operand:PTR 1 "aarch64_valid_symref" "S")]
+   UNSPEC_GOTTINYPIC))]
   ""
   "ldr\\t%0, %L1"
-  [(set_attr "type" "load_8")]
+  [(set_attr "type" "load_")]
+)
+
+(define_insn "ldr_got_tiny_sidi"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (zero_extend:DI
+ (unspec:SI
+   [(match_operand:DI 1 "aarch64_valid_symref" "S")]
+ UNSPEC_GOTTINYPIC)))]
+  "TARGET_ILP32"
+  "ldr\\t%0, %L1"
+  [(set_attr "type" "load_4")]
 )
 
 (define_insn "aarch64_load_tp_hard"


RE: [patch][aarch64]: add usra and ssra combine patterns

2019-06-17 Thread Sylvia Taylor
Updating patch with missing scan-assembler checks.

Cheers,
Syl

-Original Message-
From: Sylvia Taylor 
Sent: 04 June 2019 12:24
To: James Greenhalgh 
Cc: Richard Earnshaw ; Marcus Shawcroft 
; gcc-patches@gcc.gnu.org; nd 
Subject: RE: [patch][aarch64]: add usra and ssra combine patterns

Hi James,

I've managed to remove the odd redundant git diff change.

Regarding aarch64_sra_n, this patch shouldn't affect it.

I am also not aware of any way of enabling this combine inside the pattern used 
for those intrinsics, so I kept them separate.

Cheers,
Syl

-Original Message-
From: James Greenhalgh 
Sent: 03 June 2019 11:20
To: Sylvia Taylor 
Cc: Richard Earnshaw ; Marcus Shawcroft 
; gcc-patches@gcc.gnu.org; nd 
Subject: Re: [patch][aarch64]: add usra and ssra combine patterns

On Thu, May 30, 2019 at 03:25:19PM +0100, Sylvia Taylor wrote:
> Greetings,
> 
> This patch adds support to combine:
> 
> 1) ushr and add into usra, example:
> 
> ushr  v0.16b, v0.16b, 2
> add   v0.16b, v0.16b, v2.16b
> ---
> usra  v2.16b, v0.16b, 2
> 
> 2) sshr and add into ssra, example:
> 
> sshr  v1.16b, v1.16b, 2
> add   v1.16b, v1.16b, v3.16b
> ---
> ssra  v3.16b, v1.16b, 2
> 
> Bootstrapped and tested on aarch64-none-linux-gnu.
> 
> Ok for trunk? If yes, I don't have any commit rights, so can someone 
> please commit it on my behalf.

This patch has an unrelated change to
aarch64_get_lane_zero_extend Please revert that and 
resend.

What changes (if any) should we make to aarch64_sra_n based on this 
patch, and to the vsra_n intrinsics in arm_neon.h ?

Thanks,
James

> 
> Cheers,
> Syl
> 
> gcc/ChangeLog:
> 
> 2019-05-30  Sylvia Taylor  
> 
>   * config/aarch64/aarch64-simd.md
>   (*aarch64_simd_sra): New.
>   * config/aarch64/iterators.md
>   (SHIFTRT): New iterator.
>   (sra_op): New attribute.
> 
> gcc/testsuite/ChangeLog:
> 
> 2019-05-30  Sylvia Taylor  
> 
>   * gcc.target/aarch64/simd/ssra.c: New test.
>   * gcc.target/aarch64/simd/usra.c: New test.

> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index
> e3852c5d182b70978d7603225fce55c0b8ee2894..502ac5f3b45a1da059bb07701150
> a531091378ed 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3110,22 +3122,22 @@
>  operands[2] = aarch64_endian_lane_rtx (mode, INTVAL (operands[2]));
>  return "smov\\t%0, %1.[%2]";
>}
> -  [(set_attr "type" "neon_to_gp")]
> -)
> -
> -(define_insn "*aarch64_get_lane_zero_extend"
> -  [(set (match_operand:GPI 0 "register_operand" "=r")
> - (zero_extend:GPI
> -   (vec_select:
> - (match_operand:VDQQH 1 "register_operand" "w")
> - (parallel [(match_operand:SI 2 "immediate_operand" "i")]]
> -  "TARGET_SIMD"
> -  {
> -operands[2] = aarch64_endian_lane_rtx (mode,
> -INTVAL (operands[2]));
> -return "umov\\t%w0, %1.[%2]";
> -  }
> -  [(set_attr "type" "neon_to_gp")]
> +  [(set_attr "type" "neon_to_gp")]
> +)
> +
> +(define_insn "*aarch64_get_lane_zero_extend"
> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> + (zero_extend:GPI
> +   (vec_select:
> + (match_operand:VDQQH 1 "register_operand" "w")
> + (parallel [(match_operand:SI 2 "immediate_operand" "i")]]
> +  "TARGET_SIMD"
> +  {
> +operands[2] = aarch64_endian_lane_rtx (mode,
> +INTVAL (operands[2]));
> +return "umov\\t%w0, %1.[%2]";
> +  }
> +  [(set_attr "type" "neon_to_gp")]
>  )
>  
>  ;; Lane extraction of a value, neither sign nor zero extension

These changes should be dropped.


diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
eeed08e71ca0b96726cb28743ef38487a8287600..aba6af24eee1c29fe4524eb352747c94617b30c7
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -986,6 +986,18 @@
   [(set_attr "type" "neon_shift_imm")]
 )
 
+(define_insn "*aarch64_simd_sra"
+ [(set (match_operand:VDQ_I 0 "register_operand" "=w")
+   (plus:VDQ_I
+  (SHIFTRT:VDQ_I
+   (match_operand:VDQ_I 1 "register_operand" "w")
+   (match_operand:VDQ_I 2 "aarch64_simd_rshift_imm" "Dr"))
+  (match_operand:VDQ_I 3 "register_operand" "0")

[patch][aarch64]: fix unrecognizable insn for ldr got in ilp32 tiny

2019-06-11 Thread Sylvia Taylor
Greetings,

This patch addresses a bug in ldr GOT for mcmodel=tiny in
which this instruction is not generated for ilp32 modes.

Defined 2 new patterns for ldr_got_tiny. Added additional
checks to use the appropriate rtl pattern for any of the modes.

Examples of previously unrecognized instructions:
ldr x1, :got:_ZTIi    // [c=4 l=4]  ldr_got_tiny_si
ldr x0, :got:global   // [c=4 l=4]  ldr_got_tiny_sidi

Bootstrapped and tested on aarch64-none-linux-gnu.
Bug fix tested with aarch64-none-elf-g++ -mabi=ilp32 -mcmodel=tiny -fpic.

The existing test now fixed is: testsuite/g++.dg/torture/stackalign/throw-1.C

Ok for trunk? If yes, I don't have any commit rights,
so can someone please commit it on my behalf.

Cheers,
Syl

gcc/ChangeLog:

2019-06-11  Sylvia Taylor  

* config/aarch64/aarch64.c
(aarch64_load_symref_appropriately): Change SYMBOL_TINY_GOT.
* config/aarch64/aarch64.md
(ldr_got_tiny): Change to ldr_got_tiny_di.
(ldr_got_tiny_si): New pattern.
(ldr_got_tiny_sidi): New pattern.
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
b38505b0872688634b2d3f625ab8d313e89cfca0..26a8f91b4af53eb2301f27f82a164174c6ef7774
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2251,8 +2251,26 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
   }
 
 case SYMBOL_TINY_GOT:
-  emit_insn (gen_ldr_got_tiny (dest, imm));
-  return;
+  {
+   machine_mode mode = GET_MODE (dest);
+
+   if (mode == ptr_mode)
+ {
+   if (mode == DImode)
+ emit_insn (gen_ldr_got_tiny_di (dest, imm));
+   else
+ /* TARGET_ILP32.  */
+ emit_insn (gen_ldr_got_tiny_si (dest, imm));
+ }
+   else
+ {
+   /* TARGET_ILP32.  */
+   gcc_assert (mode == Pmode);
+   emit_insn (gen_ldr_got_tiny_sidi (dest, imm));
+ }
+
+   return;
+  }
 
 case SYMBOL_TINY_TLSIE:
   {
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 
ff83974aeb0b1bf46415c29ba47ada74a79d7586..d594a81ed42bf2e5af4c5db659eb43c2b33ccad7
 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -6469,15 +6469,37 @@
   [(set_attr "type" "load_4")]
 )
 
-(define_insn "ldr_got_tiny"
+(define_insn "ldr_got_tiny_di"
   [(set (match_operand:DI 0 "register_operand" "=r")
-   (unspec:DI [(match_operand:DI 1 "aarch64_valid_symref" "S")]
-  UNSPEC_GOTTINYPIC))]
+   (unspec:DI
+ [(match_operand:DI 1 "aarch64_valid_symref" "S")]
+   UNSPEC_GOTTINYPIC))]
   ""
   "ldr\\t%0, %L1"
   [(set_attr "type" "load_8")]
 )
 
+(define_insn "ldr_got_tiny_si"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+   (unspec:SI
+ [(match_operand:SI 1 "aarch64_valid_symref" "S")]
+   UNSPEC_GOTTINYPIC))]
+  "TARGET_ILP32"
+  "ldr\\t%0, %L1"
+  [(set_attr "type" "load_4")]
+)
+
+(define_insn "ldr_got_tiny_sidi"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (zero_extend:DI
+ (unspec:SI
+   [(match_operand:DI 1 "aarch64_valid_symref" "S")]
+ UNSPEC_GOTTINYPIC)))]
+  "TARGET_ILP32"
+  "ldr\\t%0, %L1"
+  [(set_attr "type" "load_4")]
+)
+
 (define_insn "aarch64_load_tp_hard"
   [(set (match_operand:DI 0 "register_operand" "=r")
(unspec:DI [(const_int 0)] UNSPEC_TLS))]


[patch][aarch64]: add intrinsics for vld1(q)_x4 and vst1(q)_x4

2019-06-10 Thread Sylvia Taylor
Greetings,

This patch adds the intrinsic functions for:
- vld1__x4
- vst1__x4
- vld1q__x4
- vst1q__x4

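For example, a short usage sketch (copy64 is a made-up name, not part of
the new testcases):

#include <arm_neon.h>

/* Sketch: copy 64 bytes with the new x4 variants; this should map to a
   single ld1 load of four registers and a single st1 store.  */
void
copy64 (uint8_t *dst, const uint8_t *src)
{
  uint8x16x4_t v = vld1q_u8_x4 (src);
  vst1q_u8_x4 (dst, v);
}
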
Bootstrapped and tested on aarch64-none-linux-gnu.

Ok for trunk? If yes, I don't have any commit rights, so can someone 
please commit it on my behalf.

Cheers,
Syl

gcc/ChangeLog:

2019-06-10  Sylvia Taylor  

* config/aarch64/aarch64-simd-builtins.def:
(ld1x4): New.
(st1x4): Likewise.
* config/aarch64/aarch64-simd.md:
(aarch64_ld1x4): New pattern.
(aarch64_st1x4): Likewise.
(aarch64_ld1_x4_): Likewise.
(aarch64_st1_x4_): Likewise.
* config/aarch64/arm_neon.h:
(vld1_s8_x4): New function.
(vld1q_s8_x4): Likewise.
(vld1_s16_x4): Likewise.
(vld1q_s16_x4): Likewise.
(vld1_s32_x4): Likewise.
(vld1q_s32_x4): Likewise.
(vld1_u8_x4): Likewise.
(vld1q_u8_x4): Likewise.
(vld1_u16_x4): Likewise.
(vld1q_u16_x4): Likewise.
(vld1_u32_x4): Likewise.
(vld1q_u32_x4): Likewise.
(vld1_f16_x4): Likewise.
(vld1q_f16_x4): Likewise.
(vld1_f32_x4): Likewise.
(vld1q_f32_x4): Likewise.
(vld1_p8_x4): Likewise.
(vld1q_p8_x4): Likewise.
(vld1_p16_x4): Likewise.
(vld1q_p16_x4): Likewise.
(vld1_s64_x4): Likewise.
(vld1_u64_x4): Likewise.
(vld1_p64_x4): Likewise.
(vld1q_s64_x4): Likewise.
(vld1q_u64_x4): Likewise.
(vld1q_p64_x4): Likewise.
(vld1_f64_x4): Likewise.
(vld1q_f64_x4): Likewise.
(vst1_s8_x4): Likewise.
(vst1q_s8_x4): Likewise.
(vst1_s16_x4): Likewise.
(vst1q_s16_x4): Likewise.
(vst1_s32_x4): Likewise.
(vst1q_s32_x4): Likewise.
(vst1_u8_x4): Likewise.
(vst1q_u8_x4): Likewise.
(vst1_u16_x4): Likewise.
(vst1q_u16_x4): Likewise.
(vst1_u32_x4): Likewise.
(vst1q_u32_x4): Likewise.
(vst1_f16_x4): Likewise.
(vst1q_f16_x4): Likewise.
(vst1_f32_x4): Likewise.
(vst1q_f32_x4): Likewise.
(vst1_p8_x4): Likewise.
(vst1q_p8_x4): Likewise.
(vst1_p16_x4): Likewise.
(vst1q_p16_x4): Likewise.
(vst1_s64_x4): Likewise.
(vst1_u64_x4): Likewise.
(vst1_p64_x4): Likewise.
(vst1q_s64_x4): Likewise.
(vst1q_u64_x4): Likewise.
(vst1q_p64_x4): Likewise.
(vst1_f64_x4): Likewise.
(vst1q_f64_x4): Likewise.

gcc/testsuite/ChangeLog:

2019-06-10  Sylvia Taylor  

* gcc.target/aarch64/advsimd-intrinsics/vld1x4.c: New test.
* gcc.target/aarch64/advsimd-intrinsics/vst1x4.c: New test.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def 
b/gcc/config/aarch64/aarch64-simd-builtins.def
index 
17bb0c4869b12ede2fc51a8f89d841ded8fac230..f6c096b9186448972f440a70d8ac396a9aeaf004
 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -465,12 +465,18 @@
   /* Implemented by aarch64_ld1x3.  */
   BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0)
 
+  /* Implemented by aarch64_ld1x4.  */
+  BUILTIN_VALLDIF (LOADSTRUCT, ld1x4, 0)
+
   /* Implemented by aarch64_st1x2.  */
   BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0)
 
   /* Implemented by aarch64_st1x3.  */
   BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0)
 
+  /* Implemented by aarch64_st1x4.  */
+  BUILTIN_VALLDIF (STORESTRUCT, st1x4, 0)
+
   /* Implemented by fma4.  */
   BUILTIN_VHSDF (TERNOP, fma, 4)
   VAR1 (TERNOP, fma, 4, hf)
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
eeed08e71ca0b96726cb28743ef38487a8287600..f62d4df97b433214c0211dcc0877ec6424925d14
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5271,6 +5271,28 @@
   [(set_attr "type" "neon_load1_3reg")]
 )
 
+(define_expand "aarch64_ld1x4"
+  [(match_operand:XI 0 "register_operand" "=w")
+   (match_operand:DI 1 "register_operand" "r")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (XImode, operands[1]);
+  emit_insn (gen_aarch64_ld1_x4_ (operands[0], mem));
+  DONE;
+})
+
+(define_insn "aarch64_ld1_x4_"
+  [(set (match_operand:XI 0 "register_operand" "=w")
+   (unspec:XI
+ [(match_operand:XI 1 "aarch64_simd_struct_operand" "Utv")
+  (unspec:VALLDIF [(const_int 4)] UNSPEC_VSTRUCTDUMMY)]
+   UNSPEC_LD1))]
+  "TARGET_SIMD"
+  "ld1\\t{%S0. - %V0.}, %1"
+  [(set_attr "type" "neon_load1_4reg")]
+)
+
 (define_expand "aarch64_st1x2"
   [(match_operand:DI 0 "register_operand" "")
(match_operand:OI 1 "register_operand" "")
@@ -5313,6 +5335,28 @@
   [(set_attr "type" "neon_store1_3reg"

RE: [patch][aarch64]: add usra and ssra combine patterns

2019-06-04 Thread Sylvia Taylor
Hi James,

I've managed to remove the odd redundant git diff change.

Regarding aarch64_sra_n, this patch shouldn't affect it.

I am also not aware of any way of enabling this combine inside the pattern used 
for those intrinsics, so I kept them separate.

Cheers,
Syl

-Original Message-
From: James Greenhalgh  
Sent: 03 June 2019 11:20
To: Sylvia Taylor 
Cc: Richard Earnshaw ; Marcus Shawcroft 
; gcc-patches@gcc.gnu.org; nd 
Subject: Re: [patch][aarch64]: add usra and ssra combine patterns

On Thu, May 30, 2019 at 03:25:19PM +0100, Sylvia Taylor wrote:
> Greetings,
> 
> This patch adds support to combine:
> 
> 1) ushr and add into usra, example:
> 
> ushr  v0.16b, v0.16b, 2
> add   v0.16b, v0.16b, v2.16b
> ---
> usra  v2.16b, v0.16b, 2
> 
> 2) sshr and add into ssra, example:
> 
> sshr  v1.16b, v1.16b, 2
> add   v1.16b, v1.16b, v3.16b
> ---
> ssra  v3.16b, v1.16b, 2
> 
> Bootstrapped and tested on aarch64-none-linux-gnu.
> 
> Ok for trunk? If yes, I don't have any commit rights, so can someone 
> please commit it on my behalf.

This patch has an unrelated change to
aarch64_get_lane_zero_extend Please revert that and 
resend.

What changes (if any) should we make to aarch64_sra_n based on this 
patch, and to the vsra_n intrinsics in arm_neon.h ?

Thanks,
James

> 
> Cheers,
> Syl
> 
> gcc/ChangeLog:
> 
> 2019-05-30  Sylvia Taylor  
> 
>   * config/aarch64/aarch64-simd.md
>   (*aarch64_simd_sra): New.
>   * config/aarch64/iterators.md
>   (SHIFTRT): New iterator.
>   (sra_op): New attribute.
> 
> gcc/testsuite/ChangeLog:
> 
> 2019-05-30  Sylvia Taylor  
> 
>   * gcc.target/aarch64/simd/ssra.c: New test.
>   * gcc.target/aarch64/simd/usra.c: New test.

> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index 
> e3852c5d182b70978d7603225fce55c0b8ee2894..502ac5f3b45a1da059bb07701150
> a531091378ed 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3110,22 +3122,22 @@
>  operands[2] = aarch64_endian_lane_rtx (mode, INTVAL (operands[2]));
>  return "smov\\t%0, %1.[%2]";
>}
> -  [(set_attr "type" "neon_to_gp")]
> -)
> -
> -(define_insn "*aarch64_get_lane_zero_extend"
> -  [(set (match_operand:GPI 0 "register_operand" "=r")
> - (zero_extend:GPI
> -   (vec_select:
> - (match_operand:VDQQH 1 "register_operand" "w")
> - (parallel [(match_operand:SI 2 "immediate_operand" "i")]]
> -  "TARGET_SIMD"
> -  {
> -operands[2] = aarch64_endian_lane_rtx (mode,
> -INTVAL (operands[2]));
> -return "umov\\t%w0, %1.[%2]";
> -  }
> -  [(set_attr "type" "neon_to_gp")]
> +  [(set_attr "type" "neon_to_gp")]
> +)
> +
> +(define_insn "*aarch64_get_lane_zero_extend"
> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> + (zero_extend:GPI
> +   (vec_select:
> + (match_operand:VDQQH 1 "register_operand" "w")
> + (parallel [(match_operand:SI 2 "immediate_operand" "i")]]
> +  "TARGET_SIMD"
> +  {
> +operands[2] = aarch64_endian_lane_rtx (mode,
> +INTVAL (operands[2]));
> +return "umov\\t%w0, %1.[%2]";
> +  }
> +  [(set_attr "type" "neon_to_gp")]
>  )
>  
>  ;; Lane extraction of a value, neither sign nor zero extension

These changes should be dropped.


diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
eeed08e71ca0b96726cb28743ef38487a8287600..aba6af24eee1c29fe4524eb352747c94617b30c7
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -986,6 +986,18 @@
   [(set_attr "type" "neon_shift_imm")]
 )
 
+(define_insn "*aarch64_simd_sra"
+ [(set (match_operand:VDQ_I 0 "register_operand" "=w")
+   (plus:VDQ_I
+  (SHIFTRT:VDQ_I
+   (match_operand:VDQ_I 1 "register_operand" "w")
+   (match_operand:VDQ_I 2 "aarch64_simd_rshift_imm" "Dr"))
+  (match_operand:VDQ_I 3 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "sra\t%0., %1., %2"
+  [(set_attr "type" "neon_shift_acc")]
+)
+
 (define_insn "aarch64_simd_imm_shl"
  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
(ashift:VDQ_I (match_operand:VDQ_I 1 "regis

[patch][aarch64]: add usra and ssra combine patterns

2019-05-30 Thread Sylvia Taylor
Greetings,

This patch adds support to combine:

1) ushr and add into usra, example:

ushr    v0.16b, v0.16b, 2
add     v0.16b, v0.16b, v2.16b
---
usra    v2.16b, v0.16b, 2

2) sshr and add into ssra, example:

sshr    v1.16b, v1.16b, 2
add     v1.16b, v1.16b, v3.16b
---
ssra    v3.16b, v1.16b, 2

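For illustration, a scalar sketch of the kind of code that should now
combine (accumulate_shifted is a made-up name, distinct from the new
ssra.c/usra.c tests):

#include <stdint.h>

/* Sketch: with -O2 and auto-vectorization, the logical right shift plus
   accumulate should map to the ushr/add pair above, which the new
   pattern combines into usra; a signed element type would give the
   sshr/ssra form instead.  */
void
accumulate_shifted (uint8_t *restrict acc, const uint8_t *restrict a, int n)
{
  for (int i = 0; i < n; i++)
    acc[i] += a[i] >> 2;
}
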
Bootstrapped and tested on aarch64-none-linux-gnu.

Ok for trunk? If yes, I don't have any commit rights,
so can someone please commit it on my behalf.

Cheers,
Syl

gcc/ChangeLog:

2019-05-30  Sylvia Taylor  

* config/aarch64/aarch64-simd.md
(*aarch64_simd_sra): New.
* config/aarch64/iterators.md
(SHIFTRT): New iterator.
(sra_op): New attribute.

gcc/testsuite/ChangeLog:

2019-05-30  Sylvia Taylor  

* gcc.target/aarch64/simd/ssra.c: New test.
* gcc.target/aarch64/simd/usra.c: New test.
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
e3852c5d182b70978d7603225fce55c0b8ee2894..502ac5f3b45a1da059bb07701150a531091378ed
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -953,6 +953,18 @@
   [(set_attr "type" "neon_shift_imm")]
 )
 
+(define_insn "*aarch64_simd_sra"
+ [(set (match_operand:VDQ_I 0 "register_operand" "=w")
+   (plus:VDQ_I
+  (SHIFTRT:VDQ_I
+   (match_operand:VDQ_I 1 "register_operand" "w")
+   (match_operand:VDQ_I 2 "aarch64_simd_rshift_imm" "Dr"))
+  (match_operand:VDQ_I 3 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "sra\t%0., %1., %2"
+  [(set_attr "type" "neon_shift_acc")]
+)
+
 (define_insn "aarch64_simd_imm_shl"
  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
(ashift:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
@@ -3110,22 +3122,22 @@
 operands[2] = aarch64_endian_lane_rtx (mode, INTVAL (operands[2]));
 return "smov\\t%0, %1.[%2]";
   }
-  [(set_attr "type" "neon_to_gp")]
-)
-
-(define_insn "*aarch64_get_lane_zero_extend"
-  [(set (match_operand:GPI 0 "register_operand" "=r")
-   (zero_extend:GPI
- (vec_select:
-   (match_operand:VDQQH 1 "register_operand" "w")
-   (parallel [(match_operand:SI 2 "immediate_operand" "i")]]
-  "TARGET_SIMD"
-  {
-operands[2] = aarch64_endian_lane_rtx (mode,
-  INTVAL (operands[2]));
-return "umov\\t%w0, %1.[%2]";
-  }
-  [(set_attr "type" "neon_to_gp")]
+  [(set_attr "type" "neon_to_gp")]
+)
+
+(define_insn "*aarch64_get_lane_zero_extend"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+   (zero_extend:GPI
+ (vec_select:
+   (match_operand:VDQQH 1 "register_operand" "w")
+   (parallel [(match_operand:SI 2 "immediate_operand" "i")]]
+  "TARGET_SIMD"
+  {
+operands[2] = aarch64_endian_lane_rtx (mode,
+  INTVAL (operands[2]));
+return "umov\\t%w0, %1.[%2]";
+  }
+  [(set_attr "type" "neon_to_gp")]
 )
 
 ;; Lane extraction of a value, neither sign nor zero extension
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 
6caeeac80867edda29b5438efdcee475ed609ff6..6273b7be5932aef695d12e9f723a43cb6c50abe8
 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1160,6 +1160,8 @@
 ;; This code iterator allows the shifts supported in arithmetic instructions
 (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
 
+(define_code_iterator SHIFTRT [ashiftrt lshiftrt])
+
 ;; Code iterator for logical operations
 (define_code_iterator LOGICAL [and ior xor])
 
@@ -1342,6 +1344,9 @@
 (define_code_attr shift [(ashift "lsl") (ashiftrt "asr")
 (lshiftrt "lsr") (rotatert "ror")])
 
+;; Op prefix for shift right and accumulate.
+(define_code_attr sra_op [(ashiftrt "s") (lshiftrt "u")])
+
 ;; Map shift operators onto underlying bit-field instructions
 (define_code_attr bfshift [(ashift "ubfiz") (ashiftrt "sbfx")
   (lshiftrt "ubfx") (rotatert "extr")])
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ssra.c 
b/gcc/testsuite/gcc.target/aarch64/simd/ssra.c
new file mode 100644
index 
..e9c2e04c0b88ac18be81f4ee8a872e6829af9db2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/ssra.c
@@ -0,0 +1,36 @@
+/* { dg-do compile { target aarch64*-*-* } } */
+/* { dg-options "-

[patch][aarch64]: add support for fabd in sve

2019-05-30 Thread Sylvia Taylor
Greetings,

This patch adds support in SVE to combine:
- fsub and fabs into fabd

fsub    z0.s, z0.s, z1.s
fabs    z0.s, p1/m, z0.s
---
fabd    z0.s, p1/m, z0.s, z1.s

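A scalar sketch of the same thing (fabd_f32 is a made-up name; the new
fabd.c testcase exercises this through a FABD macro over several FP
types):

/* Sketch: with -O3 and SVE enabled (e.g. -march=armv8.2-a+sve), the
   fsub/fabs pair in this loop should now be combined into a predicated
   fabd.  */
void
fabd_f32 (float *restrict r, const float *restrict a,
          const float *restrict b, int n)
{
  for (int i = 0; i < n; i++)
    r[i] = __builtin_fabsf (a[i] - b[i]);
}
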
Bootstrapped and tested on aarch64-none-linux-gnu.

Ok for trunk? If yes, I don't have commit rights,
so can someone please commit it on my behalf.

Cheers,
Syl

gcc/ChangeLog:

2019-05-30  Sylvia Taylor  

* config/aarch64/aarch64-sve.md
(*fabd3): New.

gcc/testsuite/ChangeLog:

2019-05-30  Sylvia Taylor  

* gcc.target/aarch64/sve/fabd.c: New.
diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 
3f39c4c5b63798515ed4c109836b036573de4aad..4c46aa55dfc174424ff47447f26c44b038d768ea
 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2528,6 +2528,19 @@
   "\t%0., %1/m, %2."
 )
 
+(define_insn "*fabd3"
+  [(set (match_operand:SVE_F 0 "register_operand" "=w")
+   (unspec:SVE_F
+ [(match_operand: 1 "register_operand" "Upl")
+  (abs:SVE_F
+   (minus:SVE_F
+   (match_operand:SVE_F 2 "register_operand" "0")
+   (match_operand:SVE_F 3 "register_operand" "w")))]
+   UNSPEC_MERGE_PTRUE))]
+  "TARGET_SVE"
+  "fabd\t%0., %1/m, %2., %3."
+)
+
 ;; Unpredicated FRINTy.
 (define_expand "2"
   [(set (match_operand:SVE_F 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fabd.c 
b/gcc/testsuite/gcc.target/aarch64/sve/fabd.c
new file mode 100644
index 
..13ad83be24ceb0d3319cb3bcfdbd6372b4d1a48e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fabd.c
@@ -0,0 +1,35 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O3 --save-temps" } */
+
+#define N 16
+
+typedef float *__restrict__ vnx4sf;
+typedef double *__restrict__ vnx2df;
+typedef _Float16 *__restrict__ vnx8hf_a;
+typedef __fp16 *__restrict__ vnx8hf_b;
+
+extern float fabsf (float);
+extern double fabs (double);
+
+#define FABD(type, abs, n) \
+   void fabd_##type (type res, type a, type b) \
+   {   \
+   int i;  \
+   for (i = 0; i < n; i++) \
+   res[i] = abs (a[i] - b[i]); \
+   }
+
+#define TEST_SVE_F_MODES(FUNC) \
+  FUNC (vnx2df, fabs, N)   \
+  FUNC (vnx4sf, fabsf, N)  \
+  FUNC (vnx8hf_a, fabsf, N)\
+  FUNC (vnx8hf_b, fabsf, N)\
+
+TEST_SVE_F_MODES (FABD)
+
+/* { dg-final { scan-assembler "fabd" } } */
+/* { dg-final { scan-assembler-not "fsub" } } */
+/* { dg-final { scan-assembler-not "fabs" } } */
+/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.d, p[0-7]/m, 
z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.s, p[0-7]/m, 
z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.h, p[0-7]/m, 
z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */