[GCC][Patch]Bug fix: cannot convert 'const short int*' to 'const __bf16*'

2020-03-11 Thread Delia Burduv
This patch fixes a bug introduced by my earlier patch ( 
https://gcc.gnu.org/pipermail/gcc-patches/2020-March/541680.html ).
It introduces a new scalar builtin type that was missing in the original 
patch.


Bootstrapped cleanly on arm-none-linux-gnueabihf.
Tested for regression on arm-none-linux-gnueabihf. No regression from 
before the original patch.
Tests that failed or became unsupported because of the original patch 
now work as they did before it.


gcc/ChangeLog:

2020-03-11  Delia Burduv  

* config/arm/arm-builtins.c
(arm_init_simd_builtin_scalar_types): New
* config/arm/arm_neon.h (vld2_bf16): Used new builtin type
(vld2q_bf16): Used new builtin type
(vld3_bf16): Used new builtin type
(vld3q_bf16): Used new builtin type
(vld4_bf16): Used new builtin type
(vld4q_bf16): Used new builtin type
(vld2_dup_bf16): Used new builtin type
(vld2q_dup_bf16): Used new builtin type
(vld3_dup_bf16): Used new builtin type
(vld3q_dup_bf16): Used new builtin type
(vld4_dup_bf16): Used new builtin type
(vld4q_dup_bf16): Used new builtin type
diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index e0561c5..1f55898 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -784,6 +784,7 @@ const char *arm_scalar_builtin_types[] = {
   "__builtin_neon_oi",
   "__builtin_neon_ci",
   "__builtin_neon_xi",
+  "__builtin_neon_bf",
   NULL
 };
 
@@ -1101,7 +1102,8 @@ arm_init_simd_builtin_scalar_types (void)
 	 "__builtin_neon_df");
   (*lang_hooks.types.register_builtin_type) (intTI_type_node,
 	 "__builtin_neon_ti");
-
+  (*lang_hooks.types.register_builtin_type) (arm_bf16_type_node,
+ "__builtin_neon_bf");
   /* Unsigned integer types for various mode sizes.  */
   (*lang_hooks.types.register_builtin_type) (unsigned_intQI_type_node,
 	 "__builtin_neon_uqi");
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index f5ccf18..aa21730 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -19562,7 +19562,7 @@ __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_bf16 (bfloat16_t const * __ptr)
 {
   union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v4bf ((const __builtin_neon_hi *) __ptr);
+  __rv.__o = __builtin_neon_vld2v4bf ((const __builtin_neon_bf *) __ptr);
   return __rv.__i;
 }
 
@@ -19571,7 +19571,7 @@ __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_bf16 (const bfloat16_t * __ptr)
 {
   union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v8bf ((const __builtin_neon_hi *) __ptr);
+  __rv.__o = __builtin_neon_vld2v8bf ((const __builtin_neon_bf *) __ptr);
   return __rv.__i;
 }
 
@@ -19580,7 +19580,7 @@ __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vld3_bf16 (const bfloat16_t * __ptr)
 {
   union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v4bf ((const __builtin_neon_hi *) __ptr);
+  __rv.__o = __builtin_neon_vld3v4bf ((const __builtin_neon_bf *) __ptr);
   return __rv.__i;
 }
 
@@ -19589,7 +19589,7 @@ __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vld3q_bf16 (const bfloat16_t * __ptr)
 {
   union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v8bf ((const __builtin_neon_hi *) __ptr);
+  __rv.__o = __builtin_neon_vld3v8bf ((const __builtin_neon_bf *) __ptr);
   return __rv.__i;
 }
 
@@ -19598,7 +19598,7 @@ __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vld4_bf16 (const bfloat16_t * __ptr)
 {
   union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v4bf ((const __builtin_neon_hi *) __ptr);
+  __rv.__o = __builtin_neon_vld4v4bf ((const __builtin_neon_bf *) __ptr);
   return __rv.__i;
 }
 
@@ -19607,7 +19607,7 @@ __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vld4q_bf16 (const bfloat16_t * __ptr)
 {
   union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v8bf ((const __builtin_neon_hi *) __ptr);
+  __rv.__o = __builtin_neon_vld4v8bf ((const __builtin_neon_bf *) __ptr);
   return __rv.__i;
 }
 
@@ -19616,7 +19616,7 @@ __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vld2_dup_bf16 (const bfloat16_t * __ptr)
 {
   union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_dupv4bf ((const __builtin_neon_hi *) __ptr);
+  __rv.__o = __builtin_neon_vld2_dupv4bf ((const __builtin_neon_bf *) __ptr);
   return __rv.__i;
 }
 
@@ -19625,7 +19625,7 @@ __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vld2q_dup_bf16 (const bfloat16_t * __p

Re: [GCC][PATCH][AArch64] ACLE intrinsics for BFCVTN, BFCVTN2 (AArch64 AdvSIMD) and BFCVT (AArch64 FP)

2020-03-05 Thread Delia Burduv

Hi,

Here is the latest version of the  patch. That test should now work.

Thanks,
Delia

On 3/5/20 11:06 AM, Richard Sandiford wrote:

Hi,

Thanks for the update and sorry for the slow reply.

When I try the patch locally I get:

FAIL: gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c   -O0  (test for 
excess errors)
FAIL: gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c   -O1  (test for 
excess errors)
FAIL: gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c   -O2  (test for 
excess errors)
FAIL: gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c   -O2 -flto 
-fno-use-linker-plugin -flto-partition=none  (test for excess errors)
FAIL: gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c   -O2 -flto 
-fuse-linker-plugin -fno-fat-lto-objects  (test for excess errors)
FAIL: gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c   -O3 -g  (test for 
excess errors)
FAIL: gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c   -Og -g  (test for 
excess errors)
FAIL: gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c   -Os  (test for 
excess errors)

I think that's because:

Delia Burduv  writes:

diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
index 
3759c0d1cb449a7f0125cc2a1433127564d66622..fa7080c2953bc3254f01d842a8afef917d469080
 100644
--- a/gcc/config/aarch64/arm_bf16.h
+++ b/gcc/config/aarch64/arm_bf16.h
@@ -27,6 +27,19 @@
  #ifndef _AARCH64_BF16_H_
  #define _AARCH64_BF16_H_
  
+#pragma GCC push_options

+#pragma GCC target ("+nothing+bf16")
+
  typedef __bf16 bfloat16_t;
+typedef float float32_t;
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvth_bf16_f32 (float32_t __a)
+{
+  return __builtin_aarch64_bfcvtbf (__a);
+}
+
+#pragma GCC pop_options


"+bf16" implicitly enables "+simd", so functions guarded with
"+nothing+bf16" are only available when "+simd" is available.
I think we want "+nothing+bf16+nosimd" instead.  (Haven't tested
that though.)

Very minor, but: it might be clearer to leave the typedefs outside
of the #pragma block.  It doesn't make any difference to the behaviour,
but it emphasises that the typedefs really are available unconditionally.

Looks ready to go otherwise.

Thanks,
Richard

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index d8bb96f8ed60648477f952ea6b88eae67cc9c921..cc0bd0e6b592528e4b1559e9a3f5b0153511dffd 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -714,3 +714,9 @@
   VAR1 (TERNOP, simd_smmla, 0, v16qi)
   VAR1 (TERNOPU, simd_ummla, 0, v16qi)
   VAR1 (TERNOP_SSUS, simd_usmmla, 0, v16qi)
+
+  /* Implemented by aarch64_bfcvtn{q}{2}  */
+  VAR1 (UNOP, bfcvtn, 0, v4bf)
+  VAR1 (UNOP, bfcvtn_q, 0, v8bf)
+  VAR1 (BINOP, bfcvtn2, 0, v8bf)
+  VAR1 (UNOP, bfcvt, 0, bf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 89aaf8c018e3340dd2d53fc2a6538d3d1220b103..035f3163223d0b618fa28beb007f2f70c7d6c060 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7207,3 +7207,32 @@
   "mmla\\t%0.4s, %2.16b, %3.16b"
   [(set_attr "type" "neon_mla_s_q")]
 )
+
+;; bfcvtn
+(define_insn "aarch64_bfcvtn"
+  [(set (match_operand:V4SF_TO_BF 0 "register_operand" "=w")
+(unspec:V4SF_TO_BF [(match_operand:V4SF 1 "register_operand" "w")]
+UNSPEC_BFCVTN))]
+  "TARGET_BF16_SIMD"
+  "bfcvtn\\t%0.4h, %1.4s"
+  [(set_attr "type" "neon_fp_cvt_narrow_s_q")]
+)
+
+(define_insn "aarch64_bfcvtn2v8bf"
+  [(set (match_operand:V8BF 0 "register_operand" "=w")
+(unspec:V8BF [(match_operand:V8BF 1 "register_operand" "0")
+  (match_operand:V4SF 2 "register_operand" "w")]
+  UNSPEC_BFCVTN2))]
+  "TARGET_BF16_SIMD"
+  "bfcvtn2\\t%0.8h, %2.4s"
+  [(set_attr "type" "neon_fp_cvt_narrow_s_q")]
+)
+
+(define_insn "aarch64_bfcvtbf"
+  [(set (match_operand:BF 0 "register_operand" "=w")
+(unspec:BF [(match_operand:SF 1 "register_operand" "w")]
+UNSPEC_BFCVT))]
+  "TARGET_BF16_FP"
+  "bfcvt\\t%h0, %s1"
+  [(set_attr "type" "f_cvt")]
+)
diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
index 3759c0d1cb449a7f0125cc2a1433127564d66622..984875dcc014300c489209c11abf41b1c47b7fbe 100644
--- a/gcc/config/aarch64/arm_bf16.h
+++ b/gcc/config/aarch64/arm_bf16.h
@@ -28,5 +28,18 @@
 #define _AARCH64_BF16_H_
 
 typedef __bf16 bfloat16_t;
+typedef float float32_t;
+
+#pragma GCC push_options
+#pragma GCC targ

Re: ACLE intrinsics: BFloat16 load intrinsics for AArch32

2020-03-05 Thread Delia Burduv

Hi,

This is the latest version of the patch. I am forcing -mfloat-abi=hard 
because the generated code differs slightly depending on the 
float-abi used.


Thanks,
Delia

On 3/4/20 5:20 PM, Kyrill Tkachov wrote:

Hi Delia,

On 3/4/20 2:05 PM, Delia Burduv wrote:

Hi,

The previous version of this patch shared part of its code with the
store intrinsics patch
(https://gcc.gnu.org/ml/gcc-patches/2020-03/msg00145.html) so I removed
any duplicated code. This patch now depends on the previously mentioned
store intrinsics patch.

Here is the latest version and the updated ChangeLog.

gcc/ChangeLog:

2020-03-04  Delia Burduv  

    * config/arm/arm_neon.h (bfloat16_t): New typedef.
 (vld2_bf16): New.
    (vld2q_bf16): New.
    (vld3_bf16): New.
    (vld3q_bf16): New.
    (vld4_bf16): New.
    (vld4q_bf16): New.
    (vld2_dup_bf16): New.
    (vld2q_dup_bf16): New.
 (vld3_dup_bf16): New.
    (vld3q_dup_bf16): New.
    (vld4_dup_bf16): New.
    (vld4q_dup_bf16): New.
 * config/arm/arm_neon_builtins.def
 (vld2): Changed to VAR13 and added v4bf, v8bf
 (vld2_dup): Changed to VAR8 and added v4bf, v8bf
 (vld3): Changed to VAR13 and added v4bf, v8bf
 (vld3_dup): Changed to VAR8 and added v4bf, v8bf
 (vld4): Changed to VAR13 and added v4bf, v8bf
 (vld4_dup): Changed to VAR8 and added v4bf, v8bf
 * config/arm/iterators.md (VDXBF): New iterator.
 (VQ2BF): New iterator.
 *config/arm/neon.md (vld2): Used new iterators.
 (vld2_dup): Used new iterators.
 (vld2_dupv8bf): New.
 (vst3): Used new iterators.
 (vst3qa): Used new iterators.
 (vst3qb): Used new iterators.
 (vld3_dup): Used new iterators.
 (vld3_dupv8bf): New.
 (vst4): Used new iterators.
 (vst4qa): Used new iterators.
 (vst4qb): Used new iterators.
 (vld4_dup): Used new iterators.
 (vld4_dupv8bf): New.


gcc/testsuite/ChangeLog:

2020-03-04  Delia Burduv  

    * gcc.target/arm/simd/bf16_vldn_1.c: New test.

Thanks,
Delia

On 2/19/20 5:25 PM, Delia Burduv wrote:
>
> Hi,
>
> Here is the latest version of the patch. It just has some minor
> formatting changes that were brought up by Richard Sandiford in the
> AArch64 patches
>
> Thanks,
> Delia
>
> On 1/22/20 5:31 PM, Delia Burduv wrote:
>> Ping.
>>
>> I will change the tests to use the exact input and output registers as
>> Richard Sandiford suggested for the AArch64 patches.
>>
>> On 12/20/19 6:48 PM, Delia Burduv wrote:
>>> This patch adds the ARMv8.6 ACLE BFloat16 load intrinsics
>>> vld{q}_bf16 as part of the BFloat16 extension.
>>> 
(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) 


>>>
>>> The intrinsics are declared in arm_neon.h .
>>> A new test is added to check assembler output.
>>>
>>> This patch depends on the Arm back-end patch.
>>> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>>>
>>> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't
>>> have commit rights, so if this is ok can someone please commit it for
>>> me?
>>>
>>> gcc/ChangeLog:
>>>
>>> 2019-11-14  Delia Burduv 
>>>
>>>  * config/arm/arm_neon.h (bfloat16_t): New typedef.
>>>  (bfloat16x4x2_t): New typedef.
>>>  (bfloat16x8x2_t): New typedef.
>>>  (bfloat16x4x3_t): New typedef.
>>>  (bfloat16x8x3_t): New typedef.
>>>  (bfloat16x4x4_t): New typedef.
>>>  (bfloat16x8x4_t): New typedef.
>>>  (vld2_bf16): New.
>>>  (vld2q_bf16): New.
>>>  (vld3_bf16): New.
>>>  (vld3q_bf16): New.
>>>  (vld4_bf16): New.
>>>  (vld4q_bf16): New.
>>>  (vld2_dup_bf16): New.
>>>  (vld2q_dup_bf16): New.
>>>   (vld3_dup_bf16): New.
>>>  (vld3q_dup_bf16): New.
>>>  (vld4_dup_bf16): New.
>>>  (vld4q_dup_bf16): New.
>>>  * config/arm/arm-builtins.c (E_V2BFmode): New mode.
>>>  (VAR13): New.
>>>  (arm_simd_types[Bfloat16x2_t]):New type.
>>>  * config/arm/arm-modes.def (V2BF): New mode.
>>>  * config/arm/arm-simd-builtin-types.def
>>>  (Bfloat16x2_t): New entry.
>>>  * config/arm/arm_neon_builtins.def
>>>  (vld2): Changed to VAR13 and added v4bf, v8bf
>>>  (vld2_dup): Changed to VAR8 and added v4bf, v8bf
>>>  (vld3): Changed to VAR13 and added v4bf, v8bf
>>>  (vld3_dup): Changed to VAR8 an

Re: ACLE intrinsics: BFloat16 store (vst{q}_bf16) intrinsics for AArch32

2020-03-05 Thread Delia Burduv

Hi,

This is the latest version of the patch. I am forcing -mfloat-abi=hard 
because the register allocator behaves differently depending on the 
float-abi used.


Thanks,
Delia

On 3/4/20 5:20 PM, Kyrill Tkachov wrote:

Hi Delia,

On 3/3/20 5:23 PM, Delia Burduv wrote:

Hi,

I noticed that the patch doesn't apply cleanly. I fixed it and this is 
the latest version.


Thanks,
Delia

On 3/3/20 4:23 PM, Delia Burduv wrote:

Sorry, I forgot the attachment.

On 3/3/20 4:20 PM, Delia Burduv wrote:

Hi,

I made a mistake in the previous patch. This is the latest version. 
Please let me know if it is ok.


Thanks,
Delia

On 2/21/20 3:18 PM, Delia Burduv wrote:

Hi Kyrill,

The arm_bf16.h is only used for scalar operations. That is how the 
aarch64 versions are implemented too.


Thanks,
Delia

On 2/21/20 2:06 PM, Kyrill Tkachov wrote:

Hi Delia,

On 2/19/20 5:25 PM, Delia Burduv wrote:

Hi,

Here is the latest version of the patch. It just has some minor
formatting changes that were brought up by Richard Sandiford in the
AArch64 patches

Thanks,
Delia

On 1/22/20 5:29 PM, Delia Burduv wrote:
> Ping.
>
> I will change the tests to use the exact input and output registers as
> Richard Sandiford suggested for the AArch64 patches.
>
> On 12/20/19 6:46 PM, Delia Burduv wrote:
>> This patch adds the ARMv8.6 ACLE BFloat16 store intrinsics
>> vst{q}_bf16 as part of the BFloat16 extension.
>> 
(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) 


>>
>> The intrinsics are declared in arm_neon.h .
>> A new test is added to check assembler output.
>>
>> This patch depends on the Arm back-end patch.
>> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>>
>> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't
>> have commit rights, so if this is ok can someone please commit it for me?

>>
>> gcc/ChangeLog:
>>
>> 2019-11-14  Delia Burduv 
>>
>>  * config/arm/arm_neon.h (bfloat16_t): New typedef.
>>  (bfloat16x4x2_t): New typedef.
>>  (bfloat16x8x2_t): New typedef.
>>  (bfloat16x4x3_t): New typedef.
>>  (bfloat16x8x3_t): New typedef.
>>  (bfloat16x4x4_t): New typedef.
>>  (bfloat16x8x4_t): New typedef.
>>  (vst2_bf16): New.
>>  (vst2q_bf16): New.
>>  (vst3_bf16): New.
>>  (vst3q_bf16): New.
>>  (vst4_bf16): New.
>>  (vst4q_bf16): New.
>>  * config/arm/arm-builtins.c (E_V2BFmode): New mode.
>>  (VAR13): New.
>>  (arm_simd_types[Bfloat16x2_t]):New type.
>>  * config/arm/arm-modes.def (V2BF): New mode.
>>  * config/arm/arm-simd-builtin-types.def
>>  (Bfloat16x2_t): New entry.
>>  * config/arm/arm_neon_builtins.def
>>  (vst2): Changed to VAR13 and added v4bf, v8bf
>>  (vst3): Changed to VAR13 and added v4bf, v8bf
>>  (vst4): Changed to VAR13 and added v4bf, v8bf
>>  * config/arm/iterators.md (VDXBF): New iterator.
>>  (VQ2BF): New iterator.
>>  (V_elem): Added V4BF, V8BF.
>>  (V_sz_elem): Added V4BF, V8BF.
>>  (V_mode_nunits): Added V4BF, V8BF.
>>  (q): Added V4BF, V8BF.
>>  *config/arm/neon.md (vst2): Used new iterators.
>>      (vst3): Used new iterators.
>>  (vst3qa): Used new iterators.
>>  (vst3qb): Used new iterators.
>>  (vst4): Used new iterators.
>>  (vst4qa): Used new iterators.
>>  (vst4qb): Used new iterators.
>>
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2019-11-14  Delia Burduv 
>>
>>  * gcc.target/arm/simd/bf16_vstn_1.c: New test.


One thing I just noticed in this and the other arm bfloat16 
patches...


diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 
3c78f435009ab027f92693d00ab5b40960d5419d..fd81c18948db3a7f6e8e863d32511f75bf950e6a 
100644

--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18742,6 +18742,89 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, 
float32x4_t __a, float32x4_t __b,

    return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, __index);
  }

+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+typedef struct bfloat16x4x2_t
+{
+  bfloat16x4_t val[2];
+} bfloat16x4x2_t;


These should be in a new arm_bf16.h file that gets included in the 
main arm_neon.h file, right?

I believe the aarch64 versions are implemented that way.

Otherwise the patch looks good to me.
Thanks!
Kyrill


  +
+typedef struct bfloat16x8x2_t
+{
+  bfloat16x8_t val[2];
+} bfloat16x8x2_t;
+



diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_vstn_1.c 
b/gc

Re: ACLE intrinsics: BFloat16 store (vst{q}_bf16) intrinsics for AArch32

2020-03-05 Thread Delia Burduv

Hi,

This is the latest version of the patch. I am forcing -mfloat-abi=hard 
because the register allocator behaves differently depending on which 
float-abi is used.


Thanks,
Delia

On 3/4/20 5:20 PM, Kyrill Tkachov wrote:

Hi Delia,

On 3/3/20 5:23 PM, Delia Burduv wrote:

Hi,

I noticed that the patch doesn't apply cleanly. I fixed it and this is 
the latest version.


Thanks,
Delia

On 3/3/20 4:23 PM, Delia Burduv wrote:

Sorry, I forgot the attachment.

On 3/3/20 4:20 PM, Delia Burduv wrote:

Hi,

I made a mistake in the previous patch. This is the latest version. 
Please let me know if it is ok.


Thanks,
Delia

On 2/21/20 3:18 PM, Delia Burduv wrote:

Hi Kyrill,

The arm_bf16.h is only used for scalar operations. That is how the 
aarch64 versions are implemented too.


Thanks,
Delia

On 2/21/20 2:06 PM, Kyrill Tkachov wrote:

Hi Delia,

On 2/19/20 5:25 PM, Delia Burduv wrote:

Hi,

Here is the latest version of the patch. It just has some minor
formatting changes that were brought up by Richard Sandiford in the
AArch64 patches

Thanks,
Delia

On 1/22/20 5:29 PM, Delia Burduv wrote:
> Ping.
>
> I will change the tests to use the exact input and output registers as
> Richard Sandiford suggested for the AArch64 patches.
>
> On 12/20/19 6:46 PM, Delia Burduv wrote:
>> This patch adds the ARMv8.6 ACLE BFloat16 store intrinsics
>> vst{q}_bf16 as part of the BFloat16 extension.
>> 
(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) 


>>
>> The intrinsics are declared in arm_neon.h .
>> A new test is added to check assembler output.
>>
>> This patch depends on the Arm back-end patch.
>> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>>
>> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't
>> have commit rights, so if this is ok can someone please commit it for me?

>>
>> gcc/ChangeLog:
>>
>> 2019-11-14  Delia Burduv 
>>
>>  * config/arm/arm_neon.h (bfloat16_t): New typedef.
>>  (bfloat16x4x2_t): New typedef.
>>  (bfloat16x8x2_t): New typedef.
>>  (bfloat16x4x3_t): New typedef.
>>  (bfloat16x8x3_t): New typedef.
>>  (bfloat16x4x4_t): New typedef.
>>  (bfloat16x8x4_t): New typedef.
>>  (vst2_bf16): New.
>>  (vst2q_bf16): New.
>>  (vst3_bf16): New.
>>  (vst3q_bf16): New.
>>  (vst4_bf16): New.
>>  (vst4q_bf16): New.
>>  * config/arm/arm-builtins.c (E_V2BFmode): New mode.
>>  (VAR13): New.
>>  (arm_simd_types[Bfloat16x2_t]):New type.
>>  * config/arm/arm-modes.def (V2BF): New mode.
>>  * config/arm/arm-simd-builtin-types.def
>>  (Bfloat16x2_t): New entry.
>>  * config/arm/arm_neon_builtins.def
>>  (vst2): Changed to VAR13 and added v4bf, v8bf
>>  (vst3): Changed to VAR13 and added v4bf, v8bf
>>  (vst4): Changed to VAR13 and added v4bf, v8bf
>>  * config/arm/iterators.md (VDXBF): New iterator.
>>  (VQ2BF): New iterator.
>>  (V_elem): Added V4BF, V8BF.
>>  (V_sz_elem): Added V4BF, V8BF.
>>  (V_mode_nunits): Added V4BF, V8BF.
>>  (q): Added V4BF, V8BF.
>>  *config/arm/neon.md (vst2): Used new iterators.
>>      (vst3): Used new iterators.
>>  (vst3qa): Used new iterators.
>>  (vst3qb): Used new iterators.
>>  (vst4): Used new iterators.
>>  (vst4qa): Used new iterators.
>>  (vst4qb): Used new iterators.
>>
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2019-11-14  Delia Burduv 
>>
>>  * gcc.target/arm/simd/bf16_vstn_1.c: New test.


One thing I just noticed in this and the other arm bfloat16 
patches...


diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 
3c78f435009ab027f92693d00ab5b40960d5419d..fd81c18948db3a7f6e8e863d32511f75bf950e6a 
100644

--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18742,6 +18742,89 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, 
float32x4_t __a, float32x4_t __b,

    return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, __index);
  }

+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+typedef struct bfloat16x4x2_t
+{
+  bfloat16x4_t val[2];
+} bfloat16x4x2_t;


These should be in a new arm_bf16.h file that gets included in the 
main arm_neon.h file, right?

I believe the aarch64 versions are implemented that way.

Otherwise the patch looks good to me.
Thanks!
Kyrill


  +
+typedef struct bfloat16x8x2_t
+{
+  bfloat16x8_t val[2];
+} bfloat16x8x2_t;
+



diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_vstn_1.c 
b/gc

Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma for AArch32 AdvSIMD

2020-03-04 Thread Delia Burduv

Hi,

This is the latest version of the patch.

Thanks,
Delia

On 2/21/20 11:41 AM, Kyrill Tkachov wrote:

Hi Delia,

On 2/19/20 5:23 PM, Delia Burduv wrote:

Hi,

Here is the latest version of the patch. It just has some minor 
formatting changes that were brought up by Richard Sandiford in the 
AArch64 patches


Thanks,
Delia

On 1/31/20 3:23 PM, Delia Burduv wrote:
Here is the updated patch. The changes are minor, so let me know if 
there is anything else to fix or if it can be committed.


Thank you,
Delia

On 1/30/20 2:55 PM, Kyrill Tkachov wrote:

Hi Delia,


On 1/28/20 4:44 PM, Delia Burduv wrote:

Ping.
 


*From:* Delia Burduv 
*Sent:* 22 January 2020 17:26
*To:* gcc-patches@gcc.gnu.org 
*Cc:* ni...@redhat.com ; Richard Earnshaw 
; Ramana Radhakrishnan 
; Kyrylo Tkachov 

*Subject:* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla 
and vfma for AArch32 AdvSIMD

Ping.

I have read Richard Sandiford's comments on the AArch64 patches and I
will apply what is relevant to this patch as well. Particularly, I 
will
change the tests to use the exact input and output registers and I 
will

change the types of the rtl patterns.



Please send the updated patches so that someone can commit them for 
you once they're reviewed.


Thanks,

Kyrill




On 12/20/19 6:44 PM, Delia Burduv wrote:
> This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and vfmat
> as part of the BFloat16 extension.
> (https://developer.arm.com/docs/101028/latest.)
> The intrinsics are declared in arm_neon.h and the RTL patterns are
> defined in neon.md.
> Two new tests are added to check assembler output and lane indices.
>
> This patch depends on the Arm back-end patch.
> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>
> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have
> commit rights, so if this is ok can someone please commit it for me?
>
> gcc/ChangeLog:
>
> 2019-11-12  Delia Burduv 
>
>     * config/arm/arm_neon.h (vbfmmlaq_f32): New.
>     (vbfmlalbq_f32): New.
>     (vbfmlaltq_f32): New.
>     (vbfmlalbq_lane_f32): New.
>     (vbfmlaltq_lane_f32): New.
>     (vbfmlalbq_laneq_f32): New.
>     (vbfmlaltq_laneq_f32): New.
>     * config/arm/arm_neon_builtins.def (vbfmmla): New.
>     (vbfmab): New.
>     (vbfmat): New.
>     (vbfmab_lane): New.
>     (vbfmat_lane): New.
>     (vbfmab_laneq): New.
>     (vbfmat_laneq): New.
>     * config/arm/iterators.md (BF_MA): New int iterator.
>     (bt): New int attribute.
>     (VQXBF): Copy of VQX with V8BF.
>     (V_HALF): Added V8BF.
>     * config/arm/neon.md (neon_vbfmmlav8hi): New insn.
>     (neon_vbfmav8hi): New insn.
>     (neon_vbfma_lanev8hi): New insn.
>     (neon_vbfma_laneqv8hi): New expand.
>     (neon_vget_high): Changed iterator to VQXBF.
>     * config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC.
>     (UNSPEC_BFMAB): New UNSPEC.
>     (UNSPEC_BFMAT): New UNSPEC.
>
> 2019-11-12  Delia Burduv 
>
>     * gcc.target/arm/simd/bf16_ma_1.c: New test.
>     * gcc.target/arm/simd/bf16_ma_2.c: New test.
>     * gcc.target/arm/simd/bf16_mmla_1.c: New test.


This looks good, a few minor things though...


diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 
3c78f435009ab027f92693d00ab5b40960d5419d..81f8008ea6a5fb11eb09f6685ba24bb0c54fb248 
100644

--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18742,6 +18742,64 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, 
float32x4_t __a, float32x4_t __b,

 �� return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, __index);
 �}

+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+� return __builtin_neon_vbfmmlav8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+� return __builtin_neon_vbfmabv8bf (__r, __a, __b);
+}
+
+__extension__ ext

Re: ACLE intrinsics: BFloat16 load intrinsics for AArch32

2020-03-04 Thread Delia Burduv

Hi,

The previous version of this patch shared part of its code with the 
store intrinsics patch 
(https://gcc.gnu.org/ml/gcc-patches/2020-03/msg00145.html) so I removed 
any duplicated code. This patch now depends on the previously mentioned 
store intrinsics patch.


Here is the latest version and the updated ChangeLog.

gcc/ChangeLog:

2020-03-04  Delia Burduv  

* config/arm/arm_neon.h (bfloat16_t): New typedef.
(vld2_bf16): New.
(vld2q_bf16): New.
(vld3_bf16): New.
(vld3q_bf16): New.
(vld4_bf16): New.
(vld4q_bf16): New.
(vld2_dup_bf16): New.
(vld2q_dup_bf16): New.
(vld3_dup_bf16): New.
(vld3q_dup_bf16): New.
(vld4_dup_bf16): New.
(vld4q_dup_bf16): New.
* config/arm/arm_neon_builtins.def
(vld2): Changed to VAR13 and added v4bf, v8bf
(vld2_dup): Changed to VAR8 and added v4bf, v8bf
(vld3): Changed to VAR13 and added v4bf, v8bf
(vld3_dup): Changed to VAR8 and added v4bf, v8bf
(vld4): Changed to VAR13 and added v4bf, v8bf
(vld4_dup): Changed to VAR8 and added v4bf, v8bf
* config/arm/iterators.md (VDXBF): New iterator.
(VQ2BF): New iterator.
*config/arm/neon.md (vld2): Used new iterators.
(vld2_dup): Used new iterators.
(vld2_dupv8bf): New.
(vst3): Used new iterators.
(vst3qa): Used new iterators.
(vst3qb): Used new iterators.
(vld3_dup): Used new iterators.
(vld3_dupv8bf): New.
(vst4): Used new iterators.
(vst4qa): Used new iterators.
(vst4qb): Used new iterators.
(vld4_dup): Used new iterators.
(vld4_dupv8bf): New.


gcc/testsuite/ChangeLog:

2020-03-04  Delia Burduv  

* gcc.target/arm/simd/bf16_vldn_1.c: New test.

Thanks,
Delia

On 2/19/20 5:25 PM, Delia Burduv wrote:


Hi,

Here is the latest version of the patch. It just has some minor 
formatting changes that were brought up by Richard Sandiford in the 
AArch64 patches


Thanks,
Delia

On 1/22/20 5:31 PM, Delia Burduv wrote:

Ping.

I will change the tests to use the exact input and output registers as 
Richard Sandiford suggested for the AArch64 patches.


On 12/20/19 6:48 PM, Delia Burduv wrote:
This patch adds the ARMv8.6 ACLE BFloat16 load intrinsics 
vld{q}_bf16 as part of the BFloat16 extension.
(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) 


The intrinsics are declared in arm_neon.h .
A new test is added to check assembler output.

This patch depends on the Arm back-end patch. 
(https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)


Tested for regression on arm-none-eabi and armeb-none-eabi. I don't 
have commit rights, so if this is ok can someone please commit it for 
me?


gcc/ChangeLog:

2019-11-14  Delia Burduv  

 * config/arm/arm_neon.h (bfloat16_t): New typedef.
 (bfloat16x4x2_t): New typedef.
 (bfloat16x8x2_t): New typedef.
 (bfloat16x4x3_t): New typedef.
 (bfloat16x8x3_t): New typedef.
 (bfloat16x4x4_t): New typedef.
 (bfloat16x8x4_t): New typedef.
 (vld2_bf16): New.
 (vld2q_bf16): New.
 (vld3_bf16): New.
 (vld3q_bf16): New.
 (vld4_bf16): New.
 (vld4q_bf16): New.
 (vld2_dup_bf16): New.
 (vld2q_dup_bf16): New.
  (vld3_dup_bf16): New.
 (vld3q_dup_bf16): New.
 (vld4_dup_bf16): New.
 (vld4q_dup_bf16): New.
 * config/arm/arm-builtins.c (E_V2BFmode): New mode.
 (VAR13): New.
 (arm_simd_types[Bfloat16x2_t]):New type.
 * config/arm/arm-modes.def (V2BF): New mode.
 * config/arm/arm-simd-builtin-types.def
 (Bfloat16x2_t): New entry.
 * config/arm/arm_neon_builtins.def
 (vld2): Changed to VAR13 and added v4bf, v8bf
 (vld2_dup): Changed to VAR8 and added v4bf, v8bf
 (vld3): Changed to VAR13 and added v4bf, v8bf
 (vld3_dup): Changed to VAR8 and added v4bf, v8bf
 (vld4): Changed to VAR13 and added v4bf, v8bf
 (vld4_dup): Changed to VAR8 and added v4bf, v8bf
 * config/arm/iterators.md (VDXBF): New iterator.
 (VQ2BF): New iterator.
 (V_elem): Added V4BF, V8BF.
 (V_sz_elem): Added V4BF, V8BF.
 (V_mode_nunits): Added V4BF, V8BF.
 (q): Added V4BF, V8BF.
 *config/arm/neon.md (vld2): Used new iterators.
 (vld2_dup): Used new iterators.
 (vld2_dupv8bf): New.
 (vst3): Used new iterators.
 (vst3qa): Used new iterators.
 (vst3qb): Used new iterators.
 (vld3_dup): Used new iterators.
 (vld3_dupv8bf): New.
 (vst4): Used new iterators.
 (vst4qa): Used new iterators.
 (vst4qb): Used new iterators.
 (vld4_dup): Used new iterators.
 (vld4_dupv8bf): New.


gcc/testsuite/ChangeLog:

2019-11-14  Delia Burduv  

 * gcc.target/arm/simd/bf16_vldn_1.c: New test.
diff --git a/gcc

Re: ACLE intrinsics: BFloat16 store (vst{q}_bf16) intrinsics for AArch32

2020-03-03 Thread Delia Burduv

Hi,

I noticed that the patch doesn't apply cleanly. I fixed it and this is 
the latest version.


Thanks,
Delia

On 3/3/20 4:23 PM, Delia Burduv wrote:

Sorry, I forgot the attachment.

On 3/3/20 4:20 PM, Delia Burduv wrote:

Hi,

I made a mistake in the previous patch. This is the latest version. 
Please let me know if it is ok.


Thanks,
Delia

On 2/21/20 3:18 PM, Delia Burduv wrote:

Hi Kyrill,

The arm_bf16.h is only used for scalar operations. That is how the 
aarch64 versions are implemented too.


Thanks,
Delia

On 2/21/20 2:06 PM, Kyrill Tkachov wrote:

Hi Delia,

On 2/19/20 5:25 PM, Delia Burduv wrote:

Hi,

Here is the latest version of the patch. It just has some minor
formatting changes that were brought up by Richard Sandiford in the
AArch64 patches

Thanks,
Delia

On 1/22/20 5:29 PM, Delia Burduv wrote:
> Ping.
>
> I will change the tests to use the exact input and output registers as
> Richard Sandiford suggested for the AArch64 patches.
>
> On 12/20/19 6:46 PM, Delia Burduv wrote:
>> This patch adds the ARMv8.6 ACLE BFloat16 store intrinsics
>> vst{q}_bf16 as part of the BFloat16 extension.
>> 
(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) 


>>
>> The intrinsics are declared in arm_neon.h .
>> A new test is added to check assembler output.
>>
>> This patch depends on the Arm back-end patch.
>> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>>
>> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't
>> have commit rights, so if this is ok can someone please commit it for me?

>>
>> gcc/ChangeLog:
>>
>> 2019-11-14  Delia Burduv 
>>
>>  * config/arm/arm_neon.h (bfloat16_t): New typedef.
>>  (bfloat16x4x2_t): New typedef.
>>  (bfloat16x8x2_t): New typedef.
>>  (bfloat16x4x3_t): New typedef.
>>  (bfloat16x8x3_t): New typedef.
>>  (bfloat16x4x4_t): New typedef.
>>  (bfloat16x8x4_t): New typedef.
>>  (vst2_bf16): New.
>>  (vst2q_bf16): New.
>>  (vst3_bf16): New.
>>  (vst3q_bf16): New.
>>  (vst4_bf16): New.
>>  (vst4q_bf16): New.
>>  * config/arm/arm-builtins.c (E_V2BFmode): New mode.
>>  (VAR13): New.
>>  (arm_simd_types[Bfloat16x2_t]):New type.
>>  * config/arm/arm-modes.def (V2BF): New mode.
>>  * config/arm/arm-simd-builtin-types.def
>>  (Bfloat16x2_t): New entry.
>>  * config/arm/arm_neon_builtins.def
>>  (vst2): Changed to VAR13 and added v4bf, v8bf
>>  (vst3): Changed to VAR13 and added v4bf, v8bf
>>  (vst4): Changed to VAR13 and added v4bf, v8bf
>>  * config/arm/iterators.md (VDXBF): New iterator.
>>  (VQ2BF): New iterator.
>>  (V_elem): Added V4BF, V8BF.
>>  (V_sz_elem): Added V4BF, V8BF.
>>  (V_mode_nunits): Added V4BF, V8BF.
>>  (q): Added V4BF, V8BF.
>>  *config/arm/neon.md (vst2): Used new iterators.
>>      (vst3): Used new iterators.
>>  (vst3qa): Used new iterators.
>>  (vst3qb): Used new iterators.
>>  (vst4): Used new iterators.
>>  (vst4qa): Used new iterators.
>>  (vst4qb): Used new iterators.
>>
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2019-11-14  Delia Burduv 
>>
>>  * gcc.target/arm/simd/bf16_vstn_1.c: New test.


One thing I just noticed in this and the other arm bfloat16 patches...

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 
3c78f435009ab027f92693d00ab5b40960d5419d..fd81c18948db3a7f6e8e863d32511f75bf950e6a 
100644

--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18742,6 +18742,89 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, 
float32x4_t __a, float32x4_t __b,

    return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, __index);
  }

+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+typedef struct bfloat16x4x2_t
+{
+  bfloat16x4_t val[2];
+} bfloat16x4x2_t;


These should be in a new arm_bf16.h file that gets included in the 
main arm_neon.h file, right?

I believe the aarch64 versions are implemented that way.

Otherwise the patch looks good to me.
Thanks!
Kyrill


  +
+typedef struct bfloat16x8x2_t
+{
+  bfloat16x8_t val[2];
+} bfloat16x8x2_t;
+

diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 4d31405cf6e09e3a61faa3e8142940bbdb23c60a..e0561c58fb3367876ce0164880df76f7331ec4e8 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -342,6 +342,7 @@ arm_set_sat_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define v4bf_UP  E_V4BFmode
 #define

Re: [GCC][PATCH][AArch64] ACLE intrinsics for BFCVTN, BFCVTN2 (AArch64 AdvSIMD) and BFCVT (AArch64 FP)

2020-03-03 Thread Delia Burduv

Hi,

Here is the latest version of the patch.

On 2/18/20 1:51 PM, Richard Sandiford wrote:

Tamar Christina  writes:

Hi Richard,


..ffb5305e2e5ea1aadae07e82fd8ed6f9f247c1a9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c
@@ -0,0 +1,48 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */


The { target ... } isn't necessary here.  (Missed that in the other review, 
sorry.)



Why not? The advsimd-intrinsics tests are shared between both AArch32 and 
AArch64.


Ah, so they are.  Think it would be better to move them to a new
gcc.target/arm-common or something in that case.  Tests in
gcc.target/aarch64 really ought to be specific to aarch64.

Thanks,
Richard



I left the advsimd-intrinsics tests shared since creating a new 
gcc.target/arm-common should probably be a separate patch.


Let me know if this patch is ok. And if it is, can someone please commit 
it for me?


Thanks,
Delia



Tamar.


+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include 
+
+/*
+**test_bfcvtn:
+** bfcvtn\tv0.4h, v0.4s


Like with the other review, I think the literal tab you had in the original 
patch
looks better than \t.


[...]
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c
new file mode 100644
index ..8d7dffe16275de60e884c449afa0fea0b1af6081
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c
@@ -0,0 +1,15 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */


This needs:

/* { dg-require-effective-target aarch64_asm_bf16_ok } */

(Doesn't exist yet, but I hope to post a patch soon.)

Looks good otherwise, thanks.

Richard
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index a118f4f121de067c0a80f691b852247b0ab27f7a..c1e364b4d1cb7a207c1de5a409a08e18a405a107 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -692,3 +692,9 @@
   VAR2 (TERNOP, bfdot, 0, v2sf, v4sf)
   VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf)
   VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf)
+
+  /* Implemented by aarch64_bfcvtn{q}{2}  */
+  VAR1 (UNOP, bfcvtn, 0, v4bf)
+  VAR1 (UNOP, bfcvtn_q, 0, v8bf)
+  VAR1 (BINOP, bfcvtn2, 0, v8bf)
+  VAR1 (UNOP, bfcvt, 0, bf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 97f46f96968a6bc2f93bbc812931537b819b3b19..111e48ea6b70548158ba696d997a2f2fc3cb2769 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7091,3 +7091,32 @@
 }
   [(set_attr "type" "neon_dot")]
 )
+
+;; bfcvtn
+(define_insn "aarch64_bfcvtn"
+  [(set (match_operand:V4SF_TO_BF 0 "register_operand" "=w")
+(unspec:V4SF_TO_BF [(match_operand:V4SF 1 "register_operand" "w")]
+UNSPEC_BFCVTN))]
+  "TARGET_BF16_SIMD"
+  "bfcvtn\\t%0.4h, %1.4s"
+  [(set_attr "type" "neon_fp_cvt_narrow_s_q")]
+)
+
+(define_insn "aarch64_bfcvtn2v8bf"
+  [(set (match_operand:V8BF 0 "register_operand" "=w")
+(unspec:V8BF [(match_operand:V8BF 1 "register_operand" "0")
+  (match_operand:V4SF 2 "register_operand" "w")]
+  UNSPEC_BFCVTN2))]
+  "TARGET_BF16_SIMD"
+  "bfcvtn2\\t%0.8h, %2.4s"
+  [(set_attr "type" "neon_fp_cvt_narrow_s_q")]
+)
+
+(define_insn "aarch64_bfcvtbf"
+  [(set (match_operand:BF 0 "register_operand" "=w")
+(unspec:BF [(match_operand:SF 1 "register_operand" "w")]
+UNSPEC_BFCVT))]
+  "TARGET_BF16_FP"
+  "bfcvt\\t%h0, %s1"
+  [(set_attr "type" "f_cvt")]
+)
diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
index 3759c0d1cb449a7f0125cc2a1433127564d66622..fa7080c2953bc3254f01d842a8afef917d469080 100644
--- a/gcc/config/aarch64/arm_bf16.h
+++ b/gcc/config/aarch64/arm_bf16.h
@@ -27,6 +27,19 @@
 #ifndef _AARCH64_BF16_H_
 #define _AARCH64_BF16_H_
 
+#pragma GCC push_options
+#pragma GCC target ("+nothing+bf16")
+
 typedef __bf16 bfloat16_t;
+typedef float float32_t;
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvth_bf16_f32 (float32_t __a)
+{
+  return __builtin_aarch64_bfcvtbf (__a);
+}
+
+#pragma GCC pop_options
 
 #endif
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 7f05c3f9eca844b0e7b824a191223a4906c825b1..36f82743231a7160050695267e75a08e0cd73e03 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34660,6 +34660,27 @@ vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t 

Re: ACLE intrinsics: BFloat16 store (vst{q}_bf16) intrinsics for AArch32

2020-03-03 Thread Delia Burduv

Sorry, I forgot the attachment.

On 3/3/20 4:20 PM, Delia Burduv wrote:

Hi,

I made a mistake in the previous patch. This is the latest version. 
Please let me know if it is ok.


Thanks,
Delia

On 2/21/20 3:18 PM, Delia Burduv wrote:

Hi Kyrill,

The arm_bf16.h is only used for scalar operations. That is how the 
aarch64 versions are implemented too.


Thanks,
Delia

On 2/21/20 2:06 PM, Kyrill Tkachov wrote:

Hi Delia,

On 2/19/20 5:25 PM, Delia Burduv wrote:

Hi,

Here is the latest version of the patch. It just has some minor
formatting changes that were brought up by Richard Sandiford in the
AArch64 patches

Thanks,
Delia

On 1/22/20 5:29 PM, Delia Burduv wrote:
> Ping.
>
> I will change the tests to use the exact input and output registers as
> Richard Sandiford suggested for the AArch64 patches.
>
> On 12/20/19 6:46 PM, Delia Burduv wrote:
>> This patch adds the ARMv8.6 ACLE BFloat16 store intrinsics
>> vst{q}_bf16 as part of the BFloat16 extension.
>> 
(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) 


>>
>> The intrinsics are declared in arm_neon.h .
>> A new test is added to check assembler output.
>>
>> This patch depends on the Arm back-end patch.
>> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>>
>> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't
>> have commit rights, so if this is ok can someone please commit it
>> for me?
>>
>> gcc/ChangeLog:
>>
>> 2019-11-14  Delia Burduv 
>>
>>  * config/arm/arm_neon.h (bfloat16_t): New typedef.
>>  (bfloat16x4x2_t): New typedef.
>>  (bfloat16x8x2_t): New typedef.
>>  (bfloat16x4x3_t): New typedef.
>>  (bfloat16x8x3_t): New typedef.
>>  (bfloat16x4x4_t): New typedef.
>>  (bfloat16x8x4_t): New typedef.
>>  (vst2_bf16): New.
>>  (vst2q_bf16): New.
>>  (vst3_bf16): New.
>>  (vst3q_bf16): New.
>>  (vst4_bf16): New.
>>  (vst4q_bf16): New.
>>  * config/arm/arm-builtins.c (E_V2BFmode): New mode.
>>  (VAR13): New.
>>  (arm_simd_types[Bfloat16x2_t]): New type.
>>  * config/arm/arm-modes.def (V2BF): New mode.
>>  * config/arm/arm-simd-builtin-types.def
>>  (Bfloat16x2_t): New entry.
>>  * config/arm/arm_neon_builtins.def
>>  (vst2): Changed to VAR13 and added v4bf, v8bf
>>  (vst3): Changed to VAR13 and added v4bf, v8bf
>>  (vst4): Changed to VAR13 and added v4bf, v8bf
>>  * config/arm/iterators.md (VDXBF): New iterator.
>>  (VQ2BF): New iterator.
>>  (V_elem): Added V4BF, V8BF.
>>  (V_sz_elem): Added V4BF, V8BF.
>>  (V_mode_nunits): Added V4BF, V8BF.
>>  (q): Added V4BF, V8BF.
>>  * config/arm/neon.md (vst2): Used new iterators.
>>  (vst3): Used new iterators.
>>  (vst3qa): Used new iterators.
>>  (vst3qb): Used new iterators.
>>  (vst4): Used new iterators.
>>  (vst4qa): Used new iterators.
>>  (vst4qb): Used new iterators.
>>
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2019-11-14  Delia Burduv 
>>
>>  * gcc.target/arm/simd/bf16_vstn_1.c: New test.


One thing I just noticed in this and the other arm bfloat16 patches...

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 
3c78f435009ab027f92693d00ab5b40960d5419d..fd81c18948db3a7f6e8e863d32511f75bf950e6a 
100644

--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18742,6 +18742,89 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, 
float32x4_t __a, float32x4_t __b,

    return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, __index);
  }

+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+typedef struct bfloat16x4x2_t
+{
+  bfloat16x4_t val[2];
+} bfloat16x4x2_t;


These should be in a new arm_bf16.h file that gets included in the 
main arm_neon.h file, right?

I believe the aarch64 versions are implemented that way.

Otherwise the patch looks good to me.
Thanks!
Kyrill


  +
+typedef struct bfloat16x8x2_t
+{
+  bfloat16x8_t val[2];
+} bfloat16x8x2_t;
+

diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 7f279cca6688c6f11948159666ee647ae533c61d..44c6f46fd63d5eaa1c3c84340d9acd017bb663e4 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -318,6 +318,7 @@ arm_set_sat_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define v4bf_UP  E_V4BFmode
 #define v2si_UP  E_V2SImode
 #define v2sf_UP  E_V2SFmode
+#define v2bf_UP  E_V2BFmode
 #define di_UP    E_DImode
 #define v16qi_UP E_V16QImode
 #define v8hi_UP  E_V8

Re: ACLE intrinsics: BFloat16 store (vst{q}_bf16) intrinsics for AArch32

2020-03-03 Thread Delia Burduv

Hi,

I made a mistake in the previous patch. This is the latest version. 
Please let me know if it is ok.


Thanks,
Delia

On 2/21/20 3:18 PM, Delia Burduv wrote:

Hi Kyrill,

The arm_bf16.h is only used for scalar operations. That is how the 
aarch64 versions are implemented too.


Thanks,
Delia

On 2/21/20 2:06 PM, Kyrill Tkachov wrote:

Hi Delia,

On 2/19/20 5:25 PM, Delia Burduv wrote:

Hi,

Here is the latest version of the patch. It just has some minor
formatting changes that were brought up by Richard Sandiford in the
AArch64 patches

Thanks,
Delia

On 1/22/20 5:29 PM, Delia Burduv wrote:
> Ping.
>
> I will change the tests to use the exact input and output registers as
> Richard Sandiford suggested for the AArch64 patches.
>
> On 12/20/19 6:46 PM, Delia Burduv wrote:
>> This patch adds the ARMv8.6 ACLE BFloat16 store intrinsics
>> vst{q}_bf16 as part of the BFloat16 extension.
>> 
(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) 


>>
>> The intrinsics are declared in arm_neon.h .
>> A new test is added to check assembler output.
>>
>> This patch depends on the Arm back-end patch.
>> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>>
>> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't
>> have commit rights, so if this is ok can someone please commit it
>> for me?
>>
>> gcc/ChangeLog:
>>
>> 2019-11-14  Delia Burduv 
>>
>>  * config/arm/arm_neon.h (bfloat16_t): New typedef.
>>  (bfloat16x4x2_t): New typedef.
>>  (bfloat16x8x2_t): New typedef.
>>  (bfloat16x4x3_t): New typedef.
>>  (bfloat16x8x3_t): New typedef.
>>  (bfloat16x4x4_t): New typedef.
>>  (bfloat16x8x4_t): New typedef.
>>  (vst2_bf16): New.
>>  (vst2q_bf16): New.
>>  (vst3_bf16): New.
>>  (vst3q_bf16): New.
>>  (vst4_bf16): New.
>>  (vst4q_bf16): New.
>>  * config/arm/arm-builtins.c (E_V2BFmode): New mode.
>>  (VAR13): New.
>>  (arm_simd_types[Bfloat16x2_t]): New type.
>>  * config/arm/arm-modes.def (V2BF): New mode.
>>  * config/arm/arm-simd-builtin-types.def
>>  (Bfloat16x2_t): New entry.
>>  * config/arm/arm_neon_builtins.def
>>  (vst2): Changed to VAR13 and added v4bf, v8bf
>>  (vst3): Changed to VAR13 and added v4bf, v8bf
>>  (vst4): Changed to VAR13 and added v4bf, v8bf
>>  * config/arm/iterators.md (VDXBF): New iterator.
>>  (VQ2BF): New iterator.
>>  (V_elem): Added V4BF, V8BF.
>>  (V_sz_elem): Added V4BF, V8BF.
>>  (V_mode_nunits): Added V4BF, V8BF.
>>  (q): Added V4BF, V8BF.
>>  * config/arm/neon.md (vst2): Used new iterators.
>>  (vst3): Used new iterators.
>>  (vst3qa): Used new iterators.
>>  (vst3qb): Used new iterators.
>>  (vst4): Used new iterators.
>>  (vst4qa): Used new iterators.
>>  (vst4qb): Used new iterators.
>>
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2019-11-14  Delia Burduv 
>>
>>  * gcc.target/arm/simd/bf16_vstn_1.c: New test.


One thing I just noticed in this and the other arm bfloat16 patches...

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 
3c78f435009ab027f92693d00ab5b40960d5419d..fd81c18948db3a7f6e8e863d32511f75bf950e6a 
100644

--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18742,6 +18742,89 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, 
float32x4_t __a, float32x4_t __b,

    return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, __index);
  }

+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+typedef struct bfloat16x4x2_t
+{
+  bfloat16x4_t val[2];
+} bfloat16x4x2_t;


These should be in a new arm_bf16.h file that gets included in the 
main arm_neon.h file, right?

I believe the aarch64 versions are implemented that way.

Otherwise the patch looks good to me.
Thanks!
Kyrill


  +
+typedef struct bfloat16x8x2_t
+{
+  bfloat16x8_t val[2];
+} bfloat16x8x2_t;
+



Re: ACLE intrinsics: BFloat16 store (vst{q}_bf16) intrinsics for AArch32

2020-02-21 Thread Delia Burduv

Hi Kyrill,

The arm_bf16.h is only used for scalar operations. That is how the 
aarch64 versions are implemented too.


Thanks,
Delia

On 2/21/20 2:06 PM, Kyrill Tkachov wrote:

Hi Delia,

On 2/19/20 5:25 PM, Delia Burduv wrote:

Hi,

Here is the latest version of the patch. It just has some minor
formatting changes that were brought up by Richard Sandiford in the
AArch64 patches

Thanks,
Delia

On 1/22/20 5:29 PM, Delia Burduv wrote:
> Ping.
>
> I will change the tests to use the exact input and output registers as
> Richard Sandiford suggested for the AArch64 patches.
>
> On 12/20/19 6:46 PM, Delia Burduv wrote:
>> This patch adds the ARMv8.6 ACLE BFloat16 store intrinsics
>> vst{q}_bf16 as part of the BFloat16 extension.
>> 
(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) 


>>
>> The intrinsics are declared in arm_neon.h .
>> A new test is added to check assembler output.
>>
>> This patch depends on the Arm back-end patch.
>> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>>
>> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't
>> have commit rights, so if this is ok can someone please commit it
>> for me?
>>
>> gcc/ChangeLog:
>>
>> 2019-11-14  Delia Burduv 
>>
>>  * config/arm/arm_neon.h (bfloat16_t): New typedef.
>>  (bfloat16x4x2_t): New typedef.
>>  (bfloat16x8x2_t): New typedef.
>>  (bfloat16x4x3_t): New typedef.
>>  (bfloat16x8x3_t): New typedef.
>>  (bfloat16x4x4_t): New typedef.
>>  (bfloat16x8x4_t): New typedef.
>>  (vst2_bf16): New.
>>  (vst2q_bf16): New.
>>  (vst3_bf16): New.
>>  (vst3q_bf16): New.
>>  (vst4_bf16): New.
>>  (vst4q_bf16): New.
>>  * config/arm/arm-builtins.c (E_V2BFmode): New mode.
>>  (VAR13): New.
>>  (arm_simd_types[Bfloat16x2_t]): New type.
>>  * config/arm/arm-modes.def (V2BF): New mode.
>>  * config/arm/arm-simd-builtin-types.def
>>  (Bfloat16x2_t): New entry.
>>  * config/arm/arm_neon_builtins.def
>>  (vst2): Changed to VAR13 and added v4bf, v8bf
>>  (vst3): Changed to VAR13 and added v4bf, v8bf
>>  (vst4): Changed to VAR13 and added v4bf, v8bf
>>  * config/arm/iterators.md (VDXBF): New iterator.
>>  (VQ2BF): New iterator.
>>  (V_elem): Added V4BF, V8BF.
>>  (V_sz_elem): Added V4BF, V8BF.
>>  (V_mode_nunits): Added V4BF, V8BF.
>>  (q): Added V4BF, V8BF.
>>  * config/arm/neon.md (vst2): Used new iterators.
>>  (vst3): Used new iterators.
>>  (vst3qa): Used new iterators.
>>  (vst3qb): Used new iterators.
>>  (vst4): Used new iterators.
>>  (vst4qa): Used new iterators.
>>  (vst4qb): Used new iterators.
>>
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2019-11-14  Delia Burduv 
>>
>>  * gcc.target/arm/simd/bf16_vstn_1.c: New test.


One thing I just noticed in this and the other arm bfloat16 patches...

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 
3c78f435009ab027f92693d00ab5b40960d5419d..fd81c18948db3a7f6e8e863d32511f75bf950e6a 
100644

--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18742,6 +18742,89 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, 
float32x4_t __a, float32x4_t __b,

    return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, __index);
  }

+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+typedef struct bfloat16x4x2_t
+{
+  bfloat16x4_t val[2];
+} bfloat16x4x2_t;


These should be in a new arm_bf16.h file that gets included in the main 
arm_neon.h file, right?

I believe the aarch64 versions are implemented that way.

Otherwise the patch looks good to me.
Thanks!
Kyrill


  +
+typedef struct bfloat16x8x2_t
+{
+  bfloat16x8_t val[2];
+} bfloat16x8x2_t;
+



Re: ACLE intrinsics: BFloat16 load intrinsics for AArch32

2020-02-19 Thread Delia Burduv


Hi,

Here is the latest version of the patch. It just has some minor 
formatting changes that were brought up by Richard Sandiford in the 
AArch64 patches


Thanks,
Delia

On 1/22/20 5:31 PM, Delia Burduv wrote:

Ping.

I will change the tests to use the exact input and output registers as 
Richard Sandiford suggested for the AArch64 patches.


On 12/20/19 6:48 PM, Delia Burduv wrote:
This patch adds the ARMv8.6 ACLE BFloat16 load intrinsics 
vld{q}_bf16 as part of the BFloat16 extension.
(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) 


The intrinsics are declared in arm_neon.h .
A new test is added to check assembler output.

This patch depends on the Arm back-end patch.
(https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)


Tested for regression on arm-none-eabi and armeb-none-eabi. I don't 
have commit rights, so if this is ok can someone please commit it for me?


gcc/ChangeLog:

2019-11-14  Delia Burduv  

 * config/arm/arm_neon.h (bfloat16_t): New typedef.
 (bfloat16x4x2_t): New typedef.
 (bfloat16x8x2_t): New typedef.
 (bfloat16x4x3_t): New typedef.
 (bfloat16x8x3_t): New typedef.
 (bfloat16x4x4_t): New typedef.
 (bfloat16x8x4_t): New typedef.
 (vld2_bf16): New.
 (vld2q_bf16): New.
 (vld3_bf16): New.
 (vld3q_bf16): New.
 (vld4_bf16): New.
 (vld4q_bf16): New.
 (vld2_dup_bf16): New.
 (vld2q_dup_bf16): New.
  (vld3_dup_bf16): New.
 (vld3q_dup_bf16): New.
 (vld4_dup_bf16): New.
 (vld4q_dup_bf16): New.
 * config/arm/arm-builtins.c (E_V2BFmode): New mode.
 (VAR13): New.
 (arm_simd_types[Bfloat16x2_t]): New type.
 * config/arm/arm-modes.def (V2BF): New mode.
 * config/arm/arm-simd-builtin-types.def
 (Bfloat16x2_t): New entry.
 * config/arm/arm_neon_builtins.def
 (vld2): Changed to VAR13 and added v4bf, v8bf
 (vld2_dup): Changed to VAR8 and added v4bf, v8bf
 (vld3): Changed to VAR13 and added v4bf, v8bf
 (vld3_dup): Changed to VAR8 and added v4bf, v8bf
 (vld4): Changed to VAR13 and added v4bf, v8bf
 (vld4_dup): Changed to VAR8 and added v4bf, v8bf
 * config/arm/iterators.md (VDXBF): New iterator.
 (VQ2BF): New iterator.
 (V_elem): Added V4BF, V8BF.
 (V_sz_elem): Added V4BF, V8BF.
 (V_mode_nunits): Added V4BF, V8BF.
 (q): Added V4BF, V8BF.
 * config/arm/neon.md (vld2): Used new iterators.
 (vld2_dup): Used new iterators.
 (vld2_dupv8bf): New.
 (vst3): Used new iterators.
 (vst3qa): Used new iterators.
 (vst3qb): Used new iterators.
 (vld3_dup): Used new iterators.
 (vld3_dupv8bf): New.
 (vst4): Used new iterators.
 (vst4qa): Used new iterators.
 (vst4qb): Used new iterators.
 (vld4_dup): Used new iterators.
 (vld4_dupv8bf): New.


gcc/testsuite/ChangeLog:

2019-11-14  Delia Burduv  

 * gcc.target/arm/simd/bf16_vldn_1.c: New test.
diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 7f279cca6688c6f11948159666ee647ae533c61d..44c6f46fd63d5eaa1c3c84340d9acd017bb663e4 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -318,6 +318,7 @@ arm_set_sat_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define v4bf_UP  E_V4BFmode
 #define v2si_UP  E_V2SImode
 #define v2sf_UP  E_V2SFmode
+#define v2bf_UP  E_V2BFmode
 #define di_UP    E_DImode
 #define v16qi_UP E_V16QImode
 #define v8hi_UP  E_V8HImode
@@ -381,6 +382,9 @@ typedef struct {
 #define VAR12(T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
   VAR11 (T, N, A, B, C, D, E, F, G, H, I, J, K) \
   VAR1 (T, N, L)
+#define VAR13(T, N, A, B, C, D, E, F, G, H, I, J, K, L, M) \
+  VAR12 (T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
+  VAR1 (T, N, M)
 
 /* The builtin data can be found in arm_neon_builtins.def, arm_vfp_builtins.def
and arm_acle_builtins.def.  The entries in arm_neon_builtins.def require
@@ -1013,6 +1017,7 @@ arm_init_simd_builtin_types (void)
   arm_simd_types[Float32x4_t].eltype = float_type_node;
 
   /* Init Bfloat vector types with underlying __bf16 scalar type.  */
+  arm_simd_types[Bfloat16x2_t].eltype = arm_bf16_type_node;
   arm_simd_types[Bfloat16x4_t].eltype = arm_bf16_type_node;
   arm_simd_types[Bfloat16x8_t].eltype = arm_bf16_type_node;
 
diff --git a/gcc/config/arm/arm-modes.def b/gcc/config/arm/arm-modes.def
index ea92ef35723f979c8bb1f6bfb4fbeb6cd1e4b6e9..6e48223b63d98fcbe38960700dd0949d74629f7f 100644
--- a/gcc/config/arm/arm-modes.def
+++ b/gcc/config/arm/arm-modes.def
@@ -80,6 +80,7 @@ VECTOR_MODE (FLOAT, HF, 2);   /* V2HF */
 
 FLOAT_MODE (BF, 2, 0);
 ADJUST_FLOAT_FORMAT (BF, _bfloat_half_format);
+VECTOR_MODE (FLOAT, BF, 2);   /* V2BF.  */
 VECTOR_MODE (FLOAT, BF, 4);   /*		 V4BF.  */
 VECTOR_MODE (FLOAT, BF, 8);   /*		 V8BF.  */
 
diff --git a/gcc/config

Re: ACLE intrinsics: BFloat16 store (vst{q}_bf16) intrinsics for AArch32

2020-02-19 Thread Delia Burduv

Hi,

Here is the latest version of the patch. It just has some minor 
formatting changes that were brought up by Richard Sandiford in the 
AArch64 patches


Thanks,
Delia

On 1/22/20 5:29 PM, Delia Burduv wrote:

Ping.

I will change the tests to use the exact input and output registers as 
Richard Sandiford suggested for the AArch64 patches.


On 12/20/19 6:46 PM, Delia Burduv wrote:
This patch adds the ARMv8.6 ACLE BFloat16 store intrinsics 
vst{q}_bf16 as part of the BFloat16 extension.
(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) 


The intrinsics are declared in arm_neon.h .
A new test is added to check assembler output.

This patch depends on the Arm back-end patch.
(https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)


Tested for regression on arm-none-eabi and armeb-none-eabi. I don't 
have commit rights, so if this is ok can someone please commit it for me?


gcc/ChangeLog:

2019-11-14  Delia Burduv  

 * config/arm/arm_neon.h (bfloat16_t): New typedef.
 (bfloat16x4x2_t): New typedef.
 (bfloat16x8x2_t): New typedef.
 (bfloat16x4x3_t): New typedef.
 (bfloat16x8x3_t): New typedef.
 (bfloat16x4x4_t): New typedef.
 (bfloat16x8x4_t): New typedef.
 (vst2_bf16): New.
 (vst2q_bf16): New.
 (vst3_bf16): New.
 (vst3q_bf16): New.
 (vst4_bf16): New.
 (vst4q_bf16): New.
 * config/arm/arm-builtins.c (E_V2BFmode): New mode.
 (VAR13): New.
 (arm_simd_types[Bfloat16x2_t]): New type.
 * config/arm/arm-modes.def (V2BF): New mode.
 * config/arm/arm-simd-builtin-types.def
 (Bfloat16x2_t): New entry.
 * config/arm/arm_neon_builtins.def
 (vst2): Changed to VAR13 and added v4bf, v8bf
 (vst3): Changed to VAR13 and added v4bf, v8bf
 (vst4): Changed to VAR13 and added v4bf, v8bf
 * config/arm/iterators.md (VDXBF): New iterator.
 (VQ2BF): New iterator.
 (V_elem): Added V4BF, V8BF.
 (V_sz_elem): Added V4BF, V8BF.
 (V_mode_nunits): Added V4BF, V8BF.
 (q): Added V4BF, V8BF.
 * config/arm/neon.md (vst2): Used new iterators.
 (vst3): Used new iterators.
 (vst3qa): Used new iterators.
 (vst3qb): Used new iterators.
 (vst4): Used new iterators.
 (vst4qa): Used new iterators.
 (vst4qb): Used new iterators.


gcc/testsuite/ChangeLog:

2019-11-14  Delia Burduv  

 * gcc.target/arm/simd/bf16_vstn_1.c: New test.
diff --git a/gcc/config/arm/.arm_neon.h.swp b/gcc/config/arm/.arm_neon.h.swp
new file mode 100644
index ..faf4293da996b3345914320d783fef94bc49a773
Binary files /dev/null and b/gcc/config/arm/.arm_neon.h.swp differ
diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 7f279cca6688c6f11948159666ee647ae533c61d..44c6f46fd63d5eaa1c3c84340d9acd017bb663e4 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -318,6 +318,7 @@ arm_set_sat_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define v4bf_UP  E_V4BFmode
 #define v2si_UP  E_V2SImode
 #define v2sf_UP  E_V2SFmode
+#define v2bf_UP  E_V2BFmode
 #define di_UP    E_DImode
 #define v16qi_UP E_V16QImode
 #define v8hi_UP  E_V8HImode
@@ -381,6 +382,9 @@ typedef struct {
 #define VAR12(T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
   VAR11 (T, N, A, B, C, D, E, F, G, H, I, J, K) \
   VAR1 (T, N, L)
+#define VAR13(T, N, A, B, C, D, E, F, G, H, I, J, K, L, M) \
+  VAR12 (T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
+  VAR1 (T, N, M)
 
 /* The builtin data can be found in arm_neon_builtins.def, arm_vfp_builtins.def
and arm_acle_builtins.def.  The entries in arm_neon_builtins.def require
@@ -1013,6 +1017,7 @@ arm_init_simd_builtin_types (void)
   arm_simd_types[Float32x4_t].eltype = float_type_node;
 
   /* Init Bfloat vector types with underlying __bf16 scalar type.  */
+  arm_simd_types[Bfloat16x2_t].eltype = arm_bf16_type_node;
   arm_simd_types[Bfloat16x4_t].eltype = arm_bf16_type_node;
   arm_simd_types[Bfloat16x8_t].eltype = arm_bf16_type_node;
 
diff --git a/gcc/config/arm/arm-modes.def b/gcc/config/arm/arm-modes.def
index ea92ef35723f979c8bb1f6bfb4fbeb6cd1e4b6e9..6e48223b63d98fcbe38960700dd0949d74629f7f 100644
--- a/gcc/config/arm/arm-modes.def
+++ b/gcc/config/arm/arm-modes.def
@@ -80,6 +80,7 @@ VECTOR_MODE (FLOAT, HF, 2);   /* V2HF */
 
 FLOAT_MODE (BF, 2, 0);
 ADJUST_FLOAT_FORMAT (BF, _bfloat_half_format);
+VECTOR_MODE (FLOAT, BF, 2);   /* V2BF.  */
 VECTOR_MODE (FLOAT, BF, 4);   /*		 V4BF.  */
 VECTOR_MODE (FLOAT, BF, 8);   /*		 V8BF.  */
 
diff --git a/gcc/config/arm/arm-simd-builtin-types.def b/gcc/config/arm/arm-simd-builtin-types.def
index ea3c9f97b71f03ac28d83266bcdaddcd0d42678b..e35bb765cdf60b127f844877ca938dfb674ec16a 100644
--- a/gcc/config/arm/arm-simd-builtin-types.def
+++ b/gcc/config/arm/arm-simd-builtin-types.def
@@ -48,5 +48,6 @@
   ENTRY

Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma for AArch32 AdvSIMD

2020-02-19 Thread Delia Burduv

Hi,

Here is the latest version of the patch. It just has some minor 
formatting changes that were brought up by Richard Sandiford in the 
AArch64 patches


Thanks,
Delia

On 1/31/20 3:23 PM, Delia Burduv wrote:
Here is the updated patch. The changes are minor, so let me know if 
there is anything else to fix or if it can be committed.


Thank you,
Delia

On 1/30/20 2:55 PM, Kyrill Tkachov wrote:

Hi Delia,


On 1/28/20 4:44 PM, Delia Burduv wrote:

Ping.

*From:* Delia Burduv 
*Sent:* 22 January 2020 17:26
*To:* gcc-patches@gcc.gnu.org 
*Cc:* ni...@redhat.com ; Richard Earnshaw 
; Ramana Radhakrishnan 
; Kyrylo Tkachov 
*Subject:* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla 
and vfma for AArch32 AdvSIMD

Ping.

I have read Richard Sandiford's comments on the AArch64 patches and I
will apply what is relevant to this patch as well. Particularly, I will
change the tests to use the exact input and output registers and I will
change the types of the rtl patterns.



Please send the updated patches so that someone can commit them for 
you once they're reviewed.


Thanks,

Kyrill




On 12/20/19 6:44 PM, Delia Burduv wrote:
> This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and vfmat
> as part of the BFloat16 extension.
> (https://developer.arm.com/docs/101028/latest.)
> The intrinsics are declared in arm_neon.h and the RTL patterns are
> defined in neon.md.
> Two new tests are added to check assembler output and lane indices.
>
> This patch depends on the Arm back-end patch.
> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>
> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have
> commit rights, so if this is ok can someone please commit it for me?
>
> gcc/ChangeLog:
>
> 2019-11-12  Delia Burduv
>
>     * config/arm/arm_neon.h (vbfmmlaq_f32): New.
>     (vbfmlalbq_f32): New.
>     (vbfmlaltq_f32): New.
>     (vbfmlalbq_lane_f32): New.
>     (vbfmlaltq_lane_f32): New.
>     (vbfmlalbq_laneq_f32): New.
>     (vbfmlaltq_laneq_f32): New.
>     * config/arm/arm_neon_builtins.def (vbfmmla): New.
>     (vbfmab): New.
>     (vbfmat): New.
>     (vbfmab_lane): New.
>     (vbfmat_lane): New.
>     (vbfmab_laneq): New.
>     (vbfmat_laneq): New.
>     * config/arm/iterators.md (BF_MA): New int iterator.
>     (bt): New int attribute.
>     (VQXBF): Copy of VQX with V8BF.
>     (V_HALF): Added V8BF.
>     * config/arm/neon.md (neon_vbfmmlav8hi): New insn.
>     (neon_vbfmav8hi): New insn.
>     (neon_vbfma_lanev8hi): New insn.
>     (neon_vbfma_laneqv8hi): New expand.
>     (neon_vget_high): Changed iterator to VQXBF.
>     * config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC.
>     (UNSPEC_BFMAB): New UNSPEC.
>     (UNSPEC_BFMAT): New UNSPEC.
>
> 2019-11-12  Delia Burduv
>
>     * gcc.target/arm/simd/bf16_ma_1.c: New test.
>     * gcc.target/arm/simd/bf16_ma_2.c: New test.
>     * gcc.target/arm/simd/bf16_mmla_1.c: New test.
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 3c78f435009ab027f92693d00ab5b40960d5419d..81f8008ea6a5fb11eb09f6685ba24bb0c54fb248 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18742,6 +18742,64 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b,
   return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, __index);
 }
 
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmmlav8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmabv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmatv8bf (__r, __a, __b);
+}
+
+__extens

Re: [GCC][PATCH][AArch64] ACLE intrinsics bfmmla and bfmlal for AArch64 AdvSIMD

2020-01-31 Thread Delia Burduv
Sure, here it is. I'll do that for the other patch too.

Thanks,
Delia

On 1/31/20 3:37 PM, Richard Sandiford wrote:
> Delia Burduv  writes:
>> Thank you, Richard!
>>
>> Here is the updated patch. The test that checks for errors when bf16 is
>> disabled is in the bfcvt patch.
> 
> Looks good.  Just a couple of very minor things...
> 
>>
>> Cheers,
>> Delia
>>
>> gcc/ChangeLog:
>>
>> 2019-11-06  Delia Burduv  
>>
>>   * config/aarch64/aarch64-simd-builtins.def
>>   (bfcvtn): New built-in function.
>>   (bfcvtn_q): New built-in function.
>>   (bfcvtn2): New built-in function.
>>   (bfcvt): New built-in function.
>>   * config/aarch64/aarch64-simd.md
>>   (aarch64_bfcvtn): New pattern.
>>   (aarch64_bfcvtn2v8bf): New pattern.
>>   (aarch64_bfcvtbf): New pattern.
>>   * config/aarch64/arm_bf16.h (float32_t): New typedef.
>>   (vcvth_bf16_f32): New intrinsic.
>>   * config/aarch64/arm_bf16.h (vcvt_bf16_f32): New intrinsic.
>>   (vcvtq_low_bf16_f32): New intrinsic.
>>   (vcvtq_high_bf16_f32): New intrinsic.
>>   * config/aarch64/iterators.md (V4SF_TO_BF): New mode iterator.
>>   (UNSPEC_BFCVTN): New UNSPEC.
>>   (UNSPEC_BFCVTN2): New UNSPEC.
>>   (UNSPEC_BFCVT): New UNSPEC.
>>   * config/arm/types.md (bf_cvt): New type.
> 
> The patch no longer changes types.md. :-)
> 
>> diff --git 
>> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c 
>> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c
>> new file mode 100644
>> index 
>> ..9feb7ee7905cb14037427a36797fc67a6fa3fbc8
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c
>> @@ -0,0 +1,67 @@
>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>> +/* { dg-add-options arm_v8_2a_bf16_neon } */
>> +/* { dg-additional-options "-save-temps" } */
>> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +**test_bfmlalb:
>> +**  bfmlalb\tv0.4s, v1.8h, v2.8h
> 
> This version uses \t while the previous one used literal tabs.
> TBH I think the literal tab is nicer (and what we use for SVE FWIW).
> 
> OK with those changes, thanks.  Seems silly to ask when the changes
> are so trivial, but: please could you post an updated patch so that
> I can apply verbatim?
> 
> Richard
> 
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index a118f4f121de067c0a80f691b852247b0ab27f7a..02b2154cf64dad02cf57b110af51b19dd7f91c51 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -692,3 +692,14 @@
   VAR2 (TERNOP, bfdot, 0, v2sf, v4sf)
   VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf)
   VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf)
+
+  /* Implemented by aarch64_bfmmlaqv4sf  */
+  VAR1 (TERNOP, bfmmlaq, 0, v4sf)
+
+  /* Implemented by aarch64_bfmlal{_lane{q}}v4sf  */
+  VAR1 (TERNOP, bfmlalb, 0, v4sf)
+  VAR1 (TERNOP, bfmlalt, 0, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalb_lane, 0, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalt_lane, 0, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, v4sf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 97f46f96968a6bc2f93bbc812931537b819b3b19..6ba72d7dc82ed02b5b5001a13ca896ab245a9d41 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7091,3 +7091,42 @@
 }
   [(set_attr "type" "neon_dot")]
 )
+
+;; bfmmla
+(define_insn "aarch64_bfmmlaqv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+(plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+   (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+ (match_operand:V8BF 3 "register_operand" "w")]
+UNSPEC_BFMMLA)))]
+  "TARGET_BF16_SIMD"
+  "bfmmla\\t%0.4s, %2.8h, %3.8h"
+  [(set_attr "type" "neon_fp_mla_s_q")]
+)
+
+;; bfmlal
+(define_insn "aarch64_bfmlalv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+(plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+(

Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma for AArch32 AdvSIMD

2020-01-31 Thread Delia Burduv
Here is the updated patch. The changes are minor, so let me know if 
there is anything else to fix or if it can be committed.

Thank you,
Delia

On 1/30/20 2:55 PM, Kyrill Tkachov wrote:
> Hi Delia,
> 
> 
> On 1/28/20 4:44 PM, Delia Burduv wrote:
>> Ping.
>> ----
>> *From:* Delia Burduv 
>> *Sent:* 22 January 2020 17:26
>> *To:* gcc-patches@gcc.gnu.org 
>> *Cc:* ni...@redhat.com ; Richard Earnshaw 
>> ; Ramana Radhakrishnan 
>> ; Kyrylo Tkachov 
>> *Subject:* Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla 
>> and vfma for AArch32 AdvSIMD
>> Ping.
>>
>> I have read Richard Sandiford's comments on the AArch64 patches and I
>> will apply what is relevant to this patch as well. Particularly, I will
>> change the tests to use the exact input and output registers and I will
>> change the types of the rtl patterns.
> 
> 
> Please send the updated patches so that someone can commit them for you 
> once they're reviewed.
> 
> Thanks,
> 
> Kyrill
> 
> 
>>
>> On 12/20/19 6:44 PM, Delia Burduv wrote:
>> > This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and vfmat
>> > as part of the BFloat16 extension.
>> > (https://developer.arm.com/docs/101028/latest.)
>> > The intrinsics are declared in arm_neon.h and the RTL patterns are
>> > defined in neon.md.
>> > Two new tests are added to check assembler output and lane indices.
>> >
>> > This patch depends on the Arm back-end patch.
>> > (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>> >
>> > Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have
>> > commit rights, so if this is ok can someone please commit it for me?
>> >
>> > gcc/ChangeLog:
>> >
>> > 2019-11-12  Delia Burduv 
>> >
>> >  * config/arm/arm_neon.h (vbfmmlaq_f32): New.
>> >    (vbfmlalbq_f32): New.
>> >    (vbfmlaltq_f32): New.
>> >    (vbfmlalbq_lane_f32): New.
>> >    (vbfmlaltq_lane_f32): New.
>> >      (vbfmlalbq_laneq_f32): New.
>> >    (vbfmlaltq_laneq_f32): New.
>> >  * config/arm/arm_neon_builtins.def (vbfmmla): New.
>> >    (vbfmab): New.
>> >    (vbfmat): New.
>> >    (vbfmab_lane): New.
>> >    (vbfmat_lane): New.
>> >    (vbfmab_laneq): New.
>> >    (vbfmat_laneq): New.
>> >   * config/arm/iterators.md (BF_MA): New int iterator.
>> >    (bt): New int attribute.
>> >    (VQXBF): Copy of VQX with V8BF.
>> >        (V_HALF): Added V8BF.
>> >    * config/arm/neon.md (neon_vbfmmlav8hi): New insn.
>> >    (neon_vbfmav8hi): New insn.
>> >    (neon_vbfma_lanev8hi): New insn.
>> >    (neon_vbfma_laneqv8hi): New expand.
>> >    (neon_vget_high): Changed iterator to VQXBF.
>> >  * config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC.
>> >    (UNSPEC_BFMAB): New UNSPEC.
>> >    (UNSPEC_BFMAT): New UNSPEC.
>> >
>> > 2019-11-12  Delia Burduv 
>> >
>> >      * gcc.target/arm/simd/bf16_ma_1.c: New test.
>> >      * gcc.target/arm/simd/bf16_ma_2.c: New test.
>> >      * gcc.target/arm/simd/bf16_mmla_1.c: New test.
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 3c78f435009ab027f92693d00ab5b40960d5419d..81f8008ea6a5fb11eb09f6685ba24bb0c54fb248 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18742,6 +18742,64 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b,
   return __builtin_neon_vcmla_lane270v4sf (__r, __a, __b, __index);
 }
 
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmmlav8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmabv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmatv8bf (__r, __a, __b);
+}
+
+__extension__ extern 

Re: [GCC][PATCH][AArch64] ACLE intrinsics for BFCVTN, BFCVTN2 (AArch64 AdvSIMD) and BFCVT (AArch64 FP)

2020-01-31 Thread Delia Burduv
Sorry for the confusion, what I meant to say was:

This patch adds the Armv8.6-a ACLE intrinsics for bfcvtn, bfcvtn2 and 
bfcvt as part of the BFloat16 extension.
(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)
The intrinsics are declared in arm_bf16.h and arm_neon.h and the RTL 
patterns are defined in aarch64-simd.md.

Tested for regression on aarch64-none-elf and aarch64_be-none-elf. I 
don't have commit rights, so if this is ok can someone please commit it 
for me?

Here is the updated patch.

Thank you,
Delia


gcc/ChangeLog:

2019-11-06  Delia Burduv  

 * config/aarch64/aarch64-simd-builtins.def
 (bfcvtn): New built-in function.
 (bfcvtn_q): New built-in function.
 (bfcvtn2): New built-in function.
 (bfcvt): New built-in function.
 * config/aarch64/aarch64-simd.md
 (aarch64_bfcvtn): New pattern.
 (aarch64_bfcvtn2v8bf): New pattern.
 (aarch64_bfcvtbf): New pattern.
 * config/aarch64/arm_bf16.h (float32_t): New typedef.
 (vcvth_bf16_f32): New intrinsic.
 * config/aarch64/arm_bf16.h (vcvt_bf16_f32): New intrinsic.
 (vcvtq_low_bf16_f32): New intrinsic.
 (vcvtq_high_bf16_f32): New intrinsic.
 * config/aarch64/iterators.md (V4SF_TO_BF): New mode iterator.
 (UNSPEC_BFCVTN): New UNSPEC.
 (UNSPEC_BFCVTN2): New UNSPEC.
 (UNSPEC_BFCVT): New UNSPEC.
 * config/arm/types.md (bf_cvt): New type.


gcc/testsuite/ChangeLog:

2020-01-31  Delia Burduv  

 * gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c: New
test.
* gcc.target/aarch64/advsimd-intrinsics/bfcvt-nobf16.c: New
test.
* gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c: New
test.
* gcc.target/aarch64/advsimd-intrinsics/bfcvtnq2-untied.c: New
test.

On 12/23/19 6:30 PM, Richard Sandiford wrote:
> Some of the comments on the BFMMLA/BFMLA[LT] patch apply here too.
> 
> Delia Burduv  writes:
>> This patch adds the Armv8.6-a ACLE intrinsics for bfmmla, bfmlalb and
>> bfmlalt as part of the BFloat16 extension.
> 
> That's the other patch :-)
> 
>> [...]
>> diff --git a/gcc/config/aarch64/aarch64-simd.md 
>> b/gcc/config/aarch64/aarch64-simd.md
>> index 
>> 55660ae248f4fa75d35ba2949cd4b9d5139ff5f5..ff7a1f5f34a19b05eba48dba96c736dfdfdf7bac
>>  100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -7027,3 +7027,32 @@
>> "xtn\t%0., %1."
>> [(set_attr "type" "neon_shift_imm_narrow_q")]
>>   )
>> +
>> +;; bfcvtn
>> +(define_insn "aarch64_bfcvtn"
>> +  [(set (match_operand:V4SF_TO_BF 0 "register_operand" "=w")
>> +(unspec:V4SF_TO_BF [(match_operand:V4SF 1 "register_operand" "w")]
>> +UNSPEC_BFCVTN))]
>> +  "TARGET_BF16_SIMD"
>> +  "bfcvtn\\t%0.4h, %1.4s"
>> +  [(set_attr "type" "f_cvt")]
>> +)
>> +
> 
> If I've understood the naming convention correctly, the closest type
> seems to be "neon_fp_cvt_narrow_s_q".
> 
>> +(define_insn "aarch64_bfcvtn2v8bf"
>> +  [(set (match_operand:V8BF 0 "register_operand" "=w")
>> +(unspec:V8BF [(match_operand:V8BF 1 "register_operand" "w")
>> +  (match_operand:V4SF 2 "register_operand" "w")]
>> +  UNSPEC_BFCVTN2))]
>> +  "TARGET_BF16_SIMD"
>> +  "bfcvtn2\\t%0.8h, %2.4s"
>> +  [(set_attr "type" "f_cvt")]
>> +)
> 
> Same here.
> 
> The constraint on operand 1 needs to be "0", otherwise operands 1 and 0
> could end up in different registers.  You could test for this using
> something like:
> 
> bfloat16x8_t test_bfcvtnq2_untied (bfloat16x8_t unused, bfloat16x8_t inactive,
>  float32x4_t a)
> {
>return vcvtq_high_bf16_f32 (inactive, a);
> }
> 
> which when compiled at -O should produce something like:
> 
> /*
> **test_bfcvtnq2_untied:
> **mov v0\.8h, v1\.8h
> **bfcvtn2 v0\.8h, v2\.4s
> **ret
> */
> 
> (Completely untested, the code above is probably wrong.)
> 
>> +
>> +(define_insn "aarch64_bfcvtbf"
>> +  [(set (match_operand:BF 0 "register_operand" "=w")
>> +(unspec:BF [(match_operand:SF 1 "register_operand" "w")]
>> +UNSPEC_BFCVT))]
>> +  "TARGET_BF16_SIMD"
> 
> I think this just needs the scalar macro rather than *_SIM

Re: [GCC][PATCH][AArch64] ACLE intrinsics bfmmla and bfmlal for AArch64 AdvSIMD

2020-01-31 Thread Delia Burduv

Thank you, Richard!

Here is the updated patch. The test that checks for errors when bf16 is 
disabled is in the bfcvt patch.

Cheers,
Delia

gcc/ChangeLog:

2019-11-06  Delia Burduv  

 * config/aarch64/aarch64-simd-builtins.def
 (bfcvtn): New built-in function.
 (bfcvtn_q): New built-in function.
 (bfcvtn2): New built-in function.
 (bfcvt): New built-in function.
 * config/aarch64/aarch64-simd.md
 (aarch64_bfcvtn): New pattern.
 (aarch64_bfcvtn2v8bf): New pattern.
 (aarch64_bfcvtbf): New pattern.
 * config/aarch64/arm_bf16.h (float32_t): New typedef.
 (vcvth_bf16_f32): New intrinsic.
 * config/aarch64/arm_bf16.h (vcvt_bf16_f32): New intrinsic.
 (vcvtq_low_bf16_f32): New intrinsic.
 (vcvtq_high_bf16_f32): New intrinsic.
 * config/aarch64/iterators.md (V4SF_TO_BF): New mode iterator.
 (UNSPEC_BFCVTN): New UNSPEC.
 (UNSPEC_BFCVTN2): New UNSPEC.
 (UNSPEC_BFCVT): New UNSPEC.
 * config/arm/types.md (bf_cvt): New type.


gcc/testsuite/ChangeLog:

2019-11-06  Delia Burduv  

 * gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c: New
test.
 * gcc.target/aarch64/advsimd-intrinsics/bfcvt-nobf16.c: New
test.
 * gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c: New
test.
 * gcc.target/aarch64/advsimd-intrinsics/bfcvtnq2-untied.c: New
test.


On 12/23/19 6:11 PM, Richard Sandiford wrote:
> Thanks for the patch, looks good.
> 
> Delia Burduv  writes:
>> This patch adds the ARMv8.6 ACLE intrinsics for bfmmla, bfmlalb and bfmlalt 
>> as part of the BFloat16 extension.
>> (https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)
>> The intrinsics are declared in arm_neon.h and the RTL patterns are defined 
>> in aarch64-simd.md.
>> Two new tests are added to check assembler output.
>>
>> This patch depends on the two Aarch64 back-end patches. 
>> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01323.html and 
>> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01324.html)
>>
>> Tested for regression on aarch64-none-elf and aarch64_be-none-elf. I don't 
>> have commit rights, so if this is ok can someone please commit it for me?
>>
>> gcc/ChangeLog:
>>
>> 2019-10-29  Delia Burduv  
>>
>>  * config/aarch64/aarch64-simd-builtins.def
>>(bfmmla): New built-in function.
>>(bfmlalb): New built-in function.
>>(bfmlalt): New built-in function.
>>(bfmlalb_lane): New built-in function.
>>(bfmlalt_lane): New built-in function.
>>(bfmlalb_laneq): New built-in function.
>>(bfmlalt_laneq): New built-in function.
>>  * config/aarch64/aarch64-simd.md (bfmmla): New pattern.
>>(bfmlal): New patterns.
>>  * config/aarch64/arm_neon.h (vbfmmlaq_f32): New intrinsic.
>>(vbfmlalbq_f32): New intrinsic.
>>(vbfmlaltq_f32): New intrinsic.
>>(vbfmlalbq_lane_f32): New intrinsic.
>>(vbfmlaltq_lane_f32): New intrinsic.
>>(vbfmlalbq_laneq_f32): New intrinsic.
>>(vbfmlaltq_laneq_f32): New intrinsic.
>>  * config/aarch64/iterators.md (UNSPEC_BFMMLA): New UNSPEC.
>>(UNSPEC_BFMLALB): New UNSPEC.
>>(UNSPEC_BFMLALT): New UNSPEC.
>>    (BF_MLA): New int iterator.
>>(bt): Added UNSPEC_BFMLALB, UNSPEC_BFMLALT.
>>  * config/arm/types.md (bf_mmla): New type.
>>(bf_mla): New type.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2019-10-29  Delia Burduv  
>>
>>  * gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c: New 
>> test.
>>  * gcc.target/aarch64/advsimd-intrinsics/bfmmla-compile.c: New test.
>>  * 
>> gcc.target/aarch64/advsimd-intrinsics/vbfmlalbt_lane_f32_indices_1.c:
>>New test.
> 
> Formatting nit: continuation lines should only be indented by a tab,
> rather than a tab and two spaces.  (I agree the above looks nicer,
> but the policy is not to be flexible over this kind of thing...)
> 
>> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def 
>> b/gcc/config/aarch64/aarch64-simd-builtins.def
>> index 
>> f4ca35a59704c761fe2ac2b6d401fff7c8aba80d..5e9f50f090870d0c63916540a48f5ac132d2630d
>>  100644
>> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
>> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
>> @@ -682,3 +682,14 @@
>> BUILTIN_VSFDF (UNOP, frint32x, 0)
>> BUILTIN_VSFDF (UNOP, frint64z, 0)
>>   

Re: ACLE intrinsics: BFloat16 load intrinsics for AArch32

2020-01-28 Thread Delia Burduv
Ping.

From: Delia Burduv 
Sent: 22 January 2020 17:31
To: gcc-patches@gcc.gnu.org 
Cc: ni...@redhat.com ; Richard Earnshaw 
; Kyrylo Tkachov ; Ramana 
Radhakrishnan 
Subject: Re: ACLE intrinsics: BFloat16 load intrinsics for AArch32

Ping.

I will change the tests to use the exact input and output registers as
Richard Sandiford suggested for the AArch64 patches.

On 12/20/19 6:48 PM, Delia Burduv wrote:
> This patch adds the ARMv8.6 ACLE BFloat16 load intrinsics vld{q}_bf16
> as part of the BFloat16 extension.
> (https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)
>
> The intrinsics are declared in arm_neon.h .
> A new test is added to check assembler output.
>
> This patch depends on the Arm back-end patch.
> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>
> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have
> commit rights, so if this is ok can someone please commit it for me?
>
> gcc/ChangeLog:
>
> 2019-11-14  Delia Burduv  
>
>  * config/arm/arm_neon.h (bfloat16_t): New typedef.
>  (bfloat16x4x2_t): New typedef.
>  (bfloat16x8x2_t): New typedef.
>  (bfloat16x4x3_t): New typedef.
>  (bfloat16x8x3_t): New typedef.
>  (bfloat16x4x4_t): New typedef.
>  (bfloat16x8x4_t): New typedef.
>  (vld2_bf16): New.
>  (vld2q_bf16): New.
>  (vld3_bf16): New.
>  (vld3q_bf16): New.
>  (vld4_bf16): New.
>  (vld4q_bf16): New.
>  (vld2_dup_bf16): New.
>  (vld2q_dup_bf16): New.
>   (vld3_dup_bf16): New.
>  (vld3q_dup_bf16): New.
>  (vld4_dup_bf16): New.
>  (vld4q_dup_bf16): New.
>  * config/arm/arm-builtins.c (E_V2BFmode): New mode.
>  (VAR13): New.
>  (arm_simd_types[Bfloat16x2_t]):New type.
>  * config/arm/arm-modes.def (V2BF): New mode.
>  * config/arm/arm-simd-builtin-types.def
>  (Bfloat16x2_t): New entry.
>  * config/arm/arm_neon_builtins.def
>  (vld2): Changed to VAR13 and added v4bf, v8bf
>  (vld2_dup): Changed to VAR8 and added v4bf, v8bf
>  (vld3): Changed to VAR13 and added v4bf, v8bf
>  (vld3_dup): Changed to VAR8 and added v4bf, v8bf
>  (vld4): Changed to VAR13 and added v4bf, v8bf
>  (vld4_dup): Changed to VAR8 and added v4bf, v8bf
>  * config/arm/iterators.md (VDXBF): New iterator.
>  (VQ2BF): New iterator.
>  (V_elem): Added V4BF, V8BF.
>  (V_sz_elem): Added V4BF, V8BF.
>  (V_mode_nunits): Added V4BF, V8BF.
>  (q): Added V4BF, V8BF.
>  *config/arm/neon.md (vld2): Used new iterators.
>  (vld2_dup): Used new iterators.
>  (vld2_dupv8bf): New.
>  (vst3): Used new iterators.
>  (vst3qa): Used new iterators.
>  (vst3qb): Used new iterators.
>  (vld3_dup): Used new iterators.
>  (vld3_dupv8bf): New.
>  (vst4): Used new iterators.
>  (vst4qa): Used new iterators.
>  (vst4qb): Used new iterators.
>  (vld4_dup): Used new iterators.
>  (vld4_dupv8bf): New.
>
>
> gcc/testsuite/ChangeLog:
>
> 2019-11-14  Delia Burduv  
>
>  * gcc.target/arm/simd/bf16_vldn_1.c: New test.


Re: ACLE intrinsics: BFloat16 store (vst{q}_bf16) intrinsics for AArch32

2020-01-28 Thread Delia Burduv
Ping.

From: Delia Burduv 
Sent: 22 January 2020 17:29
To: gcc-patches@gcc.gnu.org 
Cc: ni...@redhat.com ; Richard Earnshaw 
; Kyrylo Tkachov ; Ramana 
Radhakrishnan 
Subject: Re: ACLE intrinsics: BFloat16 store (vst{q}_bf16) intrinsics for 
AArch32

Ping.

I will change the tests to use the exact input and output registers as
Richard Sandiford suggested for the AArch64 patches.

On 12/20/19 6:46 PM, Delia Burduv wrote:
> This patch adds the ARMv8.6 ACLE BFloat16 store intrinsics
> vst{q}_bf16 as part of the BFloat16 extension.
> (https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)
>
> The intrinsics are declared in arm_neon.h .
> A new test is added to check assembler output.
>
> This patch depends on the Arm back-end patch.
> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>
> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have
> commit rights, so if this is ok can someone please commit it for me?
>
> gcc/ChangeLog:
>
> 2019-11-14  Delia Burduv  
>
>  * config/arm/arm_neon.h (bfloat16_t): New typedef.
>  (bfloat16x4x2_t): New typedef.
>  (bfloat16x8x2_t): New typedef.
>  (bfloat16x4x3_t): New typedef.
>  (bfloat16x8x3_t): New typedef.
>  (bfloat16x4x4_t): New typedef.
>  (bfloat16x8x4_t): New typedef.
>  (vst2_bf16): New.
>  (vst2q_bf16): New.
>  (vst3_bf16): New.
>  (vst3q_bf16): New.
>  (vst4_bf16): New.
>  (vst4q_bf16): New.
>  * config/arm/arm-builtins.c (E_V2BFmode): New mode.
>  (VAR13): New.
>  (arm_simd_types[Bfloat16x2_t]):New type.
>  * config/arm/arm-modes.def (V2BF): New mode.
>  * config/arm/arm-simd-builtin-types.def
>  (Bfloat16x2_t): New entry.
>  * config/arm/arm_neon_builtins.def
>  (vst2): Changed to VAR13 and added v4bf, v8bf
>  (vst3): Changed to VAR13 and added v4bf, v8bf
>  (vst4): Changed to VAR13 and added v4bf, v8bf
>  * config/arm/iterators.md (VDXBF): New iterator.
>  (VQ2BF): New iterator.
>  (V_elem): Added V4BF, V8BF.
>  (V_sz_elem): Added V4BF, V8BF.
>  (V_mode_nunits): Added V4BF, V8BF.
>  (q): Added V4BF, V8BF.
>  *config/arm/neon.md (vst2): Used new iterators.
>  (vst3): Used new iterators.
>  (vst3qa): Used new iterators.
>  (vst3qb): Used new iterators.
>  (vst4): Used new iterators.
>  (vst4qa): Used new iterators.
>  (vst4qb): Used new iterators.
>
>
> gcc/testsuite/ChangeLog:
>
> 2019-11-14  Delia Burduv  
>
>  * gcc.target/arm/simd/bf16_vstn_1.c: New test.


Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma for AArch32 AdvSIMD

2020-01-28 Thread Delia Burduv
Ping.

From: Delia Burduv 
Sent: 22 January 2020 17:26
To: gcc-patches@gcc.gnu.org 
Cc: ni...@redhat.com ; Richard Earnshaw 
; Ramana Radhakrishnan 
; Kyrylo Tkachov 
Subject: Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma 
for AArch32 AdvSIMD

Ping.

I have read Richard Sandiford's comments on the AArch64 patches and I
will apply what is relevant to this patch as well. Particularly, I will
change the tests to use the exact input and output registers and I will
change the types of the rtl patterns.

On 12/20/19 6:44 PM, Delia Burduv wrote:
> This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and vfmat
> as part of the BFloat16 extension.
> (https://developer.arm.com/docs/101028/latest.)
> The intrinsics are declared in arm_neon.h and the RTL patterns are
> defined in neon.md.
> Two new tests are added to check assembler output and lane indices.
>
> This patch depends on the Arm back-end patch.
> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
>
> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have
> commit rights, so if this is ok can someone please commit it for me?
>
> gcc/ChangeLog:
>
> 2019-11-12  Delia Burduv  
>
>  * config/arm/arm_neon.h (vbfmmlaq_f32): New.
>(vbfmlalbq_f32): New.
>(vbfmlaltq_f32): New.
>(vbfmlalbq_lane_f32): New.
>(vbfmlaltq_lane_f32): New.
>  (vbfmlalbq_laneq_f32): New.
>(vbfmlaltq_laneq_f32): New.
>  * config/arm/arm_neon_builtins.def (vbfmmla): New.
>(vbfmab): New.
>(vbfmat): New.
>(vbfmab_lane): New.
>(vbfmat_lane): New.
>(vbfmab_laneq): New.
>(vbfmat_laneq): New.
>   * config/arm/iterators.md (BF_MA): New int iterator.
>(bt): New int attribute.
>(VQXBF): Copy of VQX with V8BF.
>(V_HALF): Added V8BF.
>* config/arm/neon.md (neon_vbfmmlav8hi): New insn.
>(neon_vbfmav8hi): New insn.
>(neon_vbfma_lanev8hi): New insn.
>(neon_vbfma_laneqv8hi): New expand.
>(neon_vget_high): Changed iterator to VQXBF.
>  * config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC.
>(UNSPEC_BFMAB): New UNSPEC.
>(UNSPEC_BFMAT): New UNSPEC.
>
> 2019-11-12  Delia Burduv  
>
>  * gcc.target/arm/simd/bf16_ma_1.c: New test.
>  * gcc.target/arm/simd/bf16_ma_2.c: New test.
>  * gcc.target/arm/simd/bf16_mmla_1.c: New test.


Re: ACLE intrinsics: BFloat16 load intrinsics for AArch32

2020-01-22 Thread Delia Burduv
Ping.

I will change the tests to use the exact input and output registers as 
Richard Sandiford suggested for the AArch64 patches.

On 12/20/19 6:48 PM, Delia Burduv wrote:
> This patch adds the ARMv8.6 ACLE BFloat16 load intrinsics vld{q}_bf16 
> as part of the BFloat16 extension.
> (https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)
>  
> 
> The intrinsics are declared in arm_neon.h .
> A new test is added to check assembler output.
> 
> This patch depends on the Arm back-end patch.
> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
> 
> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have 
> commit rights, so if this is ok can someone please commit it for me?
> 
> gcc/ChangeLog:
> 
> 2019-11-14  Delia Burduv  
> 
>  * config/arm/arm_neon.h (bfloat16_t): New typedef.
>      (bfloat16x4x2_t): New typedef.
>      (bfloat16x8x2_t): New typedef.
>      (bfloat16x4x3_t): New typedef.
>      (bfloat16x8x3_t): New typedef.
>      (bfloat16x4x4_t): New typedef.
>      (bfloat16x8x4_t): New typedef.
>      (vld2_bf16): New.
>  (vld2q_bf16): New.
>  (vld3_bf16): New.
>  (vld3q_bf16): New.
>  (vld4_bf16): New.
>  (vld4q_bf16): New.
>  (vld2_dup_bf16): New.
>  (vld2q_dup_bf16): New.
>   (vld3_dup_bf16): New.
>  (vld3q_dup_bf16): New.
>  (vld4_dup_bf16): New.
>  (vld4q_dup_bf16): New.
>      * config/arm/arm-builtins.c (E_V2BFmode): New mode.
>      (VAR13): New.
>      (arm_simd_types[Bfloat16x2_t]):New type.
>      * config/arm/arm-modes.def (V2BF): New mode.
>      * config/arm/arm-simd-builtin-types.def
>      (Bfloat16x2_t): New entry.
>      * config/arm/arm_neon_builtins.def
>      (vld2): Changed to VAR13 and added v4bf, v8bf
>      (vld2_dup): Changed to VAR8 and added v4bf, v8bf
>      (vld3): Changed to VAR13 and added v4bf, v8bf
>      (vld3_dup): Changed to VAR8 and added v4bf, v8bf
>      (vld4): Changed to VAR13 and added v4bf, v8bf
>      (vld4_dup): Changed to VAR8 and added v4bf, v8bf
>      * config/arm/iterators.md (VDXBF): New iterator.
>      (VQ2BF): New iterator.
>      (V_elem): Added V4BF, V8BF.
>      (V_sz_elem): Added V4BF, V8BF.
>      (V_mode_nunits): Added V4BF, V8BF.
>      (q): Added V4BF, V8BF.
>      *config/arm/neon.md (vld2): Used new iterators.
>      (vld2_dup): Used new iterators.
>      (vld2_dupv8bf): New.
>      (vst3): Used new iterators.
>      (vst3qa): Used new iterators.
>      (vst3qb): Used new iterators.
>      (vld3_dup): Used new iterators.
>      (vld3_dupv8bf): New.
>      (vst4): Used new iterators.
>      (vst4qa): Used new iterators.
>      (vst4qb): Used new iterators.
>      (vld4_dup): Used new iterators.
>      (vld4_dupv8bf): New.
> 
> 
> gcc/testsuite/ChangeLog:
> 
> 2019-11-14  Delia Burduv  
> 
>  * gcc.target/arm/simd/bf16_vldn_1.c: New test.


Re: ACLE intrinsics: BFloat16 store (vst{q}_bf16) intrinsics for AArch32

2020-01-22 Thread Delia Burduv
Ping.

I will change the tests to use the exact input and output registers as 
Richard Sandiford suggested for the AArch64 patches.

On 12/20/19 6:46 PM, Delia Burduv wrote:
> This patch adds the ARMv8.6 ACLE BFloat16 store intrinsics 
> vst{q}_bf16 as part of the BFloat16 extension.
> (https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)
>  
> 
> The intrinsics are declared in arm_neon.h .
> A new test is added to check assembler output.
> 
> This patch depends on the Arm back-end patch.
> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
> 
> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have 
> commit rights, so if this is ok can someone please commit it for me?
> 
> gcc/ChangeLog:
> 
> 2019-11-14  Delia Burduv  
> 
>  * config/arm/arm_neon.h (bfloat16_t): New typedef.
>      (bfloat16x4x2_t): New typedef.
>      (bfloat16x8x2_t): New typedef.
>      (bfloat16x4x3_t): New typedef.
>      (bfloat16x8x3_t): New typedef.
>      (bfloat16x4x4_t): New typedef.
>      (bfloat16x8x4_t): New typedef.
>      (vst2_bf16): New.
>  (vst2q_bf16): New.
>  (vst3_bf16): New.
>  (vst3q_bf16): New.
>  (vst4_bf16): New.
>  (vst4q_bf16): New.
>      * config/arm/arm-builtins.c (E_V2BFmode): New mode.
>      (VAR13): New.
>      (arm_simd_types[Bfloat16x2_t]):New type.
>      * config/arm/arm-modes.def (V2BF): New mode.
>      * config/arm/arm-simd-builtin-types.def
>      (Bfloat16x2_t): New entry.
>      * config/arm/arm_neon_builtins.def
>      (vst2): Changed to VAR13 and added v4bf, v8bf
>      (vst3): Changed to VAR13 and added v4bf, v8bf
>      (vst4): Changed to VAR13 and added v4bf, v8bf
>      * config/arm/iterators.md (VDXBF): New iterator.
>      (VQ2BF): New iterator.
>      (V_elem): Added V4BF, V8BF.
>      (V_sz_elem): Added V4BF, V8BF.
>      (V_mode_nunits): Added V4BF, V8BF.
>      (q): Added V4BF, V8BF.
>      *config/arm/neon.md (vst2): Used new iterators.
>      (vst3): Used new iterators.
>      (vst3qa): Used new iterators.
>      (vst3qb): Used new iterators.
>      (vst4): Used new iterators.
>      (vst4qa): Used new iterators.
>      (vst4qb): Used new iterators.
> 
> 
> gcc/testsuite/ChangeLog:
> 
> 2019-11-14  Delia Burduv  
> 
>  * gcc.target/arm/simd/bf16_vstn_1.c: New test.


Re: [GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma for AArch32 AdvSIMD

2020-01-22 Thread Delia Burduv
Ping.

I have read Richard Sandiford's comments on the AArch64 patches and I 
will apply what is relevant to this patch as well. Particularly, I will 
change the tests to use the exact input and output registers and I will 
change the types of the rtl patterns.

On 12/20/19 6:44 PM, Delia Burduv wrote:
> This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and vfmat 
> as part of the BFloat16 extension.
> (https://developer.arm.com/docs/101028/latest.)
> The intrinsics are declared in arm_neon.h and the RTL patterns are 
> defined in neon.md.
> Two new tests are added to check assembler output and lane indices.
> 
> This patch depends on the Arm back-end patch.
> (https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)
> 
> Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have 
> commit rights, so if this is ok can someone please commit it for me?
> 
> gcc/ChangeLog:
> 
> 2019-11-12  Delia Burduv  
> 
>  * config/arm/arm_neon.h (vbfmmlaq_f32): New.
>    (vbfmlalbq_f32): New.
>    (vbfmlaltq_f32): New.
>    (vbfmlalbq_lane_f32): New.
>    (vbfmlaltq_lane_f32): New.
>      (vbfmlalbq_laneq_f32): New.
>    (vbfmlaltq_laneq_f32): New.
>  * config/arm/arm_neon_builtins.def (vbfmmla): New.
>    (vbfmab): New.
>    (vbfmat): New.
>    (vbfmab_lane): New.
>    (vbfmat_lane): New.
>    (vbfmab_laneq): New.
>    (vbfmat_laneq): New.
>   * config/arm/iterators.md (BF_MA): New int iterator.
>    (bt): New int attribute.
>    (VQXBF): Copy of VQX with V8BF.
>    (V_HALF): Added V8BF.
>    * config/arm/neon.md (neon_vbfmmlav8hi): New insn.
>    (neon_vbfmav8hi): New insn.
>    (neon_vbfma_lanev8hi): New insn.
>    (neon_vbfma_laneqv8hi): New expand.
>    (neon_vget_high): Changed iterator to VQXBF.
>  * config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC.
>    (UNSPEC_BFMAB): New UNSPEC.
>    (UNSPEC_BFMAT): New UNSPEC.
> 
> 2019-11-12  Delia Burduv  
> 
>      * gcc.target/arm/simd/bf16_ma_1.c: New test.
>      * gcc.target/arm/simd/bf16_ma_2.c: New test.
>      * gcc.target/arm/simd/bf16_mmla_1.c: New test.


ACLE intrinsics: BFloat16 load intrinsics for AArch32

2019-12-20 Thread Delia Burduv
This patch adds the ARMv8.6 ACLE BFloat16 load intrinsics vld{q}_bf16 
as part of the BFloat16 extension.
(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)
The intrinsics are declared in arm_neon.h .
A new test is added to check assembler output.

This patch depends on the Arm back-end patch.
(https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)

Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have 
commit rights, so if this is ok can someone please commit it for me?

gcc/ChangeLog:

2019-11-14  Delia Burduv  

* config/arm/arm_neon.h (bfloat16_t): New typedef.
 (bfloat16x4x2_t): New typedef.
 (bfloat16x8x2_t): New typedef.
 (bfloat16x4x3_t): New typedef.
 (bfloat16x8x3_t): New typedef.
 (bfloat16x4x4_t): New typedef.
 (bfloat16x8x4_t): New typedef.
 (vld2_bf16): New.
(vld2q_bf16): New.
(vld3_bf16): New.
(vld3q_bf16): New.
(vld4_bf16): New.
(vld4q_bf16): New.
(vld2_dup_bf16): New.
(vld2q_dup_bf16): New.
(vld3_dup_bf16): New.
(vld3q_dup_bf16): New.
(vld4_dup_bf16): New.
(vld4q_dup_bf16): New.
 * config/arm/arm-builtins.c (E_V2BFmode): New mode.
 (VAR13): New.
 (arm_simd_types[Bfloat16x2_t]): New type.
 * config/arm/arm-modes.def (V2BF): New mode.
 * config/arm/arm-simd-builtin-types.def
 (Bfloat16x2_t): New entry.
 * config/arm/arm_neon_builtins.def
 (vld2): Changed to VAR13 and added v4bf, v8bf
 (vld2_dup): Changed to VAR8 and added v4bf, v8bf
 (vld3): Changed to VAR13 and added v4bf, v8bf
 (vld3_dup): Changed to VAR8 and added v4bf, v8bf
 (vld4): Changed to VAR13 and added v4bf, v8bf
 (vld4_dup): Changed to VAR8 and added v4bf, v8bf
 * config/arm/iterators.md (VDXBF): New iterator.
 (VQ2BF): New iterator.
 (V_elem): Added V4BF, V8BF.
 (V_sz_elem): Added V4BF, V8BF.
 (V_mode_nunits): Added V4BF, V8BF.
 (q): Added V4BF, V8BF.
 * config/arm/neon.md (vld2): Used new iterators.
 (vld2_dup): Used new iterators.
 (vld2_dupv8bf): New.
 (vst3): Used new iterators.
 (vst3qa): Used new iterators.
 (vst3qb): Used new iterators.
 (vld3_dup): Used new iterators.
 (vld3_dupv8bf): New.
 (vst4): Used new iterators.
 (vst4qa): Used new iterators.
 (vst4qb): Used new iterators.
 (vld4_dup): Used new iterators.
 (vld4_dupv8bf): New.


gcc/testsuite/ChangeLog:

2019-11-14  Delia Burduv  

* gcc.target/arm/simd/bf16_vldn_1.c: New test.
diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index df09a6bb1fce5f9216337d71cba51a890fd57baf..551d76a44fadc58a35a6155486ec1fb16c959da0 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -318,6 +318,7 @@ arm_set_sat_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define v4bf_UP  E_V4BFmode
 #define v2si_UP  E_V2SImode
 #define v2sf_UP  E_V2SFmode
+#define v2bf_UP  E_V2BFmode
 #define di_UPE_DImode
 #define v16qi_UP E_V16QImode
 #define v8hi_UP  E_V8HImode
@@ -381,6 +382,9 @@ typedef struct {
 #define VAR12(T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
   VAR11 (T, N, A, B, C, D, E, F, G, H, I, J, K) \
   VAR1 (T, N, L)
+#define VAR13(T, N, A, B, C, D, E, F, G, H, I, J, K, L, M) \
+  VAR12 (T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
+  VAR1 (T, N, M)
 
 /* The builtin data can be found in arm_neon_builtins.def, arm_vfp_builtins.def
and arm_acle_builtins.def.  The entries in arm_neon_builtins.def require
@@ -1013,6 +1017,7 @@ arm_init_simd_builtin_types (void)
   arm_simd_types[Float32x4_t].eltype = float_type_node;
 
   /* Init Bfloat vector types with underlying __bf16 scalar type.  */
+  arm_simd_types[Bfloat16x2_t].eltype = arm_bf16_type_node;
   arm_simd_types[Bfloat16x4_t].eltype = arm_bf16_type_node;
   arm_simd_types[Bfloat16x8_t].eltype = arm_bf16_type_node;
 
diff --git a/gcc/config/arm/arm-modes.def b/gcc/config/arm/arm-modes.def
index 80c3c1a6eb258d116b07ad71fafafc9befb76e8b..9533d177059d98fa2a9e9d1d6321f3d92dad7592 100644
--- a/gcc/config/arm/arm-modes.def
+++ b/gcc/config/arm/arm-modes.def
@@ -80,6 +80,7 @@ VECTOR_MODE (FLOAT, HF, 2);   /* V2HF */
 
 FLOAT_MODE (BF, 2, 0);
 ADJUST_FLOAT_FORMAT (BF, _bfloat_half_format);
+VECTOR_MODE (FLOAT, BF, 2);   /* V2BF.  */
 VECTOR_MODE (FLOAT, BF, 4);   /*		 V4BF.  */
 VECTOR_MODE (FLOAT, BF, 8);   /*		 V8BF.  */
 
diff --git a/gcc/config/arm/arm-simd-builtin-types.def b/gcc/config/arm/arm-simd-builtin-types.def
index ee240f85c5618417fff039ec43b81641b187c126..f52f679156d5041ab109909393dc37fda33a390d 100644
--- a/gcc/config/arm/arm-simd-builtin-types.def
+++ b/gcc/config/arm/arm-simd-builtin-types.def
@@ -48,5 +48,6 @@
   ENTRY (Float16x8_t, V8HF, none, 128, float16, 19)
   ENTRY (Float32x4_t

ACLE intrinsics: BFloat16 store (vst{q}_bf16) intrinsics for AArch32

2019-12-20 Thread Delia Burduv
This patch adds the ARMv8.6 ACLE BFloat16 store intrinsics 
vst{q}_bf16 as part of the BFloat16 extension.
(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)
The intrinsics are declared in arm_neon.h .
A new test is added to check assembler output.

This patch depends on the Arm back-end patch.
(https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)

Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have 
commit rights, so if this is ok can someone please commit it for me?

gcc/ChangeLog:

2019-11-14  Delia Burduv  

* config/arm/arm_neon.h (bfloat16_t): New typedef.
 (bfloat16x4x2_t): New typedef.
 (bfloat16x8x2_t): New typedef.
 (bfloat16x4x3_t): New typedef.
 (bfloat16x8x3_t): New typedef.
 (bfloat16x4x4_t): New typedef.
 (bfloat16x8x4_t): New typedef.
 (vst2_bf16): New.
(vst2q_bf16): New.
(vst3_bf16): New.
(vst3q_bf16): New.
(vst4_bf16): New.
(vst4q_bf16): New.
 * config/arm/arm-builtins.c (E_V2BFmode): New mode.
 (VAR13): New.
 (arm_simd_types[Bfloat16x2_t]): New type.
 * config/arm/arm-modes.def (V2BF): New mode.
 * config/arm/arm-simd-builtin-types.def
 (Bfloat16x2_t): New entry.
 * config/arm/arm_neon_builtins.def
 (vst2): Changed to VAR13 and added v4bf, v8bf
 (vst3): Changed to VAR13 and added v4bf, v8bf
 (vst4): Changed to VAR13 and added v4bf, v8bf
 * config/arm/iterators.md (VDXBF): New iterator.
 (VQ2BF): New iterator.
 (V_elem): Added V4BF, V8BF.
 (V_sz_elem): Added V4BF, V8BF.
 (V_mode_nunits): Added V4BF, V8BF.
 (q): Added V4BF, V8BF.
 * config/arm/neon.md (vst2): Used new iterators.
 (vst3): Used new iterators.
 (vst3qa): Used new iterators.
 (vst3qb): Used new iterators.
 (vst4): Used new iterators.
 (vst4qa): Used new iterators.
 (vst4qb): Used new iterators.


gcc/testsuite/ChangeLog:

2019-11-14  Delia Burduv  

* gcc.target/arm/simd/bf16_vstn_1.c: New test.
diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index df09a6bb1fce5f9216337d71cba51a890fd57baf..551d76a44fadc58a35a6155486ec1fb16c959da0 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -318,6 +318,7 @@ arm_set_sat_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define v4bf_UP  E_V4BFmode
 #define v2si_UP  E_V2SImode
 #define v2sf_UP  E_V2SFmode
+#define v2bf_UP  E_V2BFmode
 #define di_UPE_DImode
 #define v16qi_UP E_V16QImode
 #define v8hi_UP  E_V8HImode
@@ -381,6 +382,9 @@ typedef struct {
 #define VAR12(T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
   VAR11 (T, N, A, B, C, D, E, F, G, H, I, J, K) \
   VAR1 (T, N, L)
+#define VAR13(T, N, A, B, C, D, E, F, G, H, I, J, K, L, M) \
+  VAR12 (T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
+  VAR1 (T, N, M)
 
 /* The builtin data can be found in arm_neon_builtins.def, arm_vfp_builtins.def
and arm_acle_builtins.def.  The entries in arm_neon_builtins.def require
@@ -1013,6 +1017,7 @@ arm_init_simd_builtin_types (void)
   arm_simd_types[Float32x4_t].eltype = float_type_node;
 
   /* Init Bfloat vector types with underlying __bf16 scalar type.  */
+  arm_simd_types[Bfloat16x2_t].eltype = arm_bf16_type_node;
   arm_simd_types[Bfloat16x4_t].eltype = arm_bf16_type_node;
   arm_simd_types[Bfloat16x8_t].eltype = arm_bf16_type_node;
 
diff --git a/gcc/config/arm/arm-modes.def b/gcc/config/arm/arm-modes.def
index 80c3c1a6eb258d116b07ad71fafafc9befb76e8b..9533d177059d98fa2a9e9d1d6321f3d92dad7592 100644
--- a/gcc/config/arm/arm-modes.def
+++ b/gcc/config/arm/arm-modes.def
@@ -80,6 +80,7 @@ VECTOR_MODE (FLOAT, HF, 2);   /* V2HF */
 
 FLOAT_MODE (BF, 2, 0);
 ADJUST_FLOAT_FORMAT (BF, _bfloat_half_format);
+VECTOR_MODE (FLOAT, BF, 2);   /* V2BF.  */
 VECTOR_MODE (FLOAT, BF, 4);   /*		 V4BF.  */
 VECTOR_MODE (FLOAT, BF, 8);   /*		 V8BF.  */
 
diff --git a/gcc/config/arm/arm-simd-builtin-types.def b/gcc/config/arm/arm-simd-builtin-types.def
index ee240f85c5618417fff039ec43b81641b187c126..f52f679156d5041ab109909393dc37fda33a390d 100644
--- a/gcc/config/arm/arm-simd-builtin-types.def
+++ b/gcc/config/arm/arm-simd-builtin-types.def
@@ -48,5 +48,6 @@
   ENTRY (Float16x8_t, V8HF, none, 128, float16, 19)
   ENTRY (Float32x4_t, V4SF, none, 128, float32, 19)
 
+  ENTRY (Bfloat16x2_t, V2BF, none, 32, bfloat16, 20)
   ENTRY (Bfloat16x4_t, V4BF, none, 64, bfloat16, 20)
   ENTRY (Bfloat16x8_t, V8BF, none, 128, bfloat16, 20)
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 71e7568e4315a9354062dee5442ca4af9d9660a9..2bed33800facb65c20ea95646a5c4053dd5673de 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -91,6 +91,85 @@ typedef float float32_t;
 #ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
 typedef __simd128_bfloat16_t bfloat16x8_t;
 typedef

[GCC][PATCH][AArch32] ACLE intrinsics bfloat16 vmmla and vfma for AArch32 AdvSIMD

2019-12-20 Thread Delia Burduv
This patch adds the ARMv8.6 ACLE intrinsics for vmmla, vfmab and vfmat 
as part of the BFloat16 extension.
(https://developer.arm.com/docs/101028/latest.)
The intrinsics are declared in arm_neon.h and the RTL patterns are 
defined in neon.md.
Two new tests are added to check assembler output and lane indices.

This patch depends on the Arm back-end patch.
(https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html)

Tested for regression on arm-none-eabi and armeb-none-eabi. I don't have 
commit rights, so if this is ok can someone please commit it for me?

gcc/ChangeLog:

2019-11-12  Delia Burduv  

* config/arm/arm_neon.h (vbfmmlaq_f32): New.
  (vbfmlalbq_f32): New.
  (vbfmlaltq_f32): New.
  (vbfmlalbq_lane_f32): New.
  (vbfmlaltq_lane_f32): New.
  (vbfmlalbq_laneq_f32): New.
  (vbfmlaltq_laneq_f32): New.
* config/arm/arm_neon_builtins.def (vbfmmla): New.
   (vbfmab): New.
   (vbfmat): New.
   (vbfmab_lane): New.
   (vbfmat_lane): New.
   (vbfmab_laneq): New.
   (vbfmat_laneq): New.
* config/arm/iterators.md (BF_MA): New int iterator.
   (bt): New int attribute.
   (VQXBF): Copy of VQX with V8BF.
   (V_HALF): Added V8BF.
* config/arm/neon.md (neon_vbfmmlav8hi): New insn.
   (neon_vbfmav8hi): New insn.
   (neon_vbfma_lanev8hi): New insn.
   (neon_vbfma_laneqv8hi): New expand.
   (neon_vget_high): Changed iterator to VQXBF.
* config/arm/unspecs.md (UNSPEC_BFMMLA): New UNSPEC.
   (UNSPEC_BFMAB): New UNSPEC.
   (UNSPEC_BFMAT): New UNSPEC.

2019-11-12  Delia Burduv  

 * gcc.target/arm/simd/bf16_ma_1.c: New test.
 * gcc.target/arm/simd/bf16_ma_2.c: New test.
 * gcc.target/arm/simd/bf16_mmla_1.c: New test.
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 71e7568e4315a9354062dee5442ca4af9d9660a9..097d7bb30ad0109ca2f41885206b1cfb2ce962dc 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -91,6 +91,60 @@ typedef float float32_t;
 #ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
 typedef __simd128_bfloat16_t bfloat16x8_t;
 typedef __simd64_bfloat16_t bfloat16x4_t;
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmmlav8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmabv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfmatv8bf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		const int __index)
+{
+  return __builtin_neon_vbfmab_lanev8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		const int __index)
+{
+  return __builtin_neon_vbfmat_lanev8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		 const int __index)
+{
+  return __builtin_neon_vbfmab_laneqv8bf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		 const int __index)
+{
+  return __builtin_neon_vbfmat_laneqv8bf (__r, __a, __b, __index);
+}
+
 #endif
 #pragma GCC pop_options
 #pragma GCC pop_options
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index bcccf93f7fa2750e9006e5856efecbec0fb331b9..169781fa9a07930eb755165019427be055dc36ef 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -373,3 +373,12 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane0, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane90, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane180, v4sf, v8hf)
 VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf)
+
+VAR1 (TERNOP, vbfmmla, v8bf)
+
+VAR1 (TERNOP, vbfmab, v8bf)
+VAR1 (TERNOP, vbfmat, v8bf)
+VAR1 (MAC_LANE, vbfmab_lane, v8bf)
+VAR1 (MAC_LANE, vbfmat_lane, v8bf)
+VAR1 (MAC_LANE, vbfmab_laneq, v8bf)
+VAR1 (MAC_LANE, vbfmat_laneq, v8bf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm

[GCC][PATCH][AArch64] ACLE intrinsics for BFCVTN, BFCVTN2 (AArch64 AdvSIMD) and BFCVT (AArch64 FP)

2019-12-20 Thread Delia Burduv
This patch adds the Armv8.6-a ACLE intrinsics for BFCVTN, BFCVTN2 and 
BFCVT as part of the BFloat16 extension.
(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)
The intrinsics are declared in arm_bf16.h and arm_neon.h and the RTL 
patterns are defined in aarch64-simd.md.
A new test is added to check assembler output.

This patch depends on the two Aarch64 back-end patches. 
(https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01323.html and 
https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01324.html)

Tested for regression on aarch64-none-elf and aarch64_be-none-elf. I 
don't have commit rights, so if this is ok can someone please commit it 
for me?

gcc/ChangeLog:

2019-11-06  Delia Burduv  

 * config/aarch64/aarch64-simd-builtins.def
   (bfcvtn): New built-in function.
   (bfcvtn_q): New built-in function.
   (bfcvtn2): New built-in function.
   (bfcvt): New built-in function.
 * config/aarch64/aarch64-simd.md
   (aarch64_bfcvtn): New pattern.
   (aarch64_bfcvtn2v8bf): New pattern.
   (aarch64_bfcvtbf): New pattern.
 * config/aarch64/arm_bf16.h (float32_t): New typedef.
   (vcvth_bf16_f32): New intrinsic.
 * config/aarch64/arm_bf16.h (vcvt_bf16_f32): New intrinsic.
   (vcvtq_low_bf16_f32): New intrinsic.
   (vcvtq_high_bf16_f32): New intrinsic.
 * config/aarch64/iterators.md (V4SF_TO_BF): New mode iterator.
   (UNSPEC_BFCVTN): New UNSPEC.
   (UNSPEC_BFCVTN2): New UNSPEC.
   (UNSPEC_BFCVT): New UNSPEC.
 * config/arm/types.md (bf_cvt): New type.


gcc/testsuite/ChangeLog:

2019-11-06  Delia Burduv  

 * gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c: New test.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index f4ca35a59704c761fe2ac2b6d401fff7c8aba80d..30a425bd3aec121e78f269f44e188bdb8d39e75f 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -682,3 +682,9 @@
   BUILTIN_VSFDF (UNOP, frint32x, 0)
   BUILTIN_VSFDF (UNOP, frint64z, 0)
   BUILTIN_VSFDF (UNOP, frint64x, 0)
+
+  /* Implemented by aarch64_bfcvtn{q}{2}  */
+  VAR1 (UNOP, bfcvtn, 0, v4bf)
+  VAR1 (UNOP, bfcvtn_q, 0, v8bf)
+  VAR1 (BINOP, bfcvtn2, 0, v8bf)
+  VAR1 (UNOP, bfcvt, 0, bf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 55660ae248f4fa75d35ba2949cd4b9d5139ff5f5..ff7a1f5f34a19b05eba48dba96c736dfdfdf7bac 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7027,3 +7027,32 @@
   "xtn\t%0., %1."
   [(set_attr "type" "neon_shift_imm_narrow_q")]
 )
+
+;; bfcvtn
+(define_insn "aarch64_bfcvtn"
+  [(set (match_operand:V4SF_TO_BF 0 "register_operand" "=w")
+(unspec:V4SF_TO_BF [(match_operand:V4SF 1 "register_operand" "w")]
+UNSPEC_BFCVTN))]
+  "TARGET_BF16_SIMD"
+  "bfcvtn\\t%0.4h, %1.4s"
+  [(set_attr "type" "f_cvt")]
+)
+
+(define_insn "aarch64_bfcvtn2v8bf"
+  [(set (match_operand:V8BF 0 "register_operand" "=w")
+(unspec:V8BF [(match_operand:V8BF 1 "register_operand" "w")
+  (match_operand:V4SF 2 "register_operand" "w")]
+  UNSPEC_BFCVTN2))]
+  "TARGET_BF16_SIMD"
+  "bfcvtn2\\t%0.8h, %2.4s"
+  [(set_attr "type" "f_cvt")]
+)
+
+(define_insn "aarch64_bfcvtbf"
+  [(set (match_operand:BF 0 "register_operand" "=w")
+(unspec:BF [(match_operand:SF 1 "register_operand" "w")]
+UNSPEC_BFCVT))]
+  "TARGET_BF16_SIMD"
+  "bfcvt\\t%h0, %s1"
+  [(set_attr "type" "f_cvt")]
+)
diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
index aedb0972735ce549fac1870bacd1ef3101e8fd26..1b9ab3690d35e153cd4f24b9e3bbb5b4cc4b4f4d 100644
--- a/gcc/config/aarch64/arm_bf16.h
+++ b/gcc/config/aarch64/arm_bf16.h
@@ -34,7 +34,15 @@
 #ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
 
 typedef __bf16 bfloat16_t;
-
+typedef float float32_t;
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvth_bf16_f32 \
+  (float32_t __a)
+{
+  return __builtin_aarch64_bfcvtbf (__a);
+}
 
 #endif
 #pragma GCC pop_options
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 6cdbf381f0156ed993f03b847228b36ebbdd14f8..120f4b7d8827aee51834e75aeaa6ab8f8451980e 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34610,6 +34610,35 @@ vrnd64xq_f64 (float64x2_t __a)
 
 #include "arm_bf16.h"
 
+#pragma GCC push_options
+#pragma GCC target (

[GCC][PATCH][AArch64] ACLE intrinsics bfmmla and bfmlal for AArch64 AdvSIMD

2019-12-20 Thread Delia Burduv
This patch adds the ARMv8.6 ACLE intrinsics for bfmmla, bfmlalb and 
bfmlalt as part of the BFloat16 extension.
(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)
The intrinsics are declared in arm_neon.h and the RTL patterns are 
defined in aarch64-simd.md.
Two new tests are added to check assembler output.

This patch depends on the two Aarch64 back-end patches. 
(https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01323.html and 
https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01324.html)

Tested for regression on aarch64-none-elf and aarch64_be-none-elf. I 
don't have commit rights, so if this is ok can someone please commit it 
for me?

gcc/ChangeLog:

2019-10-29  Delia Burduv  

 * config/aarch64/aarch64-simd-builtins.def
   (bfmmla): New built-in function.
   (bfmlalb): New built-in function.
   (bfmlalt): New built-in function.
   (bfmlalb_lane): New built-in function.
   (bfmlalt_lane): New built-in function.
   (bfmlalb_laneq): New built-in function.
   (bfmlalt_laneq): New built-in function.
 * config/aarch64/aarch64-simd.md (bfmmla): New pattern.
   (bfmlal): New patterns.
 * config/aarch64/arm_neon.h (vbfmmlaq_f32): New intrinsic.
   (vbfmlalbq_f32): New intrinsic.
   (vbfmlaltq_f32): New intrinsic.
   (vbfmlalbq_lane_f32): New intrinsic.
   (vbfmlaltq_lane_f32): New intrinsic.
   (vbfmlalbq_laneq_f32): New intrinsic.
   (vbfmlaltq_laneq_f32): New intrinsic.
 * config/aarch64/iterators.md (UNSPEC_BFMMLA): New UNSPEC.
   (UNSPEC_BFMLALB): New UNSPEC.
   (UNSPEC_BFMLALT): New UNSPEC.
   (BF_MLA): New int iterator.
   (bt): Added UNSPEC_BFMLALB, UNSPEC_BFMLALT.
 * config/arm/types.md (bf_mmla): New type.
   (bf_mla): New type.

gcc/testsuite/ChangeLog:

2019-10-29  Delia Burduv  

 * gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c: New 
test.
 * gcc.target/aarch64/advsimd-intrinsics/bfmmla-compile.c: New test.
 * 
gcc.target/aarch64/advsimd-intrinsics/vbfmlalbt_lane_f32_indices_1.c:
   New test.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index f4ca35a59704c761fe2ac2b6d401fff7c8aba80d..5e9f50f090870d0c63916540a48f5ac132d2630d 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -682,3 +682,14 @@
   BUILTIN_VSFDF (UNOP, frint32x, 0)
   BUILTIN_VSFDF (UNOP, frint64z, 0)
   BUILTIN_VSFDF (UNOP, frint64x, 0)
+
+  /* Implemented by aarch64_bfmmlaqv4sf  */
+  VAR1 (TERNOP, bfmmlaq, 0, v4sf)
+
+  /* Implemented by aarch64_bfmlal{_lane{q}}v4sf  */
+  VAR1 (TERNOP, bfmlalb, 0, v4sf)
+  VAR1 (TERNOP, bfmlalt, 0, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalb_lane, 0, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalt_lane, 0, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalb_laneq, 0, v4sf)
+  VAR1 (QUADOP_LANE, bfmlalt_laneq, 0, v4sf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 55660ae248f4fa75d35ba2949cd4b9d5139ff5f5..66a6c4116a1fdd26dd4eec8b0609e28eb2c38fa1 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7027,3 +7027,57 @@
   "xtn\t%0., %1."
   [(set_attr "type" "neon_shift_imm_narrow_q")]
 )
+
+;; bfmmla
+(define_insn "aarch64_bfmmlaqv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+(plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+   (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+ (match_operand:V8BF 3 "register_operand" "w")]
+UNSPEC_BFMMLA)))]
+  "TARGET_BF16_SIMD"
+  "bfmmla\\t%0.4s, %2.8h, %3.8h"
+  [(set_attr "type" "neon_mla_s_q")]
+)
+
+;; bfmlal
+(define_insn "aarch64_bfmlalv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+(plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+(unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+  (match_operand:V8BF 3 "register_operand" "w")]
+ BF_MLA)))]
+  "TARGET_BF16_SIMD"
+  "bfmlal\\t%0.4s, %2.8h, %3.8h"
+  [(set_attr "type" "neon_fp_mla_s")]
+)
+
+(define_insn "aarch64_bfmlal_lanev4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+(plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+(unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+  (match_operand:V4BF 3 "register_oper

Re: [Patch, GCC]Backporting r269039 to gcc8

2019-10-25 Thread Delia Burduv
Hello Jeff,

Yes, it is a backport to gcc-8. No, I don't have commit access. Could you 
please commit it for me?

Thanks,
Delia

From: Jeff Law 
Sent: 04 October 2019 22:27
To: Delia Burduv ; gcc-patches@gcc.gnu.org 

Cc: nd ; i...@airs.com ; rguent...@suse.de 

Subject: Re: [Patch, GCC]Backporting r269039 to gcc8

On 10/4/19 9:11 AM, Delia Burduv wrote:
> Ping. Has anyone had a look at the patch? Please let me know if it is fine.
I think it's fine to backport to the gcc-8 branch.  Do you have commit
access?

jeff


Re: [Patch, GCC]Backporting r269039 to gcc8

2019-10-04 Thread Delia Burduv
Ping. Has anyone had a look at the patch? Please let me know if it is fine.

Thanks,
Delia


From: Delia Burduv 
Sent: 18 September 2019 15:54
To: gcc-patches@gcc.gnu.org 
Cc: nd ; l...@redhat.com ; i...@airs.com 
; rguent...@suse.de 
Subject: [Patch, GCC]Backporting r269039 to gcc8

Hi,

I am trying to backport r269039 to gcc8 which solved this bug report:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86487 . I have tested it on
arm-none-linux-gnueabi,aarch64-none-linux-gnu and x86_64-pc-linux-gnu
and there was no regression. The patch applied cleanly. I don't have
commit rights, so if it is ok can someone please commit it for me?

Thanks,
Delia

gcc/ChangeLog:
2019-09-13  Delia Burduv  

 Backport from trunk
 2019-02-20  Andre Vieira 

 PR target/86487
 * lra-constraints.c (uses_hard_regs_p): Fix handling of
 paradoxical SUBREGS.

gcc/testsuite/ChangeLog:
2019-09-13  Delia Burduv  

 Backport from trunk
 2019-02-20  Andre Vieira 

 PR target/86487
 * gcc.target/arm/pr86487.c: New.



[Patch, GCC]Backporting r269039 to gcc8

2019-09-18 Thread Delia Burduv
Hi,

I am trying to backport r269039 to gcc8 which solved this bug report: 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86487 . I have tested it on 
arm-none-linux-gnueabi,aarch64-none-linux-gnu and x86_64-pc-linux-gnu 
and there was no regression. The patch applied cleanly. I don't have 
commit rights, so if it is ok can someone please commit it for me?

Thanks,
Delia

gcc/ChangeLog:
2019-09-13  Delia Burduv  

     Backport from trunk
     2019-02-20  Andre Vieira 

     PR target/86487
     * lra-constraints.c(uses_hard_regs_p): Fix handling of
     paradoxical SUBREGS.

gcc/testsuite/ChangeLog:
2019-09-13  Delia Burduv  

     Backport from trunk
     2019-02-20  Andre Vieira 

     PR target/86487
     * gcc.target/arm/pr86487.c: New.

diff --git a/gcc/lra-constraints.c b/gcc/lra-constraints.c
index 484e9fa148c32208cd3af39e3aaa944069933ac0..1dea8c959d8f0e7e2d39f0ccf1b97aa1f64b024f 100644
--- a/gcc/lra-constraints.c
+++ b/gcc/lra-constraints.c
@@ -1774,14 +1774,24 @@ uses_hard_regs_p (rtx x, HARD_REG_SET set)
 return false;
   code = GET_CODE (x);
   mode = GET_MODE (x);
+
   if (code == SUBREG)
 {
+  /* For all SUBREGs we want to check whether the full multi-register
+	 overlaps the set.  For normal SUBREGs this means 'get_hard_regno' of
+	 the inner register, for paradoxical SUBREGs this means the
+	 'get_hard_regno' of the full SUBREG and for complete SUBREGs either is
+	 fine.  Use the wider mode for all cases.  */
+  rtx subreg = SUBREG_REG (x);
   mode = wider_subreg_mode (x);
-  x = SUBREG_REG (x);
-  code = GET_CODE (x);
+  if (mode == GET_MODE (subreg))
+	{
+	  x = subreg;
+	  code = GET_CODE (x);
+	}
 }
 
-  if (REG_P (x))
+  if (REG_P (x) || SUBREG_P (x))
 {
   x_hard_regno = get_hard_regno (x, true);
   return (x_hard_regno >= 0
diff --git a/gcc/testsuite/gcc.target/arm/pr86487.c b/gcc/testsuite/gcc.target/arm/pr86487.c
new file mode 100644
index ..1c1db7852d91a82a1d2b6eaa4f3d4c6dbef107f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr86487.c
@@ -0,0 +1,10 @@
+/* { dg-skip-if "" { *-*-* } { "-march=armv[0-6]*" "-mthumb" } { "" } } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-options "-O1 -mbig-endian" } */
+/* { dg-add-options arm_neon } */
+int a, b, c, d;
+long long fn1(long long p2) { return p2 == 0 ? -1 : -1 % p2; }
+void fn2(long long p1, short p2, long p3) {
+  b = fn1((d || 6) & a);
+  c = b | p3;
+}