Hi,

Please find attached the patch that adds the first set of x86 intrinsic
headers to the AArch64 target.
The implementation is based on similar work targeted at PPC64LE.
https://gcc.gnu.org/ml/gcc-patches/2017-05/msg00550.html

As was done for PowerPC, we are carrying the corresponding DejaGnu tests over
from gcc/testsuite/gcc.target/i386/ to gcc/testsuite/gcc.target/aarch64/, as
the test source remains the same.  The only modifications are target-related.

Bootstrapped and Regression tested on aarch64-thunder-linux.

Please review the patch and let us know if you have any comments or suggestions.

Thanks,
Naveen

2017-05-29  Naveen H.S  <naveen.hurugalaw...@cavium.com>

[gcc]
        * config.gcc (aarch64*-*-*): Add bmi2intrin.h, bmiintrin.h,
        and x86intrin.h.
        * config/aarch64/bmi2intrin.h: New file.
        * config/aarch64/bmiintrin.h: New file.
        * config/aarch64/x86intrin.h: New file.

[gcc/testsuite]

        * gcc.target/aarch64/bmi-andn-1.c: New file.
        * gcc.target/aarch64/bmi-andn-2.c: New file.
        * gcc.target/aarch64/bmi-bextr-1.c: New file.
        * gcc.target/aarch64/bmi-bextr-2.c: New file.
        * gcc.target/aarch64/bmi-bextr-4.c: New file.
        * gcc.target/aarch64/bmi-bextr-5.c: New file.
        * gcc.target/aarch64/bmi-blsi-1.c: New file.
        * gcc.target/aarch64/bmi-blsi-2.c: New file.
        * gcc.target/aarch64/bmi-blsmsk-1.c: New file.
        * gcc.target/aarch64/bmi-blsmsk-2.c: New file.
        * gcc.target/aarch64/bmi-blsr-1.c: New file.
        * gcc.target/aarch64/bmi-blsr-2.c: New file.
        * gcc.target/aarch64/bmi-check.h: New file.
        * gcc.target/aarch64/bmi-tzcnt-1.c: New file.
        * gcc.target/aarch64/bmi-tzcnt-2.c: New file.
        * gcc.target/aarch64/bmi2-bzhi32-1.c: New file.
        * gcc.target/aarch64/bmi2-bzhi64-1.c: New file.
        * gcc.target/aarch64/bmi2-bzhi64-1a.c: New file.
        * gcc.target/aarch64/bmi2-check.h: New file.
        * gcc.target/aarch64/bmi2-mulx32-1.c: New file.
        * gcc.target/aarch64/bmi2-mulx32-2.c: New file.
        * gcc.target/aarch64/bmi2-mulx64-1.c: New file.
        * gcc.target/aarch64/bmi2-mulx64-2.c: New file.
        * gcc.target/aarch64/bmi2-pdep32-1.c: New file.
        * gcc.target/aarch64/bmi2-pdep64-1.c: New file.
        * gcc.target/aarch64/bmi2-pext32-1.c: New file.
        * gcc.target/aarch64/bmi2-pext64-1.c: New file.
        * gcc.target/aarch64/bmi2-pext64-1a.c: New file.
diff --git a/gcc/config.gcc b/gcc/config.gcc
index f55dcaa..9eac70e 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -301,6 +301,7 @@ m32c*-*-*)
 aarch64*-*-*)
 	cpu_type=aarch64
 	extra_headers="arm_fp16.h arm_neon.h arm_acle.h"
+	extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h x86intrin.h"
 	c_target_objs="aarch64-c.o"
 	cxx_target_objs="aarch64-c.o"
 	extra_objs="aarch64-builtins.o aarch-common.o cortex-a57-fma-steering.o"
diff --git a/gcc/config/aarch64/bmi2intrin.h b/gcc/config/aarch64/bmi2intrin.h
new file mode 100644
index 0000000..c797f22
--- /dev/null
+++ b/gcc/config/aarch64/bmi2intrin.h
@@ -0,0 +1,148 @@
+/* Copyright (C) 2011-2017 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This header is distributed to simplify porting x86_64 code that
+   makes explicit use of Intel intrinsics to Aarch64.
+   It is the user's responsibility to determine if the results are
+   acceptable and make additional changes as necessary.
+   Note that much code that uses Intel intrinsics can be rewritten in
+   standard C or GNU C extensions, which are more portable and better
+   optimized across multiple targets.  */
+
+#if !defined _X86INTRIN_H_INCLUDED
+# error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _BMI2INTRIN_H_INCLUDED
+#define _BMI2INTRIN_H_INCLUDED
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u32 (unsigned int __X, unsigned int __Y)
+{
+  return (__Y & 0xFF) < 32 ? __X & ~(~0U << (__Y & 0xFF)) : __X;
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
+{
+  unsigned long long __res = (unsigned long long) __X * __Y;
+  *__P = (unsigned int) (__res >> 32);
+  return (unsigned int) __res;
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return (__Y & 0xFF) < 64 ? __X & ~(~0ULL << (__Y & 0xFF)) : __X;
+}
+
+/* __int128 requires base 64-bit.  */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mulx_u64 (unsigned long long __X, unsigned long long __Y,
+	   unsigned long long *__P)
+{
+  unsigned __int128 __res = (unsigned __int128) __X * __Y;
+  *__P = (unsigned long long) (__res >> 64);
+  return (unsigned long long) __res;
+}
+
+#ifndef __ILP32__
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u64 (unsigned long long __X, unsigned long long __M)
+{
+  unsigned long result = 0x0UL;
+  const unsigned long mask = 0x8000000000000000UL;
+  unsigned long m = __M;
+  unsigned long c, t;
+  unsigned long p;
+
+  /* The pop-count of the mask gives the number of the bits from
+   source to process.  This is also needed to shift bits from the
+   source into the correct position for the result.  */
+  p = 64 - __builtin_popcountl (__M);
+
+  /* The loop is for the number of '1' bits in the mask and clearing
+   each mask bit as it is processed.  */
+  while (m != 0)
+    {
+      c = __builtin_clzl (m);
+      t = __X << (p - c);
+      m ^= (mask >> c);
+      result |= (t & (mask >> c));
+      p++;
+    }
+  return (result);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u64 (unsigned long long __X, unsigned long long __M)
+{
+  unsigned long p; /* 64 minus the number of mask bits still to extract.  */
+  const unsigned long mask = 0x8000000000000000UL;
+  unsigned long m = __M;
+  unsigned long c;
+  unsigned long result;
+
+  p = 64 - __builtin_popcountl (__M);
+  result = 0;
+  /* We could use a for loop here, but that combined with
+   -funroll-loops can expand to a lot of code.  The while
+   loop avoids unrolling and the compiler commons the xor
+   from clearing the mask bit with the (m != 0) test.  The
+   result is a more compact loop setup and body.  */
+  while (m != 0)
+    {
+      unsigned long t;
+      c = __builtin_clzl (m);
+      t = (__X & (mask >> c)) >> (p - c);
+      m ^= (mask >> c);
+      result |= (t);
+      p++;
+    }
+  return (result);
+}
+
+/* These 32-bit implementations depend on the 64-bit pdep/pext above.  */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u32 (unsigned int __X, unsigned int __Y)
+{
+  return _pdep_u64 (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u32 (unsigned int __X, unsigned int __Y)
+{
+  return _pext_u64 (__X, __Y);
+}
+
+#endif /* __ILP32__ */
+
+#endif /* _BMI2INTRIN_H_INCLUDED */
diff --git a/gcc/config/aarch64/bmiintrin.h b/gcc/config/aarch64/bmiintrin.h
new file mode 100644
index 0000000..b418a3f
--- /dev/null
+++ b/gcc/config/aarch64/bmiintrin.h
@@ -0,0 +1,208 @@
+/* Copyright (C) 2010-2017 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This header is distributed to simplify porting x86_64 code that
+   makes explicit use of Intel intrinsics to Aarch64.
+   It is the user's responsibility to determine if the results are
+   acceptable and make additional changes as necessary.
+   Note that much code that uses Intel intrinsics can be rewritten in
+   standard C or GNU C extensions, which are more portable and better
+   optimized across multiple targets.  */
+
+#if !defined _X86INTRIN_H_INCLUDED
+# error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _BMIINTRIN_H_INCLUDED
+#define _BMIINTRIN_H_INCLUDED
+
+extern __inline unsigned short
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzcnt_u16 (unsigned short __X)
+{
+  return __X ? __builtin_ctz (__X) : 16;
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__andn_u32 (unsigned int __X, unsigned int __Y)
+{
+  return (~__X & __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bextr_u32 (unsigned int __X, unsigned int __P, unsigned int __L)
+{
+  return __P < 32 ? (__X >> __P) & ~(__L < 32 ? ~0U << __L : 0U) : 0;
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextr_u32 (unsigned int __X, unsigned int __Y)
+{
+  unsigned int __P, __L;
+  __P = __Y & 0xFF;
+  __L = (__Y >> 8) & 0xFF;
+  return (_bextr_u32 (__X, __P, __L));
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsi_u32 (unsigned int __X)
+{
+  return (__X & -__X);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsi_u32 (unsigned int __X)
+{
+  return __blsi_u32 (__X);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsmsk_u32 (unsigned int __X)
+{
+  return (__X ^ (__X - 1));
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsmsk_u32 (unsigned int __X)
+{
+  return __blsmsk_u32 (__X);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsr_u32 (unsigned int __X)
+{
+  return (__X & (__X - 1));
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsr_u32 (unsigned int __X)
+{
+  return __blsr_u32 (__X);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzcnt_u32 (unsigned int __X)
+{
+  return __X ? __builtin_ctz (__X) : 32;
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tzcnt_u32 (unsigned int __X)
+{
+  return __X ? __builtin_ctz (__X) : 32;
+}
+
+/* 64-bit (unsigned long long) counterparts of the 32-bit intrinsics
+   above.  */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__andn_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return (~__X & __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bextr_u64 (unsigned long long __X, unsigned int __P, unsigned int __L)
+{
+  return __P < 64 ? (__X >> __P) & ~(__L < 64 ? ~0ULL << __L : 0ULL) : 0;
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextr_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  unsigned int __P, __L;
+  __P = __Y & 0xFF;
+  __L = (__Y & 0xFF00) >> 8;
+  return (_bextr_u64 (__X, __P, __L));
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsi_u64 (unsigned long long __X)
+{
+  return __X & -__X;
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsi_u64 (unsigned long long __X)
+{
+  return __blsi_u64 (__X);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsmsk_u64 (unsigned long long __X)
+{
+  return (__X ^ (__X - 1));
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsmsk_u64 (unsigned long long __X)
+{
+  return __blsmsk_u64 (__X);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsr_u64 (unsigned long long __X)
+{
+  return (__X & (__X - 1));
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsr_u64 (unsigned long long __X)
+{
+  return __blsr_u64 (__X);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzcnt_u64 (unsigned long long __X)
+{
+  return __X ? __builtin_ctzll (__X) : 64;
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tzcnt_u64 (unsigned long long __X)
+{
+  return __X ? __builtin_ctzll (__X) : 64;
+}
+
+#endif /* _BMIINTRIN_H_INCLUDED */
diff --git a/gcc/config/aarch64/x86intrin.h b/gcc/config/aarch64/x86intrin.h
new file mode 100644
index 0000000..2044734
--- /dev/null
+++ b/gcc/config/aarch64/x86intrin.h
@@ -0,0 +1,43 @@
+/* Copyright (C) 2008-2017 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86INTRIN_H_INCLUDED
+#define _X86INTRIN_H_INCLUDED
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+   makes explicit use of Intel intrinsics to Aarch64.
+   It is the user's responsibility to determine if the results are
+   acceptable and make additional changes as necessary.
+   Note that much code that uses Intel intrinsics can be rewritten in
+   standard C or GNU C extensions, which are more portable and better
+   optimized across multiple targets.  */
+#warning "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this warning."
+#endif
+
+#include <bmiintrin.h>
+
+#include <bmi2intrin.h>
+
+
+#endif /* _X86INTRIN_H_INCLUDED */
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-andn-1.c b/gcc/testsuite/gcc.target/aarch64/bmi-andn-1.c
new file mode 100644
index 0000000..2cd8331
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-andn-1.c
@@ -0,0 +1,32 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+long long calc_andn_u64 (long long src1,
+			 long long src2,
+			 long long dummy)
+{
+  return (~src1 + dummy) & (src2);
+}
+
+static void
+bmi_test()
+{
+  unsigned i;
+
+  long long src = 0xfacec0ffeefacec0;
+  long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = (i + src) << i;
+
+    res_ref = calc_andn_u64 (src, src+i, 0);
+    res = __andn_u64 (src, src+i);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-andn-2.c b/gcc/testsuite/gcc.target/aarch64/bmi-andn-2.c
new file mode 100644
index 0000000..5d58acb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-andn-2.c
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+long long calc_andn_u32 (int src1, int src2, int dummy)
+{
+  return (~src1+dummy) & (src2);
+}
+
+static void
+bmi_test()
+{
+  unsigned i;
+
+  int src = 0xfacec0ff;
+  int res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = (i + src) << i;
+
+    res_ref = calc_andn_u32 (src, src+i, 0);
+    res = __andn_u32 (src, src+i);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-bextr-1.c b/gcc/testsuite/gcc.target/aarch64/bmi-bextr-1.c
new file mode 100644
index 0000000..1ce15cf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-bextr-1.c
@@ -0,0 +1,49 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+long long calc_bextr_u64 (unsigned long long src1,
+			  unsigned long long src2)
+{
+  long long res = 0;
+  unsigned char start = (src2 & 0xff);
+  unsigned char len = (int) ((src2 >> 8) & 0xff);
+  if (start < 64) {
+    unsigned i;
+    unsigned last = (start+len) < 64 ? start+len : 64;
+
+    src1 >>= start;
+    for (i=start; i<last; ++i) {
+      res |= (src1 & 1) << (i-start);
+      src1 >>= 1;
+    }
+  }
+
+  return res;
+}
+
+static void
+bmi_test ()
+{
+  unsigned i;
+  unsigned char start, len;
+  unsigned long long src1 = 0xfacec0ffeefacec0;
+  unsigned long long res, res_ref, src2;
+
+  for (i=0; i<5; ++i) {
+    start = (i * 1983) % 64;
+    len = (i + (i * 1983)) % 64;
+
+    src1 = src1 * 3;
+    src2 = start | (((unsigned long long)len) << 8);
+
+    res_ref = calc_bextr_u64 (src1, src2);
+    res = __bextr_u64 (src1, src2);
+
+    if (res != res_ref)
+      abort ();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-bextr-2.c b/gcc/testsuite/gcc.target/aarch64/bmi-bextr-2.c
new file mode 100644
index 0000000..cdaf133
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-bextr-2.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+unsigned calc_bextr_u32 (unsigned src1, unsigned src2)
+{
+  unsigned res = 0;
+  unsigned char start = (src2 & 0xff);
+  unsigned char len = (int) ((src2 >> 8) & 0xff);
+  if (start < 32) {
+    unsigned i;
+    unsigned last = (start+len) < 32 ? start+len : 32;
+
+    src1 >>= start;
+    for (i=start; i<last; ++i) {
+      res |= (src1 & 1) << (i-start);
+      src1 >>= 1;
+    }
+  }
+
+  return res;
+}
+
+static void
+bmi_test ()
+{
+  unsigned i;
+  unsigned char start, len;
+  unsigned src1 = 0xfacec0ff;
+  unsigned res, res_ref, src2;
+
+  for (i=0; i<5; ++i) {
+    start = (i * 1983) % 32;
+    len = (i + (i * 1983)) % 32;
+
+    src1 = src1 * 3;
+    src2 = start | (((unsigned)len) << 8);
+
+    res_ref = calc_bextr_u32 (src1, src2);
+    res = __bextr_u32 (src1, src2);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-bextr-4.c b/gcc/testsuite/gcc.target/aarch64/bmi-bextr-4.c
new file mode 100644
index 0000000..2f2acbe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-bextr-4.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+unsigned calc_bextr_u32 (unsigned src1, unsigned src2)
+{
+  unsigned res = 0;
+  unsigned char start = (src2 & 0xff);
+  unsigned char len = (int) ((src2 >> 8) & 0xff);
+  if (start < 32) {
+    unsigned i;
+    unsigned last = (start+len) < 32 ? start+len : 32;
+
+    src1 >>= start;
+    for (i=start; i<last; ++i) {
+      res |= (src1 & 1) << (i-start);
+      src1 >>= 1;
+    }
+  }
+
+  return res;
+}
+
+static void
+bmi_test ()
+{
+  unsigned i;
+  unsigned char start, len;
+  unsigned src1 = 0xfacec0ff;
+  unsigned res, res_ref, src2;
+
+  for (i=0; i<5; ++i) {
+    start = i * 4;
+    len = i * 4;
+
+    src1 = src1 * 3;
+    src2 = (start & 0xff) | ((len & 0xff) << 8);
+
+    res_ref = calc_bextr_u32 (src1, src2);
+    res = _bextr_u32 (src1, start, len);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-bextr-5.c b/gcc/testsuite/gcc.target/aarch64/bmi-bextr-5.c
new file mode 100644
index 0000000..2cfa24f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-bextr-5.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+long long calc_bextr_u64 (unsigned long long src1,
+			  unsigned long long src2)
+{
+  long long res = 0;
+  unsigned char start = (src2 & 0xff);
+  unsigned char len = (int) ((src2 >> 8) & 0xff);
+  if (start < 64) {
+    unsigned i;
+    unsigned last = (start+len) < 64 ? start+len : 64;
+
+    src1 >>= start;
+    for (i=start; i<last; ++i) {
+      res |= (src1 & 1) << (i-start);
+      src1 >>= 1;
+    }
+  }
+
+  return res;
+}
+
+static void
+bmi_test ()
+{
+  unsigned i;
+  unsigned char start, len;
+  unsigned long long src1 = 0xfacec0ffeefacec0;
+  unsigned long long res, res_ref, src2;
+
+  for (i=0; i<5; ++i) {
+    start = i * 4;
+    len = i * 3;
+    src1 = src1 * 3;
+    src2 = (start & 0xff) | ((len & 0xff) << 8);
+
+    res_ref = calc_bextr_u64 (src1, src2);
+    res = _bextr_u64 (src1, start, len);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-blsi-1.c b/gcc/testsuite/gcc.target/aarch64/bmi-blsi-1.c
new file mode 100644
index 0000000..8c69a98
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-blsi-1.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* To fool the compiler, so it does not generate blsi here. */
+long long calc_blsi_u64 (long long src1, long long src2)
+{
+  return (-src1) & (src2);
+}
+
+static void
+bmi_test()
+{
+  unsigned i;
+
+  long long src = 0xfacec0ffeefacec0;
+  long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = (i + src) << i;
+
+    res_ref = calc_blsi_u64 (src, src);
+    res = __blsi_u64 (src);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-blsi-2.c b/gcc/testsuite/gcc.target/aarch64/bmi-blsi-2.c
new file mode 100644
index 0000000..8dcac7a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-blsi-2.c
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* To fool the compiler, so it does not generate blsi here. */
+int calc_blsi_u32 (int src1, int src2)
+{
+  return (-src1) & (src2);
+}
+
+static void
+bmi_test()
+{
+  unsigned i;
+  int src = 0xfacec0ff;
+  int res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = (i + src) << i;
+
+    res_ref = calc_blsi_u32 (src, src);
+    res = __blsi_u32 (src);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-blsmsk-1.c b/gcc/testsuite/gcc.target/aarch64/bmi-blsmsk-1.c
new file mode 100644
index 0000000..e0856ba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-blsmsk-1.c
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/*  Trick compiler in order not to generate target insn here. */
+long long calc_blsmsk_u64 (long long src1, long long src2)
+{
+  return (src1-1) ^ (src2);
+}
+
+static void
+bmi_test ()
+{
+  unsigned i;
+  long long src = 0xfacec0ffeefacec0;
+  long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = (i + src) << i;
+
+    res_ref = calc_blsmsk_u64 (src, src);
+    res = __blsmsk_u64 (src);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-blsmsk-2.c b/gcc/testsuite/gcc.target/aarch64/bmi-blsmsk-2.c
new file mode 100644
index 0000000..67cdd08
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-blsmsk-2.c
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/*  Trick compiler in order not to generate target insn here. */
+int calc_blsmsk_u32 (int src1, int src2)
+{
+  return (src1-1) ^ (src2);
+}
+
+static void
+bmi_test ()
+{
+  unsigned i;
+  int src = 0xfacec0ff;
+  int res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = (i + src) << i;
+
+    res_ref = calc_blsmsk_u32 (src, src);
+    res = __blsmsk_u32 (src);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-blsr-1.c b/gcc/testsuite/gcc.target/aarch64/bmi-blsr-1.c
new file mode 100644
index 0000000..174fac8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-blsr-1.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+long long calc_blsr_u64 (long long src1, long long src2)
+{
+  return (src1-1) & (src2);
+}
+
+static void
+bmi_test()
+{
+  unsigned i;
+  long long src = 0xfacec0ffeefacec0;
+  long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = (i + src) << i;
+
+    res_ref = calc_blsr_u64 (src, src);
+    res = __blsr_u64 (src);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-blsr-2.c b/gcc/testsuite/gcc.target/aarch64/bmi-blsr-2.c
new file mode 100644
index 0000000..820657c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-blsr-2.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+int calc_blsr_u32 (int src1, int src2)
+{
+  return (src1-1) & (src2);
+}
+
+static void
+bmi_test ()
+{
+  unsigned i;
+  int src = 0xfacec0ff;
+  int res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = (i + src) << i;
+
+    res_ref = calc_blsr_u32 (src, src);
+    res = __blsr_u32 (src);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-check.h b/gcc/testsuite/gcc.target/aarch64/bmi-check.h
new file mode 100644
index 0000000..2ddad62
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-check.h
@@ -0,0 +1,20 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+static void bmi_test (void);
+
+static void __attribute__ ((noinline)) do_test (void)
+{
+  bmi_test ();
+}
+
+int
+main ()
+{
+  do_test ();
+#ifdef DEBUG
+  printf ("PASSED\n");
+#endif
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-tzcnt-1.c b/gcc/testsuite/gcc.target/aarch64/bmi-tzcnt-1.c
new file mode 100644
index 0000000..267c4b6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-tzcnt-1.c
@@ -0,0 +1,37 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+long long calc_tzcnt_u64 (long long src)
+{
+  int i;
+  int res = 0;
+
+  while ( (res<64) && ((src&1) == 0)) {
+    ++res;
+    src >>= 1;
+  }
+
+  return res;
+}
+
+static void
+bmi_test ()
+{
+  unsigned i;
+  long long src = 0xfacec0ffeefacec0;
+  long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = (i + src) << i;
+
+    res_ref = calc_tzcnt_u64 (src);
+    res = __tzcnt_u64 (src);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi-tzcnt-2.c b/gcc/testsuite/gcc.target/aarch64/bmi-tzcnt-2.c
new file mode 100644
index 0000000..2414c6f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi-tzcnt-2.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+int calc_tzcnt_u32 (int src)
+{
+  int i;
+  int res = 0;
+
+  while ( (res<32) && ((src&1) == 0)) {
+    ++res;
+    src >>= 1;
+  }
+  return res;
+}
+
+static void
+bmi_test ()
+{
+  unsigned i;
+  int src = 0xfacec0ff;
+  int res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = i + (src << i);
+
+    res_ref = calc_tzcnt_u32 (src);
+    res = __tzcnt_u32 (src);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi32-1.c b/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi32-1.c
new file mode 100644
index 0000000..35c56ce
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_bzhi_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  /* Clear bits 31 down to L; 1U keeps the shift into bit 31 defined.  */
+  for (i = 0; i < 32 - l; ++i)
+    res &= ~(1U << (31 - i));
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7ace0f;
+  unsigned res, res_ref;
+
+  for (i = 0; i < 5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_bzhi_u32 (src, i * 2);
+    res = _bzhi_u32 (src, i * 2);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi64-1.c b/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi64-1.c
new file mode 100644
index 0000000..0205aa2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi64-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_bzhi_u64 (unsigned long long a, int l)
+{
+  unsigned long long res = a;
+  int i;
+  /* Clear bits 63 down to L; 1ULL keeps the shift into bit 63 defined.  */
+  for (i = 0; i < 64 - l; ++i)
+    res &= ~(1ULL << (63 - i));
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long src = 0xce7ace0ce7ace0ff;
+  unsigned long long res, res_ref;
+
+  for (i = 0; i < 5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_bzhi_u64 (src, i * 2);
+    res = _bzhi_u64 (src, i * 2);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi64-1a.c b/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi64-1a.c
new file mode 100644
index 0000000..ce3b8a5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-bzhi64-1a.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+
+unsigned long long
+test__bzhi_u64_group (unsigned long long a)
+{
+  /* bzhi is implemented in source as shift left then shift right
+     to clear the high order bits.
+     For the case where the starting index is const, the compiler
+     should reduce this to a single mask operation on AArch64 (an
+     AND with an immediate, or a bitfield/zero extend).  */
+  unsigned long long res;
+  res = _bzhi_u64 (a, 8);
+  res += _bzhi_u64 (a, 16);
+  res += _bzhi_u64 (a, 24);
+  res += _bzhi_u64 (a, 32);
+  res += _bzhi_u64 (a, 40);
+  res += _bzhi_u64 (a, 48);
+  return (res);
+}
+/* The resulting assembler should contain no explicit shifts: lsl and
+   lsr are the AArch64 counterparts of PowerPC's sld and srd.  */
+
+/* { dg-final { scan-assembler-not "lsl" } } */
+/* { dg-final { scan-assembler-not "lsr" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-check.h b/gcc/testsuite/gcc.target/aarch64/bmi2-check.h
new file mode 100644
index 0000000..567cdb7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-check.h
@@ -0,0 +1,21 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+static void bmi2_test (void);
+
+static void __attribute__ ((noinline)) do_test (void)
+{
+  bmi2_test ();  /* Defined by the test file that includes this header.  */
+}
+
+int
+main ()
+{
+  do_test ();
+  /* Reaching this point means bmi2_test returned without aborting.  */
+#ifdef DEBUG
+  printf ("PASSED\n");
+#endif
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-mulx32-1.c b/gcc/testsuite/gcc.target/aarch64/bmi2-mulx32-1.c
new file mode 100644
index 0000000..14357fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-mulx32-1.c
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include "bmi2-check.h"
+
+/* Reference multiply: add A to a 64-bit accumulator B times.  A is
+   volatile, presumably to keep the loop from folding into a multiply.  */
+__attribute__((noinline)) unsigned long long
+calc_mul_u32 (unsigned volatile a, unsigned b)
+{
+  unsigned long long res = 0;
+  unsigned i;  /* Unsigned like B: no mixed-sign compare or int overflow.  */
+  for (i = 0; i < b; ++i)
+    res += a;
+  return res;
+}
+
+/* Widening 32x32->64 multiply under test: A is promoted to 64 bits
+   before multiplying so the full product is kept.  Kept out of line
+   so it is compiled as a standalone function.  */
+__attribute__((noinline)) unsigned long long
+gen_mulx (unsigned a, unsigned b)
+{
+  const unsigned long long wide_a = a;
+
+  return wide_a * b;
+}
+
+/* Check gen_mulx against calc_mul_u32 for five operand pairs; abort on
+   the first mismatch.  */
+static void
+bmi2_test ()
+{
+  unsigned lhs = 0xce7ace0;
+  unsigned rhs = 0xfacefff;
+  unsigned step;
+
+  for (step = 1; step <= 5; ++step)
+    {
+      /* Grow one operand and shrink the other on every round.  */
+      lhs *= step;
+      rhs /= step;
+
+      if (gen_mulx (lhs, rhs) != calc_mul_u32 (lhs, rhs))
+        abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-mulx32-2.c b/gcc/testsuite/gcc.target/aarch64/bmi2-mulx32-2.c
new file mode 100644
index 0000000..440551f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-mulx32-2.c
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+/* Reference multiply: add A to a 64-bit accumulator B times.  A is
+   volatile, presumably to keep the loop from folding into a multiply.  */
+__attribute__((noinline)) unsigned long long
+calc_mul_u32 (unsigned volatile a, unsigned b)
+{
+  unsigned long long res = 0;
+  unsigned i;  /* Unsigned like B: no mixed-sign compare or int overflow.  */
+  for (i = 0; i < b; ++i)
+    res += a;
+  return res;
+}
+
+__attribute__((noinline))  /* Out-of-line wrapper around _mulx_u32.  */
+unsigned calc_mulx_u32 (unsigned x, unsigned y, unsigned *res_h)
+{ /* bmi2_test uses the return as the low half, *RES_H as the high half.  */
+  return (unsigned) _mulx_u32 (x, y, res_h);
+}
+
+/* Check _mulx_u32 (via calc_mulx_u32) against calc_mul_u32 for five
+   operand pairs; abort on the first mismatch.  */
+static void
+bmi2_test ()
+{
+  unsigned lhs = 0xce7ace0;
+  unsigned rhs = 0xfacefff;
+  unsigned step;
+
+  for (step = 1; step <= 5; ++step)
+    {
+      unsigned low, high;
+      unsigned long long expected;
+      lhs *= step;
+      rhs /= step;
+      expected = calc_mul_u32 (lhs, rhs);
+      low = calc_mulx_u32 (lhs, rhs, &high);
+      /* Glue the two 32-bit halves back into the 64-bit product.  */
+      if ((((unsigned long long) high << 32) | low) != expected)
+        abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-mulx64-1.c b/gcc/testsuite/gcc.target/aarch64/bmi2-mulx64-1.c
new file mode 100644
index 0000000..eb5f2c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-mulx64-1.c
@@ -0,0 +1,37 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include "bmi2-check.h"
+
+/* Reference multiply: add A to a 128-bit accumulator B times.  A is
+   volatile, presumably to keep the loop from folding into a multiply.  */
+__attribute__((noinline)) unsigned __int128
+calc_mul_u64 (unsigned long long volatile a, unsigned long long b)
+{
+  unsigned __int128 res = 0;
+  unsigned long long i;  /* B's type: an int counter could overflow.  */
+  for (i = 0; i < b; ++i)
+    res += (unsigned __int128) a;
+  return res;
+}
+
+/* Check the compiler's 64x64->128 multiply against calc_mul_u64 for
+   five operand pairs; abort on the first mismatch.  */
+static void
+bmi2_test ()
+{
+  unsigned long long lhs = 0xce7ace0ce7ace0;
+  unsigned long long rhs = 0xface;
+  unsigned step;
+
+  for (step = 1; step <= 5; ++step)
+    {
+      /* Grow one operand and shrink the other on every round.  */
+      lhs *= step;
+      rhs /= step;
+
+      if ((unsigned __int128) lhs * rhs != calc_mul_u64 (lhs, rhs))
+        abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-mulx64-2.c b/gcc/testsuite/gcc.target/aarch64/bmi2-mulx64-2.c
new file mode 100644
index 0000000..8afc1f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-mulx64-2.c
@@ -0,0 +1,52 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+/* Reference multiply: add A to a 128-bit accumulator B times.  A is
+   volatile, presumably to keep the loop from folding into a multiply.  */
+__attribute__((noinline)) unsigned __int128
+calc_mul_u64 (unsigned long long volatile a, unsigned long long b)
+{
+  unsigned __int128 res = 0;
+  unsigned long long i;  /* B's type: an int counter could overflow.  */
+  for (i = 0; i < b; ++i)
+    res += (unsigned __int128) a;
+  return res;
+}
+
+/* Out-of-line wrapper for _mulx_u64.  bmi2_test uses the return value as
+   the low 64 bits of the product and *RES_H as the high 64 bits.  */
+__attribute__((noinline)) unsigned long long
+calc_mulx_u64 (unsigned long long x, unsigned long long y,
+               unsigned long long *res_h)
+{
+  return _mulx_u64 (x, y, res_h);
+}
+
+
+/* Check _mulx_u64 (via calc_mulx_u64) against calc_mul_u64 for five
+   operand pairs; abort on the first mismatch.  */
+static void
+bmi2_test ()
+{
+  unsigned long long lhs = 0xce7ace0ce7ace0;
+  unsigned long long rhs = 0xface;
+  unsigned step;
+
+  for (step = 1; step <= 5; ++step)
+    {
+      unsigned long long low, high;
+      unsigned __int128 expected;
+
+      lhs *= step;
+      rhs /= step;
+      expected = calc_mul_u64 (lhs, rhs);
+      low = calc_mulx_u64 (lhs, rhs, &high);
+      /* Glue the two 64-bit halves back into the 128-bit product.  */
+      if ((((unsigned __int128) high << 64) | low) != expected)
+        abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-pdep32-1.c b/gcc/testsuite/gcc.target/aarch64/bmi2-pdep32-1.c
new file mode 100644
index 0000000..d08b869
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-pdep32-1.c
@@ -0,0 +1,41 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+/* { dg-require-effective-target lp64 } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+/* Reference PDEP: deposit the low bits of A, in order, at the set bits
+   of MASK.  Uses 1U because 1 << 31 is undefined for int.  */
+__attribute__((noinline)) unsigned
+calc_pdep_u32 (unsigned a, int mask)
+{
+  unsigned res = 0;
+  int i, k = 0;
+
+  for (i = 0; i < 32; ++i)
+    if (mask & (1U << i)) {
+      res |= ((a & (1U << k)) >> k) << i;
+      ++k;
+    }
+  return res;
+}
+
+/* Compare _pdep_u32 with calc_pdep_u32 on five inputs; abort on mismatch.  */
+static void
+bmi2_test ()
+{
+  unsigned value = 0xce7acc;
+  unsigned round;
+
+  for (round = 0; round < 5; ++round)
+    {
+      const unsigned mask = round * 3;
+      value *= round + 1;
+
+      /* The intrinsic must agree with the bit-by-bit reference.  */
+      if (_pdep_u32 (value, mask) != calc_pdep_u32 (value, mask))
+        abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-pdep64-1.c b/gcc/testsuite/gcc.target/aarch64/bmi2-pdep64-1.c
new file mode 100644
index 0000000..1b97ec1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-pdep64-1.c
@@ -0,0 +1,41 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+/* { dg-require-effective-target lp64 } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+/* Reference PDEP: deposit the low bits of A, in order, at the set bits
+   of MASK.  Uses 1ULL because 1LL << 63 overflows signed long long.  */
+__attribute__((noinline)) unsigned long long
+calc_pdep_u64 (unsigned long long a, unsigned long long mask)
+{
+  unsigned long long res = 0;
+  unsigned long long i, k = 0;
+
+  for (i = 0; i < 64; ++i)
+    if (mask & (1ULL << i)) {
+      res |= ((a & (1ULL << k)) >> k) << i;
+      ++k;
+    }
+  return res;
+}
+
+/* Compare _pdep_u64 with calc_pdep_u64 on five inputs; abort on
+   mismatch.  */
+static void
+bmi2_test ()
+{
+  unsigned long long value = 0xce7acce7acce7ac;
+  unsigned long long round;
+
+  for (round = 0; round < 5; ++round)
+    {
+      const unsigned long long mask = ~(round * 3);
+      value *= round + 1;
+
+      /* The intrinsic must agree with the bit-by-bit reference.  */
+      if (_pdep_u64 (value, mask) != calc_pdep_u64 (value, mask))
+        abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-pext32-1.c b/gcc/testsuite/gcc.target/aarch64/bmi2-pext32-1.c
new file mode 100644
index 0000000..9a8309c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-pext32-1.c
@@ -0,0 +1,41 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+/* { dg-require-effective-target lp64 } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+/* Reference PEXT: gather the bits of A selected by MASK into the low
+   bits of the result.  Uses 1U because 1 << 31 is undefined for int.  */
+__attribute__((noinline)) unsigned
+calc_pext_u32 (unsigned a, unsigned mask)
+{
+  unsigned res = 0;
+  int i, k = 0;
+
+  for (i = 0; i < 32; ++i)
+    if (mask & (1U << i)) {
+      res |= ((a & (1U << i)) >> i) << k;
+      ++k;
+    }
+  return res;
+}
+
+/* Compare _pext_u32 with calc_pext_u32 on five inputs; abort on mismatch.  */
+static void
+bmi2_test ()
+{
+  unsigned value = 0xce7acc;
+  unsigned round;
+
+  for (round = 0; round < 5; ++round)
+    {
+      const unsigned mask = ~(round * 3);
+      value *= round + 1;
+
+      /* The intrinsic must agree with the bit-by-bit reference.  */
+      if (_pext_u32 (value, mask) != calc_pext_u32 (value, mask))
+        abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-pext64-1.c b/gcc/testsuite/gcc.target/aarch64/bmi2-pext64-1.c
new file mode 100644
index 0000000..a7889f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-pext64-1.c
@@ -0,0 +1,41 @@
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+/* { dg-require-effective-target lp64 } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+/* Reference PEXT: gather the bits of A selected by MASK into the low
+   bits of the result.  Uses 1ULL because 1LL << 63 overflows.  */
+__attribute__((noinline)) unsigned long long
+calc_pext_u64 (unsigned long long a, unsigned long long mask)
+{
+  unsigned long long res = 0;
+  int i, k = 0;
+
+  for (i = 0; i < 64; ++i)
+    if (mask & (1ULL << i)) {
+      res |= ((a & (1ULL << i)) >> i) << k;
+      ++k;
+    }
+  return res;
+}
+
+/* Compare _pext_u64 with calc_pext_u64 on five inputs; abort on mismatch.  */
+static void
+bmi2_test ()
+{
+  unsigned long long value = 0xce7acce7acce7ac;
+  unsigned long long round;
+
+  for (round = 0; round < 5; ++round)
+    {
+      const unsigned long long mask = ~(round * 3);
+      value *= round + 1;
+
+      /* The intrinsic must agree with the bit-by-bit reference.  */
+      if (_pext_u64 (value, mask) != calc_pext_u64 (value, mask))
+        abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bmi2-pext64-1a.c b/gcc/testsuite/gcc.target/aarch64/bmi2-pext64-1a.c
new file mode 100644
index 0000000..6fa828e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bmi2-pext64-1a.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-require-effective-target lp64 } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+
+unsigned long long
+test__pexp_cmask_u64 (unsigned long long a[4])
+{
+  /* The _pext implementation is nominally a popcount of the mask,
+     followed by a loop using count leading zeros to find the
+     next bit to process.
+     If the mask is a const, the popcount should be folded and
+     constant propagation should eliminate the mask generation
+     loop.
+     NOTE(review): this text and the dg-final patterns below name
+     PowerPC instructions (bpermd, popcntd, cntlzd), apparently
+     carried over from the PPC64LE test; on AArch64 the patterns
+     presumably never match -- confirm and port.  */
+  const unsigned long mask = 0x00000000100000a4UL;
+  unsigned long res;
+  res = _pext_u64 (a[0], mask);
+  res = (res << 8) | _pext_u64 (a[1], mask);
+  res = (res << 8) | _pext_u64 (a[2], mask);
+  res = (res << 8) | _pext_u64 (a[3], mask);
+  return (res);
+}
+/* The resulting assembler should have 4 X bpermd and no popcntd or
+   cntlzd instructions (PowerPC mnemonics; see the note above).  */
+
+/* { dg-final { scan-assembler-not "popcntd" } } */
+/* { dg-final { scan-assembler-not "cntlzd" } } */

Reply via email to