Hi Paul,
On 8/23/21 2:03 PM, Paul A. Clarke wrote:
Function signatures and decorations match gcc/config/i386/smmintrin.h.
Also, copy tests for:
- _mm_cmpeq_epi64
- _mm_mullo_epi32, _mm_mul_epi32
- _mm_packus_epi32
- _mm_cmpgt_epi64 (SSE4.2)
from gcc/testsuite/gcc.target/i386.
2021-08-23 Paul A. Clarke <p...@us.ibm.com>
gcc
* config/rs6000/smmintrin.h (_mm_cmpeq_epi64, _mm_cmpgt_epi64,
_mm_mullo_epi32, _mm_mul_epi32, _mm_packus_epi32): New.
* config/rs6000/nmmintrin.h: Copy from i386, tweak to suit.
gcc/testsuite
* gcc.target/powerpc/pr78102.c: Copy from gcc.target/i386,
adjust dg directives to suit.
* gcc.target/powerpc/sse4_1-packusdw.c: Same.
* gcc.target/powerpc/sse4_1-pcmpeqq.c: Same.
* gcc.target/powerpc/sse4_1-pmuldq.c: Same.
* gcc.target/powerpc/sse4_1-pmulld.c: Same.
* gcc.target/powerpc/sse4_2-pcmpgtq.c: Same.
* gcc.target/powerpc/sse4_2-check.h: Copy from gcc.target/i386,
tweak to suit.
---
v3:
- Add nmmintrin.h. _mm_cmpgt_epi64 is part of SSE4.2, which is
ostensibly defined in nmmintrin.h. Following the i386 implementation,
however, nmmintrin.h only includes smmintrin.h, and the actual
implementations appear there.
- Add sse4_2-check.h, required by sse4_2-pcmpgtq.c. My testing was
obviously inadequate.
v2:
- Added "extern" to functions to maintain compatible decorations with
like implementations in gcc/config/i386.
- Removed "-Wno-psabi" from tests as unnecessary, per v1 review.
- Noted testing in patch series cover letter.
gcc/config/rs6000/nmmintrin.h | 40 ++++++++++
gcc/config/rs6000/smmintrin.h | 41 +++++++++++
gcc/testsuite/gcc.target/powerpc/pr78102.c | 23 ++++++
.../gcc.target/powerpc/sse4_1-packusdw.c | 73 +++++++++++++++++++
.../gcc.target/powerpc/sse4_1-pcmpeqq.c | 46 ++++++++++++
.../gcc.target/powerpc/sse4_1-pmuldq.c | 51 +++++++++++++
.../gcc.target/powerpc/sse4_1-pmulld.c | 46 ++++++++++++
.../gcc.target/powerpc/sse4_2-check.h | 18 +++++
.../gcc.target/powerpc/sse4_2-pcmpgtq.c | 46 ++++++++++++
9 files changed, 384 insertions(+)
create mode 100644 gcc/config/rs6000/nmmintrin.h
create mode 100644 gcc/testsuite/gcc.target/powerpc/pr78102.c
create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c
create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c
create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c
create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c
create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_2-check.h
create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c
diff --git a/gcc/config/rs6000/nmmintrin.h b/gcc/config/rs6000/nmmintrin.h
new file mode 100644
index 000000000000..20a70bee3776
--- /dev/null
+++ b/gcc/config/rs6000/nmmintrin.h
@@ -0,0 +1,40 @@
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+ makes explicit use of Intel intrinsics to powerpc64le.
+ It is the user's responsibility to determine if the results are
+ acceptable and make additional changes as necessary.
+ Note that much code that uses Intel intrinsics can be rewritten in
+ standard C or GNU C extensions, which are more portable and better
+ optimized across multiple targets. */
+#endif
+
+#ifndef _NMMINTRIN_H_INCLUDED
+#define _NMMINTRIN_H_INCLUDED
+
+/* We just include SSE4.1 header file. */
+#include <smmintrin.h>
+
+#endif /* _NMMINTRIN_H_INCLUDED */
Should there be something in here indicating that nmmintrin.h is for SSE
4.2? Otherwise it's a bit of a head-scratcher to a new person wondering
why this file exists. No big deal either way.
This looks fine to me with or without that. Recommend approval.
Thanks!
Bill
diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h
index fdef6674d16c..c04d2bb5b6d3 100644
--- a/gcc/config/rs6000/smmintrin.h
+++ b/gcc/config/rs6000/smmintrin.h
@@ -386,6 +386,15 @@ _mm_testnzc_si128 (__m128i __A, __m128i __B)
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))
+#ifdef _ARCH_PWR8
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_cmpeq ((__v2di)__X, (__v2di)__Y);
+}
+#endif
+
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi8 (__m128i __X, __m128i __Y)
@@ -444,6 +453,22 @@ _mm_max_epu32 (__m128i __X, __m128i __Y)
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_mul ((__v4su)__X, (__v4su)__Y);
+}
+
+#ifdef _ARCH_PWR8
+__inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_mule ((__v4si)__X, (__v4si)__Y);
+}
+#endif
+
+__inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi16 (__m128i __A)
{
return (__m128i) vec_unpackh ((__v16qi)__A);
@@ -607,4 +632,20 @@ _mm_minpos_epu16 (__m128i __A)
return __r.__m;
}
+__inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packus_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_packsu ((__v4si)__X, (__v4si)__Y);
+}
+
+#ifdef _ARCH_PWR8
+__inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_cmpgt ((__v2di)__X, (__v2di)__Y);
+}
+#endif
+
#endif
diff --git a/gcc/testsuite/gcc.target/powerpc/pr78102.c
b/gcc/testsuite/gcc.target/powerpc/pr78102.c
new file mode 100644
index 000000000000..56a2d497bbff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr78102.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mvsx" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+
+#include <x86intrin.h>
+
+__m128i
+foo (const __m128i x, const __m128i y)
+{
+ return _mm_cmpeq_epi64 (x, y);
+}
+
+__v2di
+bar (const __v2di x, const __v2di y)
+{
+ return x == y;
+}
+
+__v2di
+baz (const __v2di x, const __v2di y)
+{
+ return x != y;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c
b/gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c
new file mode 100644
index 000000000000..15b8ca418f54
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c
@@ -0,0 +1,73 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mvsx" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+static unsigned short
+int_to_ushort (int iVal)
+{
+ unsigned short sVal;
+
+ if (iVal < 0)
+ sVal = 0;
+ else if (iVal > 0xffff)
+ sVal = 0xffff;
+ else sVal = iVal;
+
+ return sVal;
+}
+
+static void
+TEST (void)
+{
+ union
+ {
+ __m128i x[NUM / 4];
+ int i[NUM];
+ } src1, src2;
+ union
+ {
+ __m128i x[NUM / 4];
+ unsigned short s[NUM * 2];
+ } dst;
+ int i, sign = 1;
+
+ for (i = 0; i < NUM; i++)
+ {
+ src1.i[i] = i * i * sign;
+ src2.i[i] = (i + 20) * sign;
+ sign = -sign;
+ }
+
+ for (i = 0; i < NUM; i += 4)
+ dst.x[i / 4] = _mm_packus_epi32 (src1.x [i / 4], src2.x [i / 4]);
+
+ for (i = 0; i < NUM; i ++)
+ {
+ int dstIndex;
+ unsigned short sVal;
+
+ sVal = int_to_ushort (src1.i[i]);
+ dstIndex = (i % 4) + (i / 4) * 8;
+ if (sVal != dst.s[dstIndex])
+ abort ();
+
+ sVal = int_to_ushort (src2.i[i]);
+ dstIndex += 4;
+ if (sVal != dst.s[dstIndex])
+ abort ();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c
b/gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c
new file mode 100644
index 000000000000..39b9f01d64a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mpower8-vector" } */
+/* { dg-require-effective-target p8vector_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+static void
+TEST (void)
+{
+ union
+ {
+ __m128i x[NUM / 2];
+ long long ll[NUM];
+ } dst, src1, src2;
+ int i, sign=1;
+ long long is_eq;
+
+ for (i = 0; i < NUM; i++)
+ {
+ src1.ll[i] = i * i * sign;
+ src2.ll[i] = (i + 20) * sign;
+ sign = -sign;
+ }
+
+ for (i = 0; i < NUM; i += 2)
+ dst.x [i / 2] = _mm_cmpeq_epi64(src1.x [i / 2], src2.x [i / 2]);
+
+ for (i = 0; i < NUM; i++)
+ {
+ is_eq = src1.ll[i] == src2.ll[i] ? 0xffffffffffffffffLL : 0LL;
+ if (is_eq != dst.ll[i])
+ abort ();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c
b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c
new file mode 100644
index 000000000000..6a884f46235f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mpower8-vector" } */
+/* { dg-require-effective-target p8vector_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+static void
+TEST (void)
+{
+ union
+ {
+ __m128i x[NUM / 2];
+ long long ll[NUM];
+ } dst;
+ union
+ {
+ __m128i x[NUM / 2];
+ int i[NUM * 2];
+ } src1, src2;
+ int i, sign = 1;
+ long long value;
+
+ for (i = 0; i < NUM * 2; i += 2)
+ {
+ src1.i[i] = i * i * sign;
+ src2.i[i] = (i + 20) * sign;
+ sign = -sign;
+ }
+
+ for (i = 0; i < NUM; i += 2)
+ dst.x[i / 2] = _mm_mul_epi32 (src1.x[i / 2], src2.x[i / 2]);
+
+ for (i = 0; i < NUM; i++)
+ {
+ value = (long long) src1.i[i * 2] * (long long) src2.i[i * 2];
+ if (value != dst.ll[i])
+ abort ();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c
b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c
new file mode 100644
index 000000000000..150832915911
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mvsx" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+static void
+TEST (void)
+{
+ union
+ {
+ __m128i x[NUM / 4];
+ int i[NUM];
+ } dst, src1, src2;
+ int i, sign = 1;
+ int value;
+
+ for (i = 0; i < NUM; i++)
+ {
+ src1.i[i] = i * i * sign;
+ src2.i[i] = (i + 20) * sign;
+ sign = -sign;
+ }
+
+ for (i = 0; i < NUM; i += 4)
+ dst.x[i / 4] = _mm_mullo_epi32 (src1.x[i / 4], src2.x[i / 4]);
+
+ for (i = 0; i < NUM; i++)
+ {
+ value = src1.i[i] * src2.i[i];
+ if (value != dst.i[i])
+ abort ();
+ }
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_2-check.h
b/gcc/testsuite/gcc.target/powerpc/sse4_2-check.h
new file mode 100644
index 000000000000..f6264e5a1083
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_2-check.h
@@ -0,0 +1,18 @@
+#define NO_WARN_X86_INTRINSICS 1
+
+static void sse4_2_test (void);
+
+static void
+__attribute__ ((noinline))
+do_test (void)
+{
+ sse4_2_test ();
+}
+
+int
+main ()
+{
+ do_test ();
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c
b/gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c
new file mode 100644
index 000000000000..4bfbad885b30
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mvsx" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_2_test
+#endif
+
+#include CHECK_H
+
+#include <nmmintrin.h>
+
+#define NUM 64
+
+static void
+TEST (void)
+{
+ union
+ {
+ __m128i x[NUM / 2];
+ long long ll[NUM];
+ } dst, src1, src2;
+ int i, sign = 1;
+ long long is_eq;
+
+ for (i = 0; i < NUM; i++)
+ {
+ src1.ll[i] = i * i * sign;
+ src2.ll[i] = (i + 20) * sign;
+ sign = -sign;
+ }
+
+ for (i = 0; i < NUM; i += 2)
+ dst.x[i / 2] = _mm_cmpgt_epi64 (src1.x[i / 2], src2.x[i / 2]);
+
+ for (i = 0; i < NUM; i++)
+ {
+ is_eq = src1.ll[i] > src2.ll[i] ? 0xFFFFFFFFFFFFFFFFLL : 0LL;
+ if (is_eq != dst.ll[i])
+ abort ();
+ }
+}