[PATCH] D46881: [X86][CET] Changing -fcf-protection behavior to comply with gcc (clang part)

2018-05-15 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky created this revision.
mike.dvoretsky added a reviewer: craig.topper.
Herald added a subscriber: cfe-commits.

This patch aims to match the changes introduced in gcc by 
https://gcc.gnu.org/ml/gcc-cvs/2018-04/msg00534.html. The -mibt feature flag is 
being removed, and the -fcf-protection option now also defines a __CET__ macro 
and causes errors when used on non-X86 targets, while X86 targets no longer 
check for -mibt and -mshstk to determine if -fcf-protection is supported. 
-mshstk is now used only to determine availability of shadow stack intrinsics.

Comes with an LLVM patch.


Repository:
  rC Clang

https://reviews.llvm.org/D46881

Files:
  clang/docs/ClangCommandLineReference.rst
  clang/include/clang/Basic/DiagnosticCommonKinds.td
  clang/include/clang/Basic/TargetInfo.h
  clang/include/clang/Driver/Options.td
  clang/lib/Basic/TargetInfo.cpp
  clang/lib/Basic/Targets/X86.cpp
  clang/lib/Basic/Targets/X86.h
  clang/lib/Frontend/CompilerInvocation.cpp
  clang/test/CodeGen/attributes.c
  clang/test/CodeGen/builtins-x86.c
  clang/test/CodeGen/x86-cf-protection.c
  clang/test/Driver/x86-target-features.c
  clang/test/Preprocessor/x86_target_features.c
  clang/test/Sema/attr-nocf_check.c
  clang/test/Sema/attr-nocf_check.cpp

Index: clang/test/Sema/attr-nocf_check.cpp
===
--- clang/test/Sema/attr-nocf_check.cpp
+++ clang/test/Sema/attr-nocf_check.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple=i386-unknown-unknown -verify -fcf-protection=branch -target-feature +ibt -std=c++11 -fsyntax-only %s
+// RUN: %clang_cc1 -triple=i386-unknown-unknown -verify -fcf-protection=branch -std=c++11 -fsyntax-only %s
 
 // Function pointer definition.
 [[gnu::nocf_check]] typedef void (*FuncPointerWithNoCfCheck)(void); // no-warning
Index: clang/test/Sema/attr-nocf_check.c
===
--- clang/test/Sema/attr-nocf_check.c
+++ clang/test/Sema/attr-nocf_check.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -verify -fcf-protection=branch -target-feature +ibt -fsyntax-only %s
+// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -verify -fcf-protection=branch -fsyntax-only %s
 
 // Function pointer definition.
 typedef void (*FuncPointerWithNoCfCheck)(void) __attribute__((nocf_check)); // no-warning
Index: clang/test/Preprocessor/x86_target_features.c
===
--- clang/test/Preprocessor/x86_target_features.c
+++ clang/test/Preprocessor/x86_target_features.c
@@ -380,10 +380,6 @@
 
 // SHSTK: #define __SHSTK__ 1
 
-// RUN: %clang -target i386-unknown-unknown -mibt -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=IBT %s
-
-// IBT: #define __IBT__ 1
-
 // RUN: %clang -target i386-unknown-unknown -march=atom -mrdseed -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=RDSEED %s
 
 // RDSEED: #define __RDSEED__ 1
Index: clang/test/Driver/x86-target-features.c
===
--- clang/test/Driver/x86-target-features.c
+++ clang/test/Driver/x86-target-features.c
@@ -80,11 +80,6 @@
 // CETSS: "-target-feature" "+shstk"
 // NO-CETSS: "-target-feature" "-shstk"
 
-// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mibt %s -### -o %t.o 2>&1 | FileCheck -check-prefix=CETIBT %s
-// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-ibt %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-CETIBT %s
-// CETIBT: "-target-feature" "+ibt"
-// NO-CETIBT: "-target-feature" "-ibt"
-
 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -msgx %s -### -o %t.o 2>&1 | FileCheck -check-prefix=SGX %s
 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-sgx %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-SGX %s
 // SGX: "-target-feature" "+sgx"
Index: clang/test/CodeGen/x86-cf-protection.c
===
--- clang/test/CodeGen/x86-cf-protection.c
+++ clang/test/CodeGen/x86-cf-protection.c
@@ -1,6 +1,8 @@
-// RUN: not %clang_cc1 -fsyntax-only -S -o %t -triple i386-unknown-unknown -fcf-protection=return %s 2>&1 | FileCheck %s --check-prefix=RETURN
-// RUN: not %clang_cc1 -fsyntax-only -S -o %t -triple i386-unknown-unknown -fcf-protection=branch %s 2>&1 | FileCheck %s --check-prefix=BRANCH
+// RUN: %clang -target i386-unknown-unknown -x c -E -dM -o - -fcf-protection=return %s | FileCheck %s --check-prefix=RETURN
+// RUN: %clang -target i386-unknown-unknown -x c -E -dM -o - -fcf-protection=branch %s | FileCheck %s --check-prefix=BRANCH
+// RUN: %clang -target i386-unknown-unknown -x c -E -dM -o - -fcf-protection=full %s   | FileCheck %s --check-prefix=FULL
 
-// RETURN: error: option 'cf-protection=return' cannot be specified without '-mshstk'
-// BRANCH: error: option 'cf-protection=branch' cannot be specified without '-mibt'
+// RETURN: #define __CET__ 2
+// BRANCH: #define __CET__

[PATCH] D46881: [X86][CET] Changing -fcf-protection behavior to comply with gcc (clang part)

2018-05-17 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky updated this revision to Diff 147292.
mike.dvoretsky added a comment.

Removed the unused HasIBT variable declaration from X86.h.


https://reviews.llvm.org/D46881

Files:
  clang/docs/ClangCommandLineReference.rst
  clang/include/clang/Basic/DiagnosticCommonKinds.td
  clang/include/clang/Basic/TargetInfo.h
  clang/include/clang/Driver/Options.td
  clang/lib/Basic/TargetInfo.cpp
  clang/lib/Basic/Targets/X86.cpp
  clang/lib/Basic/Targets/X86.h
  clang/lib/Frontend/CompilerInvocation.cpp
  clang/test/CodeGen/attributes.c
  clang/test/CodeGen/builtins-x86.c
  clang/test/CodeGen/x86-cf-protection.c
  clang/test/Driver/x86-target-features.c
  clang/test/Preprocessor/x86_target_features.c
  clang/test/Sema/attr-nocf_check.c
  clang/test/Sema/attr-nocf_check.cpp

Index: clang/test/Sema/attr-nocf_check.cpp
===
--- clang/test/Sema/attr-nocf_check.cpp
+++ clang/test/Sema/attr-nocf_check.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple=i386-unknown-unknown -verify -fcf-protection=branch -target-feature +ibt -std=c++11 -fsyntax-only %s
+// RUN: %clang_cc1 -triple=i386-unknown-unknown -verify -fcf-protection=branch -std=c++11 -fsyntax-only %s
 
 // Function pointer definition.
 [[gnu::nocf_check]] typedef void (*FuncPointerWithNoCfCheck)(void); // no-warning
Index: clang/test/Sema/attr-nocf_check.c
===
--- clang/test/Sema/attr-nocf_check.c
+++ clang/test/Sema/attr-nocf_check.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -verify -fcf-protection=branch -target-feature +ibt -fsyntax-only %s
+// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -verify -fcf-protection=branch -fsyntax-only %s
 
 // Function pointer definition.
 typedef void (*FuncPointerWithNoCfCheck)(void) __attribute__((nocf_check)); // no-warning
Index: clang/test/Preprocessor/x86_target_features.c
===
--- clang/test/Preprocessor/x86_target_features.c
+++ clang/test/Preprocessor/x86_target_features.c
@@ -380,10 +380,6 @@
 
 // SHSTK: #define __SHSTK__ 1
 
-// RUN: %clang -target i386-unknown-unknown -mibt -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=IBT %s
-
-// IBT: #define __IBT__ 1
-
 // RUN: %clang -target i386-unknown-unknown -march=atom -mrdseed -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=RDSEED %s
 
 // RDSEED: #define __RDSEED__ 1
Index: clang/test/Driver/x86-target-features.c
===
--- clang/test/Driver/x86-target-features.c
+++ clang/test/Driver/x86-target-features.c
@@ -80,11 +80,6 @@
 // CETSS: "-target-feature" "+shstk"
 // NO-CETSS: "-target-feature" "-shstk"
 
-// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mibt %s -### -o %t.o 2>&1 | FileCheck -check-prefix=CETIBT %s
-// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-ibt %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-CETIBT %s
-// CETIBT: "-target-feature" "+ibt"
-// NO-CETIBT: "-target-feature" "-ibt"
-
 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -msgx %s -### -o %t.o 2>&1 | FileCheck -check-prefix=SGX %s
 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-sgx %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-SGX %s
 // SGX: "-target-feature" "+sgx"
Index: clang/test/CodeGen/x86-cf-protection.c
===
--- clang/test/CodeGen/x86-cf-protection.c
+++ clang/test/CodeGen/x86-cf-protection.c
@@ -1,6 +1,8 @@
-// RUN: not %clang_cc1 -fsyntax-only -S -o %t -triple i386-unknown-unknown -fcf-protection=return %s 2>&1 | FileCheck %s --check-prefix=RETURN
-// RUN: not %clang_cc1 -fsyntax-only -S -o %t -triple i386-unknown-unknown -fcf-protection=branch %s 2>&1 | FileCheck %s --check-prefix=BRANCH
+// RUN: %clang -target i386-unknown-unknown -x c -E -dM -o - -fcf-protection=return %s | FileCheck %s --check-prefix=RETURN
+// RUN: %clang -target i386-unknown-unknown -x c -E -dM -o - -fcf-protection=branch %s | FileCheck %s --check-prefix=BRANCH
+// RUN: %clang -target i386-unknown-unknown -x c -E -dM -o - -fcf-protection=full %s   | FileCheck %s --check-prefix=FULL
 
-// RETURN: error: option 'cf-protection=return' cannot be specified without '-mshstk'
-// BRANCH: error: option 'cf-protection=branch' cannot be specified without '-mibt'
+// RETURN: #define __CET__ 2
+// BRANCH: #define __CET__ 1
+// FULL: #define __CET__ 3
 void foo() {}
Index: clang/test/CodeGen/builtins-x86.c
===
--- clang/test/CodeGen/builtins-x86.c
+++ clang/test/CodeGen/builtins-x86.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -DUSE_64 -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -target-feature +mwaitx -target-feature +clzero -target-fea

[PATCH] D45202: [X86] Replacing X86-specific floor and ceil vector intrinsics with generic LLVM intrinsics

2018-06-01 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky updated this revision to Diff 149484.
mike.dvoretsky added a comment.

Changed the scalar intrinsic lowering to work via extract-insert. 
https://reviews.llvm.org/D45203 contains tests for folding the resulting IR 
patterns.


https://reviews.llvm.org/D45202

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/avx-builtins.c
  clang/test/CodeGen/avx512f-builtins.c
  clang/test/CodeGen/sse41-builtins.c

Index: clang/test/CodeGen/sse41-builtins.c
===
--- clang/test/CodeGen/sse41-builtins.c
+++ clang/test/CodeGen/sse41-builtins.c
@@ -44,25 +44,31 @@
 
 __m128d test_mm_ceil_pd(__m128d x) {
   // CHECK-LABEL: test_mm_ceil_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 2)
+  // CHECK: @llvm.ceil.v2f64
+  // CHECK-NOT: select
   return _mm_ceil_pd(x);
 }
 
 __m128 test_mm_ceil_ps(__m128 x) {
   // CHECK-LABEL: test_mm_ceil_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2)
+  // CHECK: @llvm.ceil.v4f32
+  // CHECK-NOT: select
   return _mm_ceil_ps(x);
 }
 
 __m128d test_mm_ceil_sd(__m128d x, __m128d y) {
   // CHECK-LABEL: test_mm_ceil_sd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 2)
+  // CHECK: extractelement
+  // CHECK: @llvm.ceil.f64
+  // CHECK: insertelement
   return _mm_ceil_sd(x, y);
 }
 
 __m128 test_mm_ceil_ss(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_ceil_ss
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 2)
+  // CHECK: extractelement
+  // CHECK: @llvm.ceil.f32
+  // CHECK: insertelement
   return _mm_ceil_ss(x, y);
 }
 
@@ -196,25 +202,31 @@
 
 __m128d test_mm_floor_pd(__m128d x) {
   // CHECK-LABEL: test_mm_floor_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 1)
+  // CHECK: @llvm.floor.v2f64
+  // CHECK-NOT: select
   return _mm_floor_pd(x);
 }
 
 __m128 test_mm_floor_ps(__m128 x) {
   // CHECK-LABEL: test_mm_floor_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 1)
+  // CHECK: @llvm.floor.v4f32
+  // CHECK-NOT: select
   return _mm_floor_ps(x);
 }
 
 __m128d test_mm_floor_sd(__m128d x, __m128d y) {
   // CHECK-LABEL: test_mm_floor_sd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 1)
+  // CHECK: extractelement
+  // CHECK: @llvm.floor.f64
+  // CHECK: insertelement
   return _mm_floor_sd(x, y);
 }
 
 __m128 test_mm_floor_ss(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_floor_ss
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 1)
+  // CHECK: extractelement
+  // CHECK: @llvm.floor.f32
+  // CHECK: insertelement
   return _mm_floor_ss(x, y);
 }
 
Index: clang/test/CodeGen/avx512f-builtins.c
===
--- clang/test/CodeGen/avx512f-builtins.c
+++ clang/test/CodeGen/avx512f-builtins.c
@@ -7565,46 +7565,98 @@
   return _mm512_min_round_ps(__A,__B,_MM_FROUND_CUR_DIRECTION);
 }
 
+__m512 test_mm512_floor_ps(__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_floor_ps
+  // CHECK: @llvm.floor.v16f32
+  // CHECK-NOT: select
+  return _mm512_floor_ps(__A);
+}
+
+__m512d test_mm512_floor_pd(__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_floor_pd
+  // CHECK: @llvm.floor.v8f64
+  // CHECK-NOT: select
+  return _mm512_floor_pd(__A);
+}
+
 __m512 test_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_floor_ps 
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  // CHECK-LABEL: @test_mm512_mask_floor_ps
+  // CHECK: @llvm.floor.v16f32
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_floor_ps (__W,__U,__A);
 }
 
 __m512d test_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_floor_pd 
-  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  // CHECK-LABEL: @test_mm512_mask_floor_pd
+  // CHECK: @llvm.floor.v8f64
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_floor_pd (__W,__U,__A);
 }
 
+__m512 test_mm512_ceil_ps(__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_ceil_ps
+  // CHECK: @llvm.ceil.v16f32
+  // CHECK-NOT: select
+  return _mm512_ceil_ps(__A);
+}
+
+__m512d test_mm512_ceil_pd(__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_ceil_pd
+  // CHECK: @llvm.ceil.v8f64
+  // CHECK-NOT: select
+  return _mm512_ceil_pd(__A);
+}
+
 __m512 test_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_ceil_ps 
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  // CHECK-LABEL: @test_mm512_mask_ceil_ps
+  // CHECK: @llvm.ceil.v16f32
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_ceil_ps (__W,__U,__A);
 }

[PATCH] D45720: [X86] Lowering PACK*S (pack with saturation) intrinsics to native IR (clang side)

2018-06-05 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky abandoned this revision.
mike.dvoretsky added a comment.

Closing this due to failure of https://reviews.llvm.org/D45721.


https://reviews.llvm.org/D45720



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D45722: [X86] Lowering SAD (sum of absolute differences) intrinsics to native IR (clang side)

2018-06-05 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky abandoned this revision.
mike.dvoretsky added a comment.

Closing this due to failure of https://reviews.llvm.org/D45723.


https://reviews.llvm.org/D45722



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D45202: [X86] Replacing X86-specific floor and ceil vector intrinsics with generic LLVM intrinsics

2018-06-15 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky abandoned this revision.
mike.dvoretsky added a comment.

Abandoning this due to https://reviews.llvm.org/D48067 being accepted instead.


https://reviews.llvm.org/D45202



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D45720: [X86] Lowering PACK*S (pack with saturation) intrinsics to native IR (clang side)

2018-04-17 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky created this revision.
mike.dvoretsky added reviewers: craig.topper, spatel.
Herald added a subscriber: cfe-commits.

This patch lowers the X86 vector packing with saturation intrinsics to native 
LLVM IR. Comes with an LLVM patch.


Repository:
  rC Clang

https://reviews.llvm.org/D45720

Files:
  lib/CodeGen/CGBuiltin.cpp
  test/CodeGen/avx2-builtins.c
  test/CodeGen/avx512bw-builtins.c
  test/CodeGen/avx512vlbw-builtins.c
  test/CodeGen/sse2-builtins.c
  test/CodeGen/sse41-builtins.c

Index: test/CodeGen/sse41-builtins.c
===
--- test/CodeGen/sse41-builtins.c
+++ test/CodeGen/sse41-builtins.c
@@ -328,7 +328,12 @@
 
 __m128i test_mm_packus_epi32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_packus_epi32
-  // CHECK: call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, zeroinitializer
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> zeroinitializer
+  // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16>
   return _mm_packus_epi32(x, y);
 }
 
Index: test/CodeGen/sse2-builtins.c
===
--- test/CodeGen/sse2-builtins.c
+++ test/CodeGen/sse2-builtins.c
@@ -869,19 +869,34 @@
 
 __m128i test_mm_packs_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_packs_epi16
-  // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+  // CHECK: %{{.*}} = shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> 
+  // CHECK: %{{.*}} = icmp slt <16 x i16> %{{.*}}, 
+  // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> 
+  // CHECK: %{{.*}} = icmp sgt <16 x i16> %{{.*}}, 
+  // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> 
+  // CHECK: %{{.*}} = trunc <16 x i16> %{{.*}} to <16 x i8>
   return _mm_packs_epi16(A, B);
 }
 
 __m128i test_mm_packs_epi32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_packs_epi32
-  // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16>
   return _mm_packs_epi32(A, B);
 }
 
 __m128i test_mm_packus_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_packus_epi16
-  // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+  // CHECK: %{{.*}} = shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> 
+  // CHECK: %{{.*}} = icmp slt <16 x i16> %{{.*}}, 
+  // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> 
+  // CHECK: %{{.*}} = icmp sgt <16 x i16> %{{.*}}, zeroinitializer
+  // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> zeroinitializer
+  // CHECK: %{{.*}} = trunc <16 x i16> %{{.*}} to <16 x i8>
   return _mm_packus_epi16(A, B);
 }
 
Index: test/CodeGen/avx512vlbw-builtins.c
===
--- test/CodeGen/avx512vlbw-builtins.c
+++ test/CodeGen/avx512vlbw-builtins.c
@@ -970,105 +970,185 @@
 
 __m128i test_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_packs_epi32
-  // CHECK: @llvm.x86.sse2.packssdw
+  // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16>
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_packs_epi32(__M,__A,__B); 
 }
 __m128i test_mm_mask_packs_epi32(__m128i __W, __mmask16 __M, __m128i __A,  __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_packs_epi32
-  // CHECK: @llvm.x86.sse2.packssdw
+  // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16>
   // 

[PATCH] D45722: [X86] Lowering SAD (sum of absolute differences) intrinsics to native IR (clang side)

2018-04-17 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky created this revision.
mike.dvoretsky added reviewers: craig.topper, spatel.
Herald added a subscriber: cfe-commits.

This patch lowers the SAD intrinsics to native LLVM IR. Comes with an LLVM 
patch.


Repository:
  rC Clang

https://reviews.llvm.org/D45722

Files:
  lib/CodeGen/CGBuiltin.cpp
  test/CodeGen/avx2-builtins.c
  test/CodeGen/avx512bw-builtins.c
  test/CodeGen/sse2-builtins.c

Index: test/CodeGen/sse2-builtins.c
===
--- test/CodeGen/sse2-builtins.c
+++ test/CodeGen/sse2-builtins.c
@@ -893,7 +893,33 @@
 
 __m128i test_mm_sad_epu8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sad_epu8
-  // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK: %{{.*}} = icmp ugt <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = sub <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = sub <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
   return _mm_sad_epu8(A, B);
 }
 
Index: test/CodeGen/avx512bw-builtins.c
===
--- test/CodeGen/avx512bw-builtins.c
+++ test/CodeGen/avx512bw-builtins.c
@@ -1945,7 +1945,33 @@
 
 __m512i test_mm512_sad_epu8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_sad_epu8
-  // CHECK: @llvm.x86.avx512.psad.bw.512
+  // CHECK: %{{.*}} = icmp ugt <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = sub <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = sub <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
   return _mm512_sad_epu8(__A, __B); 
 }
 
Index: test/CodeGen/avx2-builtins.c
===
--- test/CodeGen/avx2-builtins.c
+++ test/CodeGen/avx2-builtins.c
@@ -943,7 +943,33 @@
 
 __m256i test_mm256_sad_epu8(__m256i x, __m256i y) {
   // CHECK-LABEL: 

[PATCH] D45720: [X86] Lowering PACK*S (pack with saturation) intrinsics to native IR (clang side)

2018-04-18 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky added inline comments.



Comment at: lib/CodeGen/CGBuiltin.cpp:8443
+  Value *MaxVec = llvm::ConstantInt::get(RTy, MaxVal);
+  Res = EmitX86MinMax(CGF, ICmpInst::ICMP_SLT, {Res, MaxVec});
+  Res = EmitX86MinMax(CGF, ICmpInst::ICMP_SGT, {Res, MinVec});

craig.topper wrote:
> Why arent' these unsigned compares for Unsigned?
The compares are signed on purpose. PACKUS assumes that the input elements are 
signed, then uses unsigned saturation. So, for instance, an 0x value must 
be evaluated as -1 and saturated to 0, rather than to 0xff as it would be with 
unsigned comparisons.


Repository:
  rC Clang

https://reviews.llvm.org/D45720



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D45720: [X86] Lowering PACK*S (pack with saturation) intrinsics to native IR (clang side)

2018-04-18 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky updated this revision to Diff 142899.
mike.dvoretsky added a comment.

Updated per comments.


https://reviews.llvm.org/D45720

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/avx2-builtins.c
  clang/test/CodeGen/avx512bw-builtins.c
  clang/test/CodeGen/avx512vlbw-builtins.c
  clang/test/CodeGen/sse2-builtins.c
  clang/test/CodeGen/sse41-builtins.c

Index: clang/test/CodeGen/sse41-builtins.c
===
--- clang/test/CodeGen/sse41-builtins.c
+++ clang/test/CodeGen/sse41-builtins.c
@@ -328,7 +328,12 @@
 
 __m128i test_mm_packus_epi32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_packus_epi32
-  // CHECK: call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, zeroinitializer
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> zeroinitializer
+  // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16>
   return _mm_packus_epi32(x, y);
 }
 
Index: clang/test/CodeGen/sse2-builtins.c
===
--- clang/test/CodeGen/sse2-builtins.c
+++ clang/test/CodeGen/sse2-builtins.c
@@ -869,19 +869,34 @@
 
 __m128i test_mm_packs_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_packs_epi16
-  // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+  // CHECK: %{{.*}} = shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> 
+  // CHECK: %{{.*}} = icmp slt <16 x i16> %{{.*}}, 
+  // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> 
+  // CHECK: %{{.*}} = icmp sgt <16 x i16> %{{.*}}, 
+  // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> 
+  // CHECK: %{{.*}} = trunc <16 x i16> %{{.*}} to <16 x i8>
   return _mm_packs_epi16(A, B);
 }
 
 __m128i test_mm_packs_epi32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_packs_epi32
-  // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16>
   return _mm_packs_epi32(A, B);
 }
 
 __m128i test_mm_packus_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_packus_epi16
-  // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+  // CHECK: %{{.*}} = shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> 
+  // CHECK: %{{.*}} = icmp slt <16 x i16> %{{.*}}, 
+  // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> 
+  // CHECK: %{{.*}} = icmp sgt <16 x i16> %{{.*}}, zeroinitializer
+  // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> zeroinitializer
+  // CHECK: %{{.*}} = trunc <16 x i16> %{{.*}} to <16 x i8>
   return _mm_packus_epi16(A, B);
 }
 
Index: clang/test/CodeGen/avx512vlbw-builtins.c
===
--- clang/test/CodeGen/avx512vlbw-builtins.c
+++ clang/test/CodeGen/avx512vlbw-builtins.c
@@ -970,105 +970,185 @@
 
 __m128i test_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_packs_epi32
-  // CHECK: @llvm.x86.sse2.packssdw
+  // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16>
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_packs_epi32(__M,__A,__B); 
 }
 __m128i test_mm_mask_packs_epi32(__m128i __W, __mmask16 __M, __m128i __A,  __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_packs_epi32
-  // CHECK: @llvm.x86.sse2.packssdw
+  // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16>
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   r

[PATCH] D45722: [X86] Lowering SAD (sum of absolute differences) intrinsics to native IR (clang side)

2018-04-18 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky updated this revision to Diff 142914.
mike.dvoretsky added a comment.

Updated per comments.


https://reviews.llvm.org/D45722

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/avx2-builtins.c
  clang/test/CodeGen/avx512bw-builtins.c
  clang/test/CodeGen/sse2-builtins.c

Index: clang/test/CodeGen/sse2-builtins.c
===
--- clang/test/CodeGen/sse2-builtins.c
+++ clang/test/CodeGen/sse2-builtins.c
@@ -893,7 +893,33 @@
 
 __m128i test_mm_sad_epu8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sad_epu8
-  // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK: %{{.*}} = icmp ugt <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = sub <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = sub <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
   return _mm_sad_epu8(A, B);
 }
 
Index: clang/test/CodeGen/avx512bw-builtins.c
===
--- clang/test/CodeGen/avx512bw-builtins.c
+++ clang/test/CodeGen/avx512bw-builtins.c
@@ -1945,7 +1945,33 @@
 
 __m512i test_mm512_sad_epu8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_sad_epu8
-  // CHECK: @llvm.x86.avx512.psad.bw.512
+  // CHECK: %{{.*}} = icmp ugt <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = sub <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = sub <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
   return _mm512_sad_epu8(__A, __B); 
 }
 
Index: clang/test/CodeGen/avx2-builtins.c
===
--- clang/test/CodeGen/avx2-builtins.c
+++ clang/test/CodeGen/avx2-builtins.c
@@ -943,7 +943,33 @@
 
 __m256i test_mm256_sad_epu8(__m256i x, __m256i y) {
   // CHECK-LABEL: test_mm256_sad_epu8
-  // CHECK: call <4 x i64> @llvm.

[PATCH] D45722: [X86] Lowering SAD (sum of absolute differences) intrinsics to native IR (clang side)

2018-04-24 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky updated this revision to Diff 143715.
mike.dvoretsky added a comment.

Updated per comments.


https://reviews.llvm.org/D45722

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/avx2-builtins.c
  clang/test/CodeGen/avx512bw-builtins.c
  clang/test/CodeGen/sse2-builtins.c

Index: clang/test/CodeGen/sse2-builtins.c
===
--- clang/test/CodeGen/sse2-builtins.c
+++ clang/test/CodeGen/sse2-builtins.c
@@ -893,7 +893,33 @@
 
 __m128i test_mm_sad_epu8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sad_epu8
-  // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK: %{{.*}} = icmp ugt <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = sub <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = sub <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
   return _mm_sad_epu8(A, B);
 }
 
Index: clang/test/CodeGen/avx512bw-builtins.c
===
--- clang/test/CodeGen/avx512bw-builtins.c
+++ clang/test/CodeGen/avx512bw-builtins.c
@@ -1945,7 +1945,33 @@
 
 __m512i test_mm512_sad_epu8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_sad_epu8
-  // CHECK: @llvm.x86.avx512.psad.bw.512
+  // CHECK: %{{.*}} = icmp ugt <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = sub <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = sub <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
   return _mm512_sad_epu8(__A, __B); 
 }
 
Index: clang/test/CodeGen/avx2-builtins.c
===
--- clang/test/CodeGen/avx2-builtins.c
+++ clang/test/CodeGen/avx2-builtins.c
@@ -943,7 +943,33 @@
 
 __m256i test_mm256_sad_epu8(__m256i x, __m256i y) {
   // CHECK-LABEL: test_mm256_sad_epu8
- 

[PATCH] D45722: [X86] Lowering SAD (sum of absolute differences) intrinsics to native IR (clang side)

2018-04-25 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky updated this revision to Diff 143893.
mike.dvoretsky marked 4 inline comments as done.
mike.dvoretsky added a subscriber: ashlykov.
mike.dvoretsky added a comment.

Updated per comments.


https://reviews.llvm.org/D45722

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/avx2-builtins.c
  clang/test/CodeGen/avx512bw-builtins.c
  clang/test/CodeGen/sse2-builtins.c

Index: clang/test/CodeGen/sse2-builtins.c
===
--- clang/test/CodeGen/sse2-builtins.c
+++ clang/test/CodeGen/sse2-builtins.c
@@ -893,7 +893,33 @@
 
 __m128i test_mm_sad_epu8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sad_epu8
-  // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK: %{{.*}} = icmp ugt <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = sub <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = sub <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> 
+  // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64>
+  // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}}
   return _mm_sad_epu8(A, B);
 }
 
Index: clang/test/CodeGen/avx512bw-builtins.c
===
--- clang/test/CodeGen/avx512bw-builtins.c
+++ clang/test/CodeGen/avx512bw-builtins.c
@@ -1945,7 +1945,33 @@
 
 __m512i test_mm512_sad_epu8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_sad_epu8
-  // CHECK: @llvm.x86.avx512.psad.bw.512
+  // CHECK: %{{.*}} = icmp ugt <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = sub <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = sub <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64>
+  // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}}
   return _mm512_sad_epu8(__A, __B); 
 }
 
Index: clang/test/CodeGen/avx2-builtins.c
===
--- clang/test/CodeGen/avx2-builtins.c
+++ clang/test/CodeGen/avx2-builtins.c
@@ -943,7 +943,33 @@
 

[PATCH] D45720: [X86] Lowering PACK*S (pack with saturation) intrinsics to native IR (clang side)

2018-04-26 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky updated this revision to Diff 144126.
mike.dvoretsky added a comment.

Changed the shuffle mask emission code to match https://reviews.llvm.org/D45721.


https://reviews.llvm.org/D45720

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/avx2-builtins.c
  clang/test/CodeGen/avx512bw-builtins.c
  clang/test/CodeGen/avx512vlbw-builtins.c
  clang/test/CodeGen/sse2-builtins.c
  clang/test/CodeGen/sse41-builtins.c

Index: clang/test/CodeGen/sse41-builtins.c
===
--- clang/test/CodeGen/sse41-builtins.c
+++ clang/test/CodeGen/sse41-builtins.c
@@ -328,7 +328,12 @@
 
 __m128i test_mm_packus_epi32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_packus_epi32
-  // CHECK: call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, zeroinitializer
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> zeroinitializer
+  // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16>
   return _mm_packus_epi32(x, y);
 }
 
Index: clang/test/CodeGen/sse2-builtins.c
===
--- clang/test/CodeGen/sse2-builtins.c
+++ clang/test/CodeGen/sse2-builtins.c
@@ -869,19 +869,34 @@
 
 __m128i test_mm_packs_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_packs_epi16
-  // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+  // CHECK: %{{.*}} = shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> 
+  // CHECK: %{{.*}} = icmp slt <16 x i16> %{{.*}}, 
+  // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> 
+  // CHECK: %{{.*}} = icmp sgt <16 x i16> %{{.*}}, 
+  // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> 
+  // CHECK: %{{.*}} = trunc <16 x i16> %{{.*}} to <16 x i8>
   return _mm_packs_epi16(A, B);
 }
 
 __m128i test_mm_packs_epi32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_packs_epi32
-  // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16>
   return _mm_packs_epi32(A, B);
 }
 
 __m128i test_mm_packus_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_packus_epi16
-  // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+  // CHECK: %{{.*}} = shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> 
+  // CHECK: %{{.*}} = icmp slt <16 x i16> %{{.*}}, 
+  // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> 
+  // CHECK: %{{.*}} = icmp sgt <16 x i16> %{{.*}}, zeroinitializer
+  // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> zeroinitializer
+  // CHECK: %{{.*}} = trunc <16 x i16> %{{.*}} to <16 x i8>
   return _mm_packus_epi16(A, B);
 }
 
Index: clang/test/CodeGen/avx512vlbw-builtins.c
===
--- clang/test/CodeGen/avx512vlbw-builtins.c
+++ clang/test/CodeGen/avx512vlbw-builtins.c
@@ -970,105 +970,185 @@
 
 __m128i test_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_packs_epi32
-  // CHECK: @llvm.x86.sse2.packssdw
+  // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16>
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_packs_epi32(__M,__A,__B); 
 }
 __m128i test_mm_mask_packs_epi32(__m128i __W, __mmask16 __M, __m128i __A,  __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_packs_epi32
-  // CHECK: @llvm.x86.sse2.packssdw
+  // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, 
+  // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> 
+  // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16>
   // CHECK: select 

[PATCH] D48712: [X86] Lowering integer truncation intrinsics to native IR

2018-06-28 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky created this revision.
mike.dvoretsky added a reviewer: craig.topper.
Herald added a subscriber: cfe-commits.

This patch lowers the _mm[256|512]_cvtepi{64|32|16}_epi{32|16|8} intrinsics to 
native IR in cases where the result's length is less than 128 bits.

The resulting IR is folded into VPMOV instructions in 
https://reviews.llvm.org/D46957, with the exception of _mm_cvtepi64_epi8, where 
a PSHUFB instruction is currently produced instead.


Repository:
  rC Clang

https://reviews.llvm.org/D48712

Files:
  clang/lib/Headers/avx512vlbwintrin.h
  clang/lib/Headers/avx512vlintrin.h
  clang/test/CodeGen/avx512vl-builtins.c
  clang/test/CodeGen/avx512vlbw-builtins.c

Index: clang/test/CodeGen/avx512vlbw-builtins.c
===
--- clang/test/CodeGen/avx512vlbw-builtins.c
+++ clang/test/CodeGen/avx512vlbw-builtins.c
@@ -1792,7 +1792,8 @@
 
 __m128i test_mm_cvtepi16_epi8(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi16_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.wb.128
+  // CHECK: trunc <8 x i16> %{{.*}} to <8 x i8>
+  // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> 
   return _mm_cvtepi16_epi8(__A); 
 }
 
Index: clang/test/CodeGen/avx512vl-builtins.c
===
--- clang/test/CodeGen/avx512vl-builtins.c
+++ clang/test/CodeGen/avx512vl-builtins.c
@@ -6974,7 +6974,8 @@
 
 __m128i test_mm_cvtepi32_epi8(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi32_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.db.128
+  // CHECK: trunc <4 x i32> %{{.*}} to <4 x i8>
+  // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> 
   return _mm_cvtepi32_epi8(__A); 
 }
 
@@ -6998,7 +6999,8 @@
 
 __m128i test_mm256_cvtepi32_epi8(__m256i __A) {
   // CHECK-LABEL: @test_mm256_cvtepi32_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.db.256
+  // CHECK: trunc <8 x i32> %{{.*}} to <8 x i8>
+  // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> 
   return _mm256_cvtepi32_epi8(__A); 
 }
 
@@ -7022,7 +7024,8 @@
 
 __m128i test_mm_cvtepi32_epi16(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi32_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.dw.128
+  // CHECK: trunc <4 x i32> %{{.*}} to <4 x i16>
+  // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> 
   return _mm_cvtepi32_epi16(__A); 
 }
 
@@ -7070,7 +7073,8 @@
 
 __m128i test_mm_cvtepi64_epi8(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi64_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.qb.128
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i8>
+  // CHECK: shufflevector <2 x i8> %{{.*}}, <2 x i8> %{{.*}}, <16 x i32> 
   return _mm_cvtepi64_epi8(__A); 
 }
 
@@ -7094,7 +7098,8 @@
 
 __m128i test_mm256_cvtepi64_epi8(__m256i __A) {
   // CHECK-LABEL: @test_mm256_cvtepi64_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.qb.256
+  // CHECK: trunc <4 x i64> %{{.*}} to <4 x i8>
+  // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> 
   return _mm256_cvtepi64_epi8(__A); 
 }
 
@@ -7118,7 +7123,8 @@
 
 __m128i test_mm_cvtepi64_epi32(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi64_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmov.qd.128
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  // CHECK: shufflevector <2 x i32> %{{.*}}, <2 x i32> %{{.*}}, <4 x i32> 
   return _mm_cvtepi64_epi32(__A); 
 }
 
@@ -7168,7 +7174,8 @@
 
 __m128i test_mm_cvtepi64_epi16(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi64_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.qw.128
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i16>
+  // CHECK: shufflevector <2 x i16> %{{.*}}, <2 x i16> %{{.*}}, <8 x i32> 
   return _mm_cvtepi64_epi16(__A); 
 }
 
@@ -7192,7 +7199,8 @@
 
 __m128i test_mm256_cvtepi64_epi16(__m256i __A) {
   // CHECK-LABEL: @test_mm256_cvtepi64_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.qw.256
+  // CHECK: trunc <4 x i64> %{{.*}} to <4 x i16>
+  // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> 
   return _mm256_cvtepi64_epi16(__A); 
 }
 
Index: clang/lib/Headers/avx512vlintrin.h
===
--- clang/lib/Headers/avx512vlintrin.h
+++ clang/lib/Headers/avx512vlintrin.h
@@ -30,6 +30,7 @@
 
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl")))
 
+typedef short __v2hi __attribute__((__vector_size__(4)));
 typedef char __v4qi __attribute__((__vector_size__(4)));
 typedef char __v2qi __attribute__((__vector_size__(2)));
 
@@ -7415,10 +7416,9 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi32_epi8 (__m128i __A)
 {
-  return __builtin_shufflevector(__builtin_convertvector((__v4si)__A, __v4qi),
- (__v4qi) {0, 0, 0, 0},
- 0, 1, 2, 3, 4, 5, 6, 7,
- 7, 7, 7, 7, 7, 7, 7, 7);
+  return (__m128i)__builtin_shufflevector(
+  __builtin_convertvector((__v4si)__A, __v4qi), (__v4q

[PATCH] D48712: [X86] Lowering integer truncation intrinsics to native IR

2018-06-28 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky updated this revision to Diff 153277.
mike.dvoretsky added a comment.

Uploaded the correct diff.


https://reviews.llvm.org/D48712

Files:
  clang/lib/Headers/avx512vlbwintrin.h
  clang/lib/Headers/avx512vlintrin.h
  clang/test/CodeGen/avx512vl-builtins.c
  clang/test/CodeGen/avx512vlbw-builtins.c

Index: clang/test/CodeGen/avx512vlbw-builtins.c
===
--- clang/test/CodeGen/avx512vlbw-builtins.c
+++ clang/test/CodeGen/avx512vlbw-builtins.c
@@ -1792,7 +1792,8 @@
 
 __m128i test_mm_cvtepi16_epi8(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi16_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.wb.128
+  // CHECK: trunc <8 x i16> %{{.*}} to <8 x i8>
+  // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> 
   return _mm_cvtepi16_epi8(__A); 
 }
 
Index: clang/test/CodeGen/avx512vl-builtins.c
===
--- clang/test/CodeGen/avx512vl-builtins.c
+++ clang/test/CodeGen/avx512vl-builtins.c
@@ -6974,7 +6974,8 @@
 
 __m128i test_mm_cvtepi32_epi8(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi32_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.db.128
+  // CHECK: trunc <4 x i32> %{{.*}} to <4 x i8>
+  // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> 
   return _mm_cvtepi32_epi8(__A); 
 }
 
@@ -6998,7 +6999,8 @@
 
 __m128i test_mm256_cvtepi32_epi8(__m256i __A) {
   // CHECK-LABEL: @test_mm256_cvtepi32_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.db.256
+  // CHECK: trunc <8 x i32> %{{.*}} to <8 x i8>
+  // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> 
   return _mm256_cvtepi32_epi8(__A); 
 }
 
@@ -7022,7 +7024,8 @@
 
 __m128i test_mm_cvtepi32_epi16(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi32_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.dw.128
+  // CHECK: trunc <4 x i32> %{{.*}} to <4 x i16>
+  // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> 
   return _mm_cvtepi32_epi16(__A); 
 }
 
@@ -7070,7 +7073,8 @@
 
 __m128i test_mm_cvtepi64_epi8(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi64_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.qb.128
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i8>
+  // CHECK: shufflevector <2 x i8> %{{.*}}, <2 x i8> %{{.*}}, <16 x i32> 
   return _mm_cvtepi64_epi8(__A); 
 }
 
@@ -7094,7 +7098,8 @@
 
 __m128i test_mm256_cvtepi64_epi8(__m256i __A) {
   // CHECK-LABEL: @test_mm256_cvtepi64_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.qb.256
+  // CHECK: trunc <4 x i64> %{{.*}} to <4 x i8>
+  // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> 
   return _mm256_cvtepi64_epi8(__A); 
 }
 
@@ -7118,7 +7123,8 @@
 
 __m128i test_mm_cvtepi64_epi32(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi64_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmov.qd.128
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  // CHECK: shufflevector <2 x i32> %{{.*}}, <2 x i32> %{{.*}}, <4 x i32> 
   return _mm_cvtepi64_epi32(__A); 
 }
 
@@ -7168,7 +7174,8 @@
 
 __m128i test_mm_cvtepi64_epi16(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi64_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.qw.128
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i16>
+  // CHECK: shufflevector <2 x i16> %{{.*}}, <2 x i16> %{{.*}}, <8 x i32> 
   return _mm_cvtepi64_epi16(__A); 
 }
 
@@ -7192,7 +7199,8 @@
 
 __m128i test_mm256_cvtepi64_epi16(__m256i __A) {
   // CHECK-LABEL: @test_mm256_cvtepi64_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.qw.256
+  // CHECK: trunc <4 x i64> %{{.*}} to <4 x i16>
+  // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> 
   return _mm256_cvtepi64_epi16(__A); 
 }
 
Index: clang/lib/Headers/avx512vlintrin.h
===
--- clang/lib/Headers/avx512vlintrin.h
+++ clang/lib/Headers/avx512vlintrin.h
@@ -30,6 +30,10 @@
 
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl")))
 
+typedef short __v2hi __attribute__((__vector_size__(4)));
+typedef char __v4qi __attribute__((__vector_size__(4)));
+typedef char __v2qi __attribute__((__vector_size__(2)));
+
 /* Integer compare */
 
 #define _mm_cmpeq_epi32_mask(A, B) \
@@ -7412,9 +7416,9 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi32_epi8 (__m128i __A)
 {
-  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
-  (__v16qi)_mm_undefined_si128(),
-  (__mmask8) -1);
+  return (__m128i)__builtin_shufflevector(
+  __builtin_convertvector((__v4si)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1,
+  2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -7442,9 +7446,10 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm256_cvtepi32_epi8 (__m256i __A)
 {
-  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
-  (__v16qi)_mm_undefined_si128(),
-  (__mmask8) -1);
+  return (__m128i)__builtin_shufflevector(
+   

[PATCH] D48712: [X86] Lowering integer truncation intrinsics to native IR

2018-06-29 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky updated this revision to Diff 153471.
mike.dvoretsky marked 2 inline comments as done.
mike.dvoretsky added a comment.

Updated per comments. Typedefs for intermediate short vectors moved into the 
bodies of the functions using them.


https://reviews.llvm.org/D48712

Files:
  clang/lib/Headers/avx512vlbwintrin.h
  clang/lib/Headers/avx512vlintrin.h
  clang/test/CodeGen/avx512vl-builtins.c
  clang/test/CodeGen/avx512vlbw-builtins.c

Index: clang/test/CodeGen/avx512vlbw-builtins.c
===
--- clang/test/CodeGen/avx512vlbw-builtins.c
+++ clang/test/CodeGen/avx512vlbw-builtins.c
@@ -1792,7 +1792,8 @@
 
 __m128i test_mm_cvtepi16_epi8(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi16_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.wb.128
+  // CHECK: trunc <8 x i16> %{{.*}} to <8 x i8>
+  // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> 
   return _mm_cvtepi16_epi8(__A); 
 }
 
Index: clang/test/CodeGen/avx512vl-builtins.c
===
--- clang/test/CodeGen/avx512vl-builtins.c
+++ clang/test/CodeGen/avx512vl-builtins.c
@@ -6974,7 +6974,8 @@
 
 __m128i test_mm_cvtepi32_epi8(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi32_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.db.128
+  // CHECK: trunc <4 x i32> %{{.*}} to <4 x i8>
+  // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> 
   return _mm_cvtepi32_epi8(__A); 
 }
 
@@ -6998,7 +6999,8 @@
 
 __m128i test_mm256_cvtepi32_epi8(__m256i __A) {
   // CHECK-LABEL: @test_mm256_cvtepi32_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.db.256
+  // CHECK: trunc <8 x i32> %{{.*}} to <8 x i8>
+  // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> 
   return _mm256_cvtepi32_epi8(__A); 
 }
 
@@ -7022,7 +7024,8 @@
 
 __m128i test_mm_cvtepi32_epi16(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi32_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.dw.128
+  // CHECK: trunc <4 x i32> %{{.*}} to <4 x i16>
+  // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> 
   return _mm_cvtepi32_epi16(__A); 
 }
 
@@ -7070,7 +7073,8 @@
 
 __m128i test_mm_cvtepi64_epi8(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi64_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.qb.128
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i8>
+  // CHECK: shufflevector <2 x i8> %{{.*}}, <2 x i8> %{{.*}}, <16 x i32> 
   return _mm_cvtepi64_epi8(__A); 
 }
 
@@ -7094,7 +7098,8 @@
 
 __m128i test_mm256_cvtepi64_epi8(__m256i __A) {
   // CHECK-LABEL: @test_mm256_cvtepi64_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.qb.256
+  // CHECK: trunc <4 x i64> %{{.*}} to <4 x i8>
+  // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> 
   return _mm256_cvtepi64_epi8(__A); 
 }
 
@@ -7118,7 +7123,8 @@
 
 __m128i test_mm_cvtepi64_epi32(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi64_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmov.qd.128
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  // CHECK: shufflevector <2 x i32> %{{.*}}, <2 x i32> %{{.*}}, <4 x i32> 
   return _mm_cvtepi64_epi32(__A); 
 }
 
@@ -7168,7 +7174,8 @@
 
 __m128i test_mm_cvtepi64_epi16(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi64_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.qw.128
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i16>
+  // CHECK: shufflevector <2 x i16> %{{.*}}, <2 x i16> %{{.*}}, <8 x i32> 
   return _mm_cvtepi64_epi16(__A); 
 }
 
@@ -7192,7 +7199,8 @@
 
 __m128i test_mm256_cvtepi64_epi16(__m256i __A) {
   // CHECK-LABEL: @test_mm256_cvtepi64_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.qw.256
+  // CHECK: trunc <4 x i64> %{{.*}} to <4 x i16>
+  // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> 
   return _mm256_cvtepi64_epi16(__A); 
 }
 
Index: clang/lib/Headers/avx512vlintrin.h
===
--- clang/lib/Headers/avx512vlintrin.h
+++ clang/lib/Headers/avx512vlintrin.h
@@ -7412,9 +7412,10 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi32_epi8 (__m128i __A)
 {
-  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
-  (__v16qi)_mm_undefined_si128(),
-  (__mmask8) -1);
+  typedef char __v4qi __attribute__((__vector_size__(4)));
+  return (__m128i)__builtin_shufflevector(
+  __builtin_convertvector((__v4si)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1,
+  2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -7442,9 +7443,10 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm256_cvtepi32_epi8 (__m256i __A)
 {
-  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
-  (__v16qi)_mm_undefined_si128(),
-  (__mmask8) -1);
+  return (__m128i)__builtin_shufflevector(
+  __builtin_convertvector((__v8si)__A, __v8qi),
+  (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+  12, 13, 14, 15);
 }
 
 static _

[PATCH] D48712: [X86] Lowering integer truncation intrinsics to native IR

2018-06-29 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky added inline comments.



Comment at: clang/lib/Headers/avx512vlbwintrin.h:1501
+  (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+  12, 13, 14, 15);
 }

RKSimon wrote:
> Are we happy with using illegal types like this? What about flipping the 
> shuffle and convert?
> 
> ```
>   return (__m128i)__builtin_convertvector(
> __builtin_shufflevector((__v8hi)__A,
> (__v8hi){0, 0, 0, 0, 0, 0, 0, 0},
> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
> 11, 12, 13, 14, 15), __v16qi);
> ```
This would bring its own issues, since in the cvtepi64_epi8 cases the inner 
shuffle would produce vectors of 16 64-bit values. There would be no extra 
typedef, but in the back-end these would be split in type legalization, making 
it harder to fold them into VPMOV instructions.


https://reviews.llvm.org/D48712



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D48712: [X86] Lowering integer truncation intrinsics to native IR

2018-07-10 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rC336643: [X86] Lowering integer truncation intrinsics to 
native IR (authored by mike.dvoretsky, committed by ).

Changed prior to commit:
  https://reviews.llvm.org/D48712?vs=153471&id=154765#toc

Repository:
  rC Clang

https://reviews.llvm.org/D48712

Files:
  lib/Headers/avx512vlbwintrin.h
  lib/Headers/avx512vlintrin.h
  test/CodeGen/avx512vl-builtins.c
  test/CodeGen/avx512vlbw-builtins.c

Index: test/CodeGen/avx512vl-builtins.c
===
--- test/CodeGen/avx512vl-builtins.c
+++ test/CodeGen/avx512vl-builtins.c
@@ -8503,7 +8503,8 @@
 
 __m128i test_mm_cvtepi32_epi8(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi32_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.db.128
+  // CHECK: trunc <4 x i32> %{{.*}} to <4 x i8>
+  // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> 
   return _mm_cvtepi32_epi8(__A); 
 }
 
@@ -8527,7 +8528,8 @@
 
 __m128i test_mm256_cvtepi32_epi8(__m256i __A) {
   // CHECK-LABEL: @test_mm256_cvtepi32_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.db.256
+  // CHECK: trunc <8 x i32> %{{.*}} to <8 x i8>
+  // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> 
   return _mm256_cvtepi32_epi8(__A); 
 }
 
@@ -8551,7 +8553,8 @@
 
 __m128i test_mm_cvtepi32_epi16(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi32_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.dw.128
+  // CHECK: trunc <4 x i32> %{{.*}} to <4 x i16>
+  // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> 
   return _mm_cvtepi32_epi16(__A); 
 }
 
@@ -8599,7 +8602,8 @@
 
 __m128i test_mm_cvtepi64_epi8(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi64_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.qb.128
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i8>
+  // CHECK: shufflevector <2 x i8> %{{.*}}, <2 x i8> %{{.*}}, <16 x i32> 
   return _mm_cvtepi64_epi8(__A); 
 }
 
@@ -8623,7 +8627,8 @@
 
 __m128i test_mm256_cvtepi64_epi8(__m256i __A) {
   // CHECK-LABEL: @test_mm256_cvtepi64_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.qb.256
+  // CHECK: trunc <4 x i64> %{{.*}} to <4 x i8>
+  // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> 
   return _mm256_cvtepi64_epi8(__A); 
 }
 
@@ -8647,7 +8652,8 @@
 
 __m128i test_mm_cvtepi64_epi32(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi64_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmov.qd.128
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32>
+  // CHECK: shufflevector <2 x i32> %{{.*}}, <2 x i32> %{{.*}}, <4 x i32> 
   return _mm_cvtepi64_epi32(__A); 
 }
 
@@ -8697,7 +8703,8 @@
 
 __m128i test_mm_cvtepi64_epi16(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi64_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.qw.128
+  // CHECK: trunc <2 x i64> %{{.*}} to <2 x i16>
+  // CHECK: shufflevector <2 x i16> %{{.*}}, <2 x i16> %{{.*}}, <8 x i32> 
   return _mm_cvtepi64_epi16(__A); 
 }
 
@@ -8721,7 +8728,8 @@
 
 __m128i test_mm256_cvtepi64_epi16(__m256i __A) {
   // CHECK-LABEL: @test_mm256_cvtepi64_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.qw.256
+  // CHECK: trunc <4 x i64> %{{.*}} to <4 x i16>
+  // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> 
   return _mm256_cvtepi64_epi16(__A); 
 }
 
Index: test/CodeGen/avx512vlbw-builtins.c
===
--- test/CodeGen/avx512vlbw-builtins.c
+++ test/CodeGen/avx512vlbw-builtins.c
@@ -1792,7 +1792,8 @@
 
 __m128i test_mm_cvtepi16_epi8(__m128i __A) {
   // CHECK-LABEL: @test_mm_cvtepi16_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.wb.128
+  // CHECK: trunc <8 x i16> %{{.*}} to <8 x i8>
+  // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> 
   return _mm_cvtepi16_epi8(__A); 
 }
 
Index: lib/Headers/avx512vlintrin.h
===
--- lib/Headers/avx512vlintrin.h
+++ lib/Headers/avx512vlintrin.h
@@ -31,6 +31,10 @@
 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(128)))
 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(256)))
 
+typedef short __v2hi __attribute__((__vector_size__(4)));
+typedef char __v4qi __attribute__((__vector_size__(4)));
+typedef char __v2qi __attribute__((__vector_size__(2)));
+
 /* Integer compare */
 
 #define _mm_cmpeq_epi32_mask(A, B) \
@@ -7341,9 +7345,9 @@
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_cvtepi32_epi8 (__m128i __A)
 {
-  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
-  (__v16qi)_mm_undefined_si128(),
-  (__mmask8) -1);
+  return (__m128i)__builtin_shufflevector(
+  __builtin_convertvector((__v4si)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1,
+  2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
@@ -7371,9 +73

[PATCH] D45202: [X86] Replacing X86-specific floor and ceil vector intrinsics with generic LLVM intrinsics

2018-04-03 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky created this revision.
mike.dvoretsky added reviewers: craig.topper, spatel, RKSimon.
Herald added a subscriber: cfe-commits.

Currently, X86 floor and ceil intrinsics for vectors are implemented as 
target-specific intrinsics that use the generic rounding instruction of the 
corresponding vector processing feature (ROUND* or VRNDSCALE*). This patch 
replaces those specific cases with calls to target-independent @llvm.floor.* 
and @llvm.ceil.* intrinsics. This doesn't affect the resulting machine code, as 
those intrinsics are lowered to the same instructions, but exposes these 
specific rounding cases to generic optimizations.


Repository:
  rC Clang

https://reviews.llvm.org/D45202

Files:
  include/clang/Basic/BuiltinsX86.def
  lib/CodeGen/CGBuiltin.cpp
  lib/Headers/avx512fintrin.h
  lib/Headers/avxintrin.h
  lib/Headers/smmintrin.h
  test/CodeGen/avx-builtins.c
  test/CodeGen/avx512f-builtins.c
  test/CodeGen/sse41-builtins.c

Index: test/CodeGen/sse41-builtins.c
===
--- test/CodeGen/sse41-builtins.c
+++ test/CodeGen/sse41-builtins.c
@@ -44,25 +44,29 @@
 
 __m128d test_mm_ceil_pd(__m128d x) {
   // CHECK-LABEL: test_mm_ceil_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 2)
+  // CHECK: @llvm.ceil.v2f64
+  // CHECK-NOT: select
   return _mm_ceil_pd(x);
 }
 
 __m128 test_mm_ceil_ps(__m128 x) {
   // CHECK-LABEL: test_mm_ceil_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2)
+  // CHECK: @llvm.ceil.v4f32
+  // CHECK-NOT: select
   return _mm_ceil_ps(x);
 }
 
 __m128d test_mm_ceil_sd(__m128d x, __m128d y) {
   // CHECK-LABEL: test_mm_ceil_sd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 2)
+  // CHECK: @llvm.ceil.v2f64
+  // CHECK: select
   return _mm_ceil_sd(x, y);
 }
 
 __m128 test_mm_ceil_ss(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_ceil_ss
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 2)
+  // CHECK: @llvm.ceil.v4f32
+  // CHECK: select
   return _mm_ceil_ss(x, y);
 }
 
@@ -196,25 +200,29 @@
 
 __m128d test_mm_floor_pd(__m128d x) {
   // CHECK-LABEL: test_mm_floor_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 1)
+  // CHECK: @llvm.floor.v2f64
+  // CHECK-NOT: select
   return _mm_floor_pd(x);
 }
 
 __m128 test_mm_floor_ps(__m128 x) {
   // CHECK-LABEL: test_mm_floor_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 1)
+  // CHECK: @llvm.floor.v4f32
+  // CHECK-NOT: select
   return _mm_floor_ps(x);
 }
 
 __m128d test_mm_floor_sd(__m128d x, __m128d y) {
   // CHECK-LABEL: test_mm_floor_sd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 1)
+  // CHECK: @llvm.floor.v2f64
+  // CHECK: select
   return _mm_floor_sd(x, y);
 }
 
 __m128 test_mm_floor_ss(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_floor_ss
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 1)
+  // CHECK: @llvm.floor.v4f32
+  // CHECK: select
   return _mm_floor_ss(x, y);
 }
 
Index: test/CodeGen/avx512f-builtins.c
===
--- test/CodeGen/avx512f-builtins.c
+++ test/CodeGen/avx512f-builtins.c
@@ -7485,31 +7485,67 @@
   return _mm512_min_round_ps(__A,__B,_MM_FROUND_CUR_DIRECTION);
 }
 
+__m512 test_mm512_floor_ps(__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_floor_ps
+  // CHECK: @llvm.floor.v16f32
+  // CHECK-NOT: select
+  return _mm512_floor_ps(__A);
+}
+
+__m512d test_mm512_floor_pd(__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_floor_pd
+  // CHECK: @llvm.floor.v8f64
+  // CHECK-NOT: select
+  return _mm512_floor_pd(__A);
+}
+
 __m512 test_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_floor_ps 
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  // CHECK-LABEL: @test_mm512_mask_floor_ps
+  // CHECK: @llvm.floor.v16f32
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_floor_ps (__W,__U,__A);
 }
 
 __m512d test_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_floor_pd 
-  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  // CHECK-LABEL: @test_mm512_mask_floor_pd
+  // CHECK: @llvm.floor.v8f64
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_floor_pd (__W,__U,__A);
 }
 
+__m512 test_mm512_ceil_ps(__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_ceil_ps
+  // CHECK: @llvm.ceil.v16f32
+  // CHECK-NOT: select
+  return _mm512_ceil_ps(__A);
+}
+
+__m512d test_mm512_ceil_pd(__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_ceil_pd
+  // CHECK: @llvm.ceil.v8f64
+  // CHECK-NOT: select
+  return _mm512_ceil_pd(__A);
+}
+
 __m512 test_

[PATCH] D45202: [X86] Replacing X86-specific floor and ceil vector intrinsics with generic LLVM intrinsics

2018-04-04 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky updated this revision to Diff 140972.
mike.dvoretsky edited the summary of this revision.
mike.dvoretsky added a comment.

On suggestion from @craig.topper moved all lowering to CGBuiltin.cpp with no 
new builtins added. Instead the existing builtins are lowered if their 
immediate values correspond to generic ceil and floor operations. 
https://reviews.llvm.org/D45203 is now required to enable transformations.


https://reviews.llvm.org/D45202

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/avx-builtins.c
  clang/test/CodeGen/avx512f-builtins.c
  clang/test/CodeGen/sse41-builtins.c

Index: clang/test/CodeGen/sse41-builtins.c
===
--- clang/test/CodeGen/sse41-builtins.c
+++ clang/test/CodeGen/sse41-builtins.c
@@ -44,25 +44,29 @@
 
 __m128d test_mm_ceil_pd(__m128d x) {
   // CHECK-LABEL: test_mm_ceil_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 2)
+  // CHECK: @llvm.ceil.v2f64
+  // CHECK-NOT: select
   return _mm_ceil_pd(x);
 }
 
 __m128 test_mm_ceil_ps(__m128 x) {
   // CHECK-LABEL: test_mm_ceil_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2)
+  // CHECK: @llvm.ceil.v4f32
+  // CHECK-NOT: select
   return _mm_ceil_ps(x);
 }
 
 __m128d test_mm_ceil_sd(__m128d x, __m128d y) {
   // CHECK-LABEL: test_mm_ceil_sd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 2)
+  // CHECK: @llvm.ceil.v2f64
+  // CHECK: select
   return _mm_ceil_sd(x, y);
 }
 
 __m128 test_mm_ceil_ss(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_ceil_ss
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 2)
+  // CHECK: @llvm.ceil.v4f32
+  // CHECK: select
   return _mm_ceil_ss(x, y);
 }
 
@@ -196,25 +200,29 @@
 
 __m128d test_mm_floor_pd(__m128d x) {
   // CHECK-LABEL: test_mm_floor_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 1)
+  // CHECK: @llvm.floor.v2f64
+  // CHECK-NOT: select
   return _mm_floor_pd(x);
 }
 
 __m128 test_mm_floor_ps(__m128 x) {
   // CHECK-LABEL: test_mm_floor_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 1)
+  // CHECK: @llvm.floor.v4f32
+  // CHECK-NOT: select
   return _mm_floor_ps(x);
 }
 
 __m128d test_mm_floor_sd(__m128d x, __m128d y) {
   // CHECK-LABEL: test_mm_floor_sd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 1)
+  // CHECK: @llvm.floor.v2f64
+  // CHECK: select
   return _mm_floor_sd(x, y);
 }
 
 __m128 test_mm_floor_ss(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_floor_ss
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 1)
+  // CHECK: @llvm.floor.v4f32
+  // CHECK: select
   return _mm_floor_ss(x, y);
 }
 
Index: clang/test/CodeGen/avx512f-builtins.c
===
--- clang/test/CodeGen/avx512f-builtins.c
+++ clang/test/CodeGen/avx512f-builtins.c
@@ -7485,46 +7485,98 @@
   return _mm512_min_round_ps(__A,__B,_MM_FROUND_CUR_DIRECTION);
 }
 
+__m512 test_mm512_floor_ps(__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_floor_ps
+  // CHECK: @llvm.floor.v16f32
+  // CHECK-NOT: select
+  return _mm512_floor_ps(__A);
+}
+
+__m512d test_mm512_floor_pd(__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_floor_pd
+  // CHECK: @llvm.floor.v8f64
+  // CHECK-NOT: select
+  return _mm512_floor_pd(__A);
+}
+
 __m512 test_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_floor_ps 
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  // CHECK-LABEL: @test_mm512_mask_floor_ps
+  // CHECK: @llvm.floor.v16f32
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_floor_ps (__W,__U,__A);
 }
 
 __m512d test_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_floor_pd 
-  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  // CHECK-LABEL: @test_mm512_mask_floor_pd
+  // CHECK: @llvm.floor.v8f64
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_floor_pd (__W,__U,__A);
 }
 
+__m512 test_mm512_ceil_ps(__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_ceil_ps
+  // CHECK: @llvm.ceil.v16f32
+  // CHECK-NOT: select
+  return _mm512_ceil_ps(__A);
+}
+
+__m512d test_mm512_ceil_pd(__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_ceil_pd
+  // CHECK: @llvm.ceil.v8f64
+  // CHECK-NOT: select
+  return _mm512_ceil_pd(__A);
+}
+
 __m512 test_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_ceil_ps 
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  // CHECK-LABEL: @test_mm512_mask_ceil_ps
+  // CHECK: @llvm.ceil.v16f32
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float

[PATCH] D45202: [X86] Replacing X86-specific floor and ceil vector intrinsics with generic LLVM intrinsics

2018-04-05 Thread Mikhail Dvoretckii via Phabricator via cfe-commits
mike.dvoretsky added inline comments.



Comment at: clang/lib/CodeGen/CGBuiltin.cpp:8307
+Dst = Ops[0];
+Mask = llvm::ConstantInt::get(CGF.Builder.getInt32Ty(), 1);
+  } else {

craig.topper wrote:
> I'm not sure we should even try to emit a mask for the legacy scalar 
> intrinsics. Does this get removed well by the middle or backend?
The masking is done to represent all operations handled here in a uniform way. 
D45203 removes it in the backend.


https://reviews.llvm.org/D45202



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits