[PATCH] D46881: [X86][CET] Changing -fcf-protection behavior to comply with gcc (clang part)
mike.dvoretsky created this revision. mike.dvoretsky added a reviewer: craig.topper. Herald added a subscriber: cfe-commits. This patch aims to match the changes introduced in gcc by https://gcc.gnu.org/ml/gcc-cvs/2018-04/msg00534.html. The -mibt feature flag is being removed, and the -fcf-protection option now also defines a __CET__ macro and causes errors when used on non-X86 targets, while X86 targets no longer check for -mibt and -mshstk to determine if -fcf-protection is supported. -mshstk is now used only to determine availability of shadow stack intrinsics. Comes with an LLVM patch. Repository: rC Clang https://reviews.llvm.org/D46881 Files: clang/docs/ClangCommandLineReference.rst clang/include/clang/Basic/DiagnosticCommonKinds.td clang/include/clang/Basic/TargetInfo.h clang/include/clang/Driver/Options.td clang/lib/Basic/TargetInfo.cpp clang/lib/Basic/Targets/X86.cpp clang/lib/Basic/Targets/X86.h clang/lib/Frontend/CompilerInvocation.cpp clang/test/CodeGen/attributes.c clang/test/CodeGen/builtins-x86.c clang/test/CodeGen/x86-cf-protection.c clang/test/Driver/x86-target-features.c clang/test/Preprocessor/x86_target_features.c clang/test/Sema/attr-nocf_check.c clang/test/Sema/attr-nocf_check.cpp Index: clang/test/Sema/attr-nocf_check.cpp === --- clang/test/Sema/attr-nocf_check.cpp +++ clang/test/Sema/attr-nocf_check.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple=i386-unknown-unknown -verify -fcf-protection=branch -target-feature +ibt -std=c++11 -fsyntax-only %s +// RUN: %clang_cc1 -triple=i386-unknown-unknown -verify -fcf-protection=branch -std=c++11 -fsyntax-only %s // Function pointer definition. [[gnu::nocf_check]] typedef void (*FuncPointerWithNoCfCheck)(void); // no-warning Index: clang/test/Sema/attr-nocf_check.c === --- clang/test/Sema/attr-nocf_check.c +++ clang/test/Sema/attr-nocf_check.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -verify -fcf-protection=branch -target-feature +ibt -fsyntax-only %s +// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -verify -fcf-protection=branch -fsyntax-only %s // Function pointer definition. typedef void (*FuncPointerWithNoCfCheck)(void) __attribute__((nocf_check)); // no-warning Index: clang/test/Preprocessor/x86_target_features.c === --- clang/test/Preprocessor/x86_target_features.c +++ clang/test/Preprocessor/x86_target_features.c @@ -380,10 +380,6 @@ // SHSTK: #define __SHSTK__ 1 -// RUN: %clang -target i386-unknown-unknown -mibt -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=IBT %s - -// IBT: #define __IBT__ 1 - // RUN: %clang -target i386-unknown-unknown -march=atom -mrdseed -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=RDSEED %s // RDSEED: #define __RDSEED__ 1 Index: clang/test/Driver/x86-target-features.c === --- clang/test/Driver/x86-target-features.c +++ clang/test/Driver/x86-target-features.c @@ -80,11 +80,6 @@ // CETSS: "-target-feature" "+shstk" // NO-CETSS: "-target-feature" "-shstk" -// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mibt %s -### -o %t.o 2>&1 | FileCheck -check-prefix=CETIBT %s -// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-ibt %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-CETIBT %s -// CETIBT: "-target-feature" "+ibt" -// NO-CETIBT: "-target-feature" "-ibt" - // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -msgx %s -### -o %t.o 2>&1 | FileCheck -check-prefix=SGX %s // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-sgx %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-SGX %s // SGX: "-target-feature" "+sgx" Index: clang/test/CodeGen/x86-cf-protection.c === --- clang/test/CodeGen/x86-cf-protection.c +++ clang/test/CodeGen/x86-cf-protection.c @@ -1,6 +1,8 @@ -// RUN: not %clang_cc1 -fsyntax-only -S -o %t -triple i386-unknown-unknown -fcf-protection=return %s 2>&1 | FileCheck %s --check-prefix=RETURN -// RUN: not %clang_cc1 -fsyntax-only -S -o %t -triple i386-unknown-unknown -fcf-protection=branch %s 2>&1 | FileCheck %s --check-prefix=BRANCH +// RUN: %clang -target i386-unknown-unknown -x c -E -dM -o - -fcf-protection=return %s | FileCheck %s --check-prefix=RETURN +// RUN: %clang -target i386-unknown-unknown -x c -E -dM -o - -fcf-protection=branch %s | FileCheck %s --check-prefix=BRANCH +// RUN: %clang -target i386-unknown-unknown -x c -E -dM -o - -fcf-protection=full %s | FileCheck %s --check-prefix=FULL -// RETURN: error: option 'cf-protection=return' cannot be specified without '-mshstk' -// BRANCH: error: option 'cf-protection=branch' cannot be specified without '-mibt' +// RETURN: #define __CET__ 2 +// BRANCH: #define __CET__
[PATCH] D46881: [X86][CET] Changing -fcf-protection behavior to comply with gcc (clang part)
mike.dvoretsky updated this revision to Diff 147292. mike.dvoretsky added a comment. Removed the unused HasIBT variable declaration from X86.h. https://reviews.llvm.org/D46881 Files: clang/docs/ClangCommandLineReference.rst clang/include/clang/Basic/DiagnosticCommonKinds.td clang/include/clang/Basic/TargetInfo.h clang/include/clang/Driver/Options.td clang/lib/Basic/TargetInfo.cpp clang/lib/Basic/Targets/X86.cpp clang/lib/Basic/Targets/X86.h clang/lib/Frontend/CompilerInvocation.cpp clang/test/CodeGen/attributes.c clang/test/CodeGen/builtins-x86.c clang/test/CodeGen/x86-cf-protection.c clang/test/Driver/x86-target-features.c clang/test/Preprocessor/x86_target_features.c clang/test/Sema/attr-nocf_check.c clang/test/Sema/attr-nocf_check.cpp Index: clang/test/Sema/attr-nocf_check.cpp === --- clang/test/Sema/attr-nocf_check.cpp +++ clang/test/Sema/attr-nocf_check.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple=i386-unknown-unknown -verify -fcf-protection=branch -target-feature +ibt -std=c++11 -fsyntax-only %s +// RUN: %clang_cc1 -triple=i386-unknown-unknown -verify -fcf-protection=branch -std=c++11 -fsyntax-only %s // Function pointer definition. [[gnu::nocf_check]] typedef void (*FuncPointerWithNoCfCheck)(void); // no-warning Index: clang/test/Sema/attr-nocf_check.c === --- clang/test/Sema/attr-nocf_check.c +++ clang/test/Sema/attr-nocf_check.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -verify -fcf-protection=branch -target-feature +ibt -fsyntax-only %s +// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -verify -fcf-protection=branch -fsyntax-only %s // Function pointer definition. typedef void (*FuncPointerWithNoCfCheck)(void) __attribute__((nocf_check)); // no-warning Index: clang/test/Preprocessor/x86_target_features.c === --- clang/test/Preprocessor/x86_target_features.c +++ clang/test/Preprocessor/x86_target_features.c @@ -380,10 +380,6 @@ // SHSTK: #define __SHSTK__ 1 -// RUN: %clang -target i386-unknown-unknown -mibt -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=IBT %s - -// IBT: #define __IBT__ 1 - // RUN: %clang -target i386-unknown-unknown -march=atom -mrdseed -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=RDSEED %s // RDSEED: #define __RDSEED__ 1 Index: clang/test/Driver/x86-target-features.c === --- clang/test/Driver/x86-target-features.c +++ clang/test/Driver/x86-target-features.c @@ -80,11 +80,6 @@ // CETSS: "-target-feature" "+shstk" // NO-CETSS: "-target-feature" "-shstk" -// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mibt %s -### -o %t.o 2>&1 | FileCheck -check-prefix=CETIBT %s -// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-ibt %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-CETIBT %s -// CETIBT: "-target-feature" "+ibt" -// NO-CETIBT: "-target-feature" "-ibt" - // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -msgx %s -### -o %t.o 2>&1 | FileCheck -check-prefix=SGX %s // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-sgx %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-SGX %s // SGX: "-target-feature" "+sgx" Index: clang/test/CodeGen/x86-cf-protection.c === --- clang/test/CodeGen/x86-cf-protection.c +++ clang/test/CodeGen/x86-cf-protection.c @@ -1,6 +1,8 @@ -// RUN: not %clang_cc1 -fsyntax-only -S -o %t -triple i386-unknown-unknown -fcf-protection=return %s 2>&1 | FileCheck %s --check-prefix=RETURN -// RUN: not %clang_cc1 -fsyntax-only -S -o %t -triple i386-unknown-unknown -fcf-protection=branch %s 2>&1 | FileCheck %s --check-prefix=BRANCH +// RUN: %clang -target i386-unknown-unknown -x c -E -dM -o - -fcf-protection=return %s | FileCheck %s --check-prefix=RETURN +// RUN: %clang -target i386-unknown-unknown -x c -E -dM -o - -fcf-protection=branch %s | FileCheck %s --check-prefix=BRANCH +// RUN: %clang -target i386-unknown-unknown -x c -E -dM -o - -fcf-protection=full %s | FileCheck %s --check-prefix=FULL -// RETURN: error: option 'cf-protection=return' cannot be specified without '-mshstk' -// BRANCH: error: option 'cf-protection=branch' cannot be specified without '-mibt' +// RETURN: #define __CET__ 2 +// BRANCH: #define __CET__ 1 +// FULL: #define __CET__ 3 void foo() {} Index: clang/test/CodeGen/builtins-x86.c === --- clang/test/CodeGen/builtins-x86.c +++ clang/test/CodeGen/builtins-x86.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -DUSE_64 -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -target-feature +mwaitx -target-feature +clzero -target-fea
[PATCH] D45202: [X86] Replacing X86-specific floor and ceil vector intrinsics with generic LLVM intrinsics
mike.dvoretsky updated this revision to Diff 149484. mike.dvoretsky added a comment. Changed the scalar intrinsic lowering to work via extract-insert. https://reviews.llvm.org/D45203 contains tests for folding the resulting IR patterns. https://reviews.llvm.org/D45202 Files: clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/avx-builtins.c clang/test/CodeGen/avx512f-builtins.c clang/test/CodeGen/sse41-builtins.c Index: clang/test/CodeGen/sse41-builtins.c === --- clang/test/CodeGen/sse41-builtins.c +++ clang/test/CodeGen/sse41-builtins.c @@ -44,25 +44,31 @@ __m128d test_mm_ceil_pd(__m128d x) { // CHECK-LABEL: test_mm_ceil_pd - // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 2) + // CHECK: @llvm.ceil.v2f64 + // CHECK-NOT: select return _mm_ceil_pd(x); } __m128 test_mm_ceil_ps(__m128 x) { // CHECK-LABEL: test_mm_ceil_ps - // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2) + // CHECK: @llvm.ceil.v4f32 + // CHECK-NOT: select return _mm_ceil_ps(x); } __m128d test_mm_ceil_sd(__m128d x, __m128d y) { // CHECK-LABEL: test_mm_ceil_sd - // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 2) + // CHECK: extractelement + // CHECK: @llvm.ceil.f64 + // CHECK: insertelement return _mm_ceil_sd(x, y); } __m128 test_mm_ceil_ss(__m128 x, __m128 y) { // CHECK-LABEL: test_mm_ceil_ss - // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 2) + // CHECK: extractelement + // CHECK: @llvm.ceil.f32 + // CHECK: insertelement return _mm_ceil_ss(x, y); } @@ -196,25 +202,31 @@ __m128d test_mm_floor_pd(__m128d x) { // CHECK-LABEL: test_mm_floor_pd - // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 1) + // CHECK: @llvm.floor.v2f64 + // CHECK-NOT: select return _mm_floor_pd(x); } __m128 test_mm_floor_ps(__m128 x) { // CHECK-LABEL: test_mm_floor_ps - // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 1) + // CHECK: @llvm.floor.v4f32 + // CHECK-NOT: select return _mm_floor_ps(x); } __m128d test_mm_floor_sd(__m128d x, __m128d y) { // CHECK-LABEL: test_mm_floor_sd - // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 1) + // CHECK: extractelement + // CHECK: @llvm.floor.f64 + // CHECK: insertelement return _mm_floor_sd(x, y); } __m128 test_mm_floor_ss(__m128 x, __m128 y) { // CHECK-LABEL: test_mm_floor_ss - // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 1) + // CHECK: extractelement + // CHECK: @llvm.floor.f32 + // CHECK: insertelement return _mm_floor_ss(x, y); } Index: clang/test/CodeGen/avx512f-builtins.c === --- clang/test/CodeGen/avx512f-builtins.c +++ clang/test/CodeGen/avx512f-builtins.c @@ -7565,46 +7565,98 @@ return _mm512_min_round_ps(__A,__B,_MM_FROUND_CUR_DIRECTION); } +__m512 test_mm512_floor_ps(__m512 __A) +{ + // CHECK-LABEL: @test_mm512_floor_ps + // CHECK: @llvm.floor.v16f32 + // CHECK-NOT: select + return _mm512_floor_ps(__A); +} + +__m512d test_mm512_floor_pd(__m512d __A) +{ + // CHECK-LABEL: @test_mm512_floor_pd + // CHECK: @llvm.floor.v8f64 + // CHECK-NOT: select + return _mm512_floor_pd(__A); +} + __m512 test_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A) { - // CHECK-LABEL: @test_mm512_mask_floor_ps - // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512 + // CHECK-LABEL: @test_mm512_mask_floor_ps + // CHECK: @llvm.floor.v16f32 + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_floor_ps (__W,__U,__A); } __m512d test_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A) { - // CHECK-LABEL: @test_mm512_mask_floor_pd - // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512 + // CHECK-LABEL: @test_mm512_mask_floor_pd + // CHECK: @llvm.floor.v8f64 + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_floor_pd (__W,__U,__A); } +__m512 test_mm512_ceil_ps(__m512 __A) +{ + // CHECK-LABEL: @test_mm512_ceil_ps + // CHECK: @llvm.ceil.v16f32 + // CHECK-NOT: select + return _mm512_ceil_ps(__A); +} + +__m512d test_mm512_ceil_pd(__m512d __A) +{ + // CHECK-LABEL: @test_mm512_ceil_pd + // CHECK: @llvm.ceil.v8f64 + // CHECK-NOT: select + return _mm512_ceil_pd(__A); +} + __m512 test_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A) { - // CHECK-LABEL: @test_mm512_mask_ceil_ps - // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512 + // CHECK-LABEL: @test_mm512_mask_ceil_ps + // CHECK: @llvm.ceil.v16f32 + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_ceil_ps (__W,__U,__A); }
[PATCH] D45720: [X86] Lowering PACK*S (pack with saturation) intrinsics to native IR (clang side)
mike.dvoretsky abandoned this revision. mike.dvoretsky added a comment. Closing this due to failure of https://reviews.llvm.org/D45721. https://reviews.llvm.org/D45720 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D45722: [X86] Lowering SAD (sum of absolute differences) intrinsics to native IR (clang side)
mike.dvoretsky abandoned this revision. mike.dvoretsky added a comment. Closing this due to failure of https://reviews.llvm.org/D45723. https://reviews.llvm.org/D45722 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D45202: [X86] Replacing X86-specific floor and ceil vector intrinsics with generic LLVM intrinsics
mike.dvoretsky abandoned this revision. mike.dvoretsky added a comment. Abandoning this due to https://reviews.llvm.org/D48067 being accepted instead. https://reviews.llvm.org/D45202 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D45720: [X86] Lowering PACK*S (pack with saturation) intrinsics to native IR (clang side)
mike.dvoretsky created this revision. mike.dvoretsky added reviewers: craig.topper, spatel. Herald added a subscriber: cfe-commits. This patch lowers the X86 vector packing with saturation intrinsics to native LLVM IR. Comes with an LLVM patch. Repository: rC Clang https://reviews.llvm.org/D45720 Files: lib/CodeGen/CGBuiltin.cpp test/CodeGen/avx2-builtins.c test/CodeGen/avx512bw-builtins.c test/CodeGen/avx512vlbw-builtins.c test/CodeGen/sse2-builtins.c test/CodeGen/sse41-builtins.c Index: test/CodeGen/sse41-builtins.c === --- test/CodeGen/sse41-builtins.c +++ test/CodeGen/sse41-builtins.c @@ -328,7 +328,12 @@ __m128i test_mm_packus_epi32(__m128i x, __m128i y) { // CHECK-LABEL: test_mm_packus_epi32 - // CHECK: call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, zeroinitializer + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> zeroinitializer + // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16> return _mm_packus_epi32(x, y); } Index: test/CodeGen/sse2-builtins.c === --- test/CodeGen/sse2-builtins.c +++ test/CodeGen/sse2-builtins.c @@ -869,19 +869,34 @@ __m128i test_mm_packs_epi16(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_packs_epi16 - // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: %{{.*}} = shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> + // CHECK: %{{.*}} = icmp slt <16 x i16> %{{.*}}, + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> + // CHECK: %{{.*}} = icmp sgt <16 x i16> %{{.*}}, + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> + // CHECK: %{{.*}} = trunc <16 x i16> %{{.*}} to <16 x i8> return _mm_packs_epi16(A, B); } __m128i test_mm_packs_epi32(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_packs_epi32 - // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16> return _mm_packs_epi32(A, B); } __m128i test_mm_packus_epi16(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_packus_epi16 - // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: %{{.*}} = shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> + // CHECK: %{{.*}} = icmp slt <16 x i16> %{{.*}}, + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> + // CHECK: %{{.*}} = icmp sgt <16 x i16> %{{.*}}, zeroinitializer + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> zeroinitializer + // CHECK: %{{.*}} = trunc <16 x i16> %{{.*}} to <16 x i8> return _mm_packus_epi16(A, B); } Index: test/CodeGen/avx512vlbw-builtins.c === --- test/CodeGen/avx512vlbw-builtins.c +++ test/CodeGen/avx512vlbw-builtins.c @@ -970,105 +970,185 @@ __m128i test_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_packs_epi32 - // CHECK: @llvm.x86.sse2.packssdw + // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_packs_epi32(__M,__A,__B); } __m128i test_mm_mask_packs_epi32(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_packs_epi32 - // CHECK: @llvm.x86.sse2.packssdw + // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16> //
[PATCH] D45722: [X86] Lowering SAD (sum of absolute differences) intrinsics to native IR (clang side)
mike.dvoretsky created this revision. mike.dvoretsky added reviewers: craig.topper, spatel. Herald added a subscriber: cfe-commits. This patch lowers the SAD intrinsics to native LLVM IR. Comes with an LLVM patch. Repository: rC Clang https://reviews.llvm.org/D45722 Files: lib/CodeGen/CGBuiltin.cpp test/CodeGen/avx2-builtins.c test/CodeGen/avx512bw-builtins.c test/CodeGen/sse2-builtins.c Index: test/CodeGen/sse2-builtins.c === --- test/CodeGen/sse2-builtins.c +++ test/CodeGen/sse2-builtins.c @@ -893,7 +893,33 @@ __m128i test_mm_sad_epu8(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_sad_epu8 - // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: %{{.*}} = icmp ugt <16 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = sub <16 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = sub <16 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} return _mm_sad_epu8(A, B); } Index: test/CodeGen/avx512bw-builtins.c === --- test/CodeGen/avx512bw-builtins.c +++ test/CodeGen/avx512bw-builtins.c @@ -1945,7 +1945,33 @@ __m512i test_mm512_sad_epu8(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_sad_epu8 - // CHECK: @llvm.x86.avx512.psad.bw.512 + // CHECK: %{{.*}} = icmp ugt <64 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = sub <64 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = sub <64 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} return _mm512_sad_epu8(__A, __B); } Index: test/CodeGen/avx2-builtins.c === --- test/CodeGen/avx2-builtins.c +++ test/CodeGen/avx2-builtins.c @@ -943,7 +943,33 @@ __m256i test_mm256_sad_epu8(__m256i x, __m256i y) { // CHECK-LABEL:
[PATCH] D45720: [X86] Lowering PACK*S (pack with saturation) intrinsics to native IR (clang side)
mike.dvoretsky added inline comments. Comment at: lib/CodeGen/CGBuiltin.cpp:8443 + Value *MaxVec = llvm::ConstantInt::get(RTy, MaxVal); + Res = EmitX86MinMax(CGF, ICmpInst::ICMP_SLT, {Res, MaxVec}); + Res = EmitX86MinMax(CGF, ICmpInst::ICMP_SGT, {Res, MinVec}); craig.topper wrote: > Why arent' these unsigned compares for Unsigned? The compares are signed on purpose. PACKUS assumes that the input elements are signed, then uses unsigned saturation. So, for instance, an 0x value must be evaluated as -1 and saturated to 0, rather than to 0xff as it would be with unsigned comparisons. Repository: rC Clang https://reviews.llvm.org/D45720 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D45720: [X86] Lowering PACK*S (pack with saturation) intrinsics to native IR (clang side)
mike.dvoretsky updated this revision to Diff 142899. mike.dvoretsky added a comment. Updated per comments. https://reviews.llvm.org/D45720 Files: clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/avx2-builtins.c clang/test/CodeGen/avx512bw-builtins.c clang/test/CodeGen/avx512vlbw-builtins.c clang/test/CodeGen/sse2-builtins.c clang/test/CodeGen/sse41-builtins.c Index: clang/test/CodeGen/sse41-builtins.c === --- clang/test/CodeGen/sse41-builtins.c +++ clang/test/CodeGen/sse41-builtins.c @@ -328,7 +328,12 @@ __m128i test_mm_packus_epi32(__m128i x, __m128i y) { // CHECK-LABEL: test_mm_packus_epi32 - // CHECK: call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, zeroinitializer + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> zeroinitializer + // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16> return _mm_packus_epi32(x, y); } Index: clang/test/CodeGen/sse2-builtins.c === --- clang/test/CodeGen/sse2-builtins.c +++ clang/test/CodeGen/sse2-builtins.c @@ -869,19 +869,34 @@ __m128i test_mm_packs_epi16(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_packs_epi16 - // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: %{{.*}} = shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> + // CHECK: %{{.*}} = icmp slt <16 x i16> %{{.*}}, + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> + // CHECK: %{{.*}} = icmp sgt <16 x i16> %{{.*}}, + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> + // CHECK: %{{.*}} = trunc <16 x i16> %{{.*}} to <16 x i8> return _mm_packs_epi16(A, B); } __m128i test_mm_packs_epi32(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_packs_epi32 - // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16> return _mm_packs_epi32(A, B); } __m128i test_mm_packus_epi16(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_packus_epi16 - // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: %{{.*}} = shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> + // CHECK: %{{.*}} = icmp slt <16 x i16> %{{.*}}, + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> + // CHECK: %{{.*}} = icmp sgt <16 x i16> %{{.*}}, zeroinitializer + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> zeroinitializer + // CHECK: %{{.*}} = trunc <16 x i16> %{{.*}} to <16 x i8> return _mm_packus_epi16(A, B); } Index: clang/test/CodeGen/avx512vlbw-builtins.c === --- clang/test/CodeGen/avx512vlbw-builtins.c +++ clang/test/CodeGen/avx512vlbw-builtins.c @@ -970,105 +970,185 @@ __m128i test_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_packs_epi32 - // CHECK: @llvm.x86.sse2.packssdw + // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_packs_epi32(__M,__A,__B); } __m128i test_mm_mask_packs_epi32(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_packs_epi32 - // CHECK: @llvm.x86.sse2.packssdw + // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} r
[PATCH] D45722: [X86] Lowering SAD (sum of absolute differences) intrinsics to native IR (clang side)
mike.dvoretsky updated this revision to Diff 142914. mike.dvoretsky added a comment. Updated per comments. https://reviews.llvm.org/D45722 Files: clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/avx2-builtins.c clang/test/CodeGen/avx512bw-builtins.c clang/test/CodeGen/sse2-builtins.c Index: clang/test/CodeGen/sse2-builtins.c === --- clang/test/CodeGen/sse2-builtins.c +++ clang/test/CodeGen/sse2-builtins.c @@ -893,7 +893,33 @@ __m128i test_mm_sad_epu8(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_sad_epu8 - // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: %{{.*}} = icmp ugt <16 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = sub <16 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = sub <16 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} return _mm_sad_epu8(A, B); } Index: clang/test/CodeGen/avx512bw-builtins.c === --- clang/test/CodeGen/avx512bw-builtins.c +++ clang/test/CodeGen/avx512bw-builtins.c @@ -1945,7 +1945,33 @@ __m512i test_mm512_sad_epu8(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_sad_epu8 - // CHECK: @llvm.x86.avx512.psad.bw.512 + // CHECK: %{{.*}} = icmp ugt <64 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = sub <64 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = sub <64 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> undef, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} return _mm512_sad_epu8(__A, __B); } Index: clang/test/CodeGen/avx2-builtins.c === --- clang/test/CodeGen/avx2-builtins.c +++ clang/test/CodeGen/avx2-builtins.c @@ -943,7 +943,33 @@ __m256i test_mm256_sad_epu8(__m256i x, __m256i y) { // CHECK-LABEL: test_mm256_sad_epu8 - // CHECK: call <4 x i64> @llvm.
[PATCH] D45722: [X86] Lowering SAD (sum of absolute differences) intrinsics to native IR (clang side)
mike.dvoretsky updated this revision to Diff 143715. mike.dvoretsky added a comment. Updated per comments. https://reviews.llvm.org/D45722 Files: clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/avx2-builtins.c clang/test/CodeGen/avx512bw-builtins.c clang/test/CodeGen/sse2-builtins.c Index: clang/test/CodeGen/sse2-builtins.c === --- clang/test/CodeGen/sse2-builtins.c +++ clang/test/CodeGen/sse2-builtins.c @@ -893,7 +893,33 @@ __m128i test_mm_sad_epu8(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_sad_epu8 - // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: %{{.*}} = icmp ugt <16 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = sub <16 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = sub <16 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} return _mm_sad_epu8(A, B); } Index: clang/test/CodeGen/avx512bw-builtins.c === --- clang/test/CodeGen/avx512bw-builtins.c +++ clang/test/CodeGen/avx512bw-builtins.c @@ -1945,7 +1945,33 @@ __m512i test_mm512_sad_epu8(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_sad_epu8 - // CHECK: @llvm.x86.avx512.psad.bw.512 + // CHECK: %{{.*}} = icmp ugt <64 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = sub <64 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = sub <64 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} return _mm512_sad_epu8(__A, __B); } Index: clang/test/CodeGen/avx2-builtins.c === --- clang/test/CodeGen/avx2-builtins.c +++ clang/test/CodeGen/avx2-builtins.c @@ -943,7 +943,33 @@ __m256i test_mm256_sad_epu8(__m256i x, __m256i y) { // CHECK-LABEL: test_mm256_sad_epu8 -
[PATCH] D45722: [X86] Lowering SAD (sum of absolute differences) intrinsics to native IR (clang side)
mike.dvoretsky updated this revision to Diff 143893. mike.dvoretsky marked 4 inline comments as done. mike.dvoretsky added a subscriber: ashlykov. mike.dvoretsky added a comment. Updated per comments. https://reviews.llvm.org/D45722 Files: clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/avx2-builtins.c clang/test/CodeGen/avx512bw-builtins.c clang/test/CodeGen/sse2-builtins.c Index: clang/test/CodeGen/sse2-builtins.c === --- clang/test/CodeGen/sse2-builtins.c +++ clang/test/CodeGen/sse2-builtins.c @@ -893,7 +893,33 @@ __m128i test_mm_sad_epu8(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_sad_epu8 - // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: %{{.*}} = icmp ugt <16 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = sub <16 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = sub <16 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <2 x i32> + // CHECK: %{{.*}} = zext <2 x i8> %{{.*}} to <2 x i64> + // CHECK: %{{.*}} = add <2 x i64> %{{.*}}, %{{.*}} return _mm_sad_epu8(A, B); } Index: clang/test/CodeGen/avx512bw-builtins.c === --- clang/test/CodeGen/avx512bw-builtins.c +++ clang/test/CodeGen/avx512bw-builtins.c @@ -1945,7 +1945,33 @@ __m512i test_mm512_sad_epu8(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_sad_epu8 - // CHECK: @llvm.x86.avx512.psad.bw.512 + // CHECK: %{{.*}} = icmp ugt <64 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = sub <64 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = sub <64 x i8> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = zext <8 x i8> %{{.*}} to <8 x i64> + // CHECK: %{{.*}} = add <8 x i64> %{{.*}}, %{{.*}} return _mm512_sad_epu8(__A, __B); } Index: clang/test/CodeGen/avx2-builtins.c === --- clang/test/CodeGen/avx2-builtins.c +++ clang/test/CodeGen/avx2-builtins.c @@ -943,7 +943,33 @@
[PATCH] D45720: [X86] Lowering PACK*S (pack with saturation) intrinsics to native IR (clang side)
mike.dvoretsky updated this revision to Diff 144126. mike.dvoretsky added a comment. Changed the shuffle mask emission code to match https://reviews.llvm.org/D45721. https://reviews.llvm.org/D45720 Files: clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/avx2-builtins.c clang/test/CodeGen/avx512bw-builtins.c clang/test/CodeGen/avx512vlbw-builtins.c clang/test/CodeGen/sse2-builtins.c clang/test/CodeGen/sse41-builtins.c Index: clang/test/CodeGen/sse41-builtins.c === --- clang/test/CodeGen/sse41-builtins.c +++ clang/test/CodeGen/sse41-builtins.c @@ -328,7 +328,12 @@ __m128i test_mm_packus_epi32(__m128i x, __m128i y) { // CHECK-LABEL: test_mm_packus_epi32 - // CHECK: call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, zeroinitializer + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> zeroinitializer + // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16> return _mm_packus_epi32(x, y); } Index: clang/test/CodeGen/sse2-builtins.c === --- clang/test/CodeGen/sse2-builtins.c +++ clang/test/CodeGen/sse2-builtins.c @@ -869,19 +869,34 @@ __m128i test_mm_packs_epi16(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_packs_epi16 - // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: %{{.*}} = shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> + // CHECK: %{{.*}} = icmp slt <16 x i16> %{{.*}}, + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> + // CHECK: %{{.*}} = icmp sgt <16 x i16> %{{.*}}, + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> + // CHECK: %{{.*}} = trunc <16 x i16> %{{.*}} to <16 x i8> return _mm_packs_epi16(A, B); } __m128i test_mm_packs_epi32(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_packs_epi32 - // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16> return _mm_packs_epi32(A, B); } __m128i test_mm_packus_epi16(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_packus_epi16 - // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: %{{.*}} = shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> + // CHECK: %{{.*}} = icmp slt <16 x i16> %{{.*}}, + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> + // CHECK: %{{.*}} = icmp sgt <16 x i16> %{{.*}}, zeroinitializer + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> zeroinitializer + // CHECK: %{{.*}} = trunc <16 x i16> %{{.*}} to <16 x i8> return _mm_packus_epi16(A, B); } Index: clang/test/CodeGen/avx512vlbw-builtins.c === --- clang/test/CodeGen/avx512vlbw-builtins.c +++ clang/test/CodeGen/avx512vlbw-builtins.c @@ -970,105 +970,185 @@ __m128i test_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_packs_epi32 - // CHECK: @llvm.x86.sse2.packssdw + // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_packs_epi32(__M,__A,__B); } __m128i test_mm_mask_packs_epi32(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_packs_epi32 - // CHECK: @llvm.x86.sse2.packssdw + // CHECK: %{{.*}} = shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp slt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = icmp sgt <8 x i32> %{{.*}}, + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> + // CHECK: %{{.*}} = trunc <8 x i32> %{{.*}} to <8 x i16> // CHECK: select
[PATCH] D48712: [X86] Lowering integer truncation intrinsics to native IR
mike.dvoretsky created this revision. mike.dvoretsky added a reviewer: craig.topper. Herald added a subscriber: cfe-commits. This patch lowers the _mm[256|512]_cvtepi{64|32|16}_epi{32|16|8} intrinsics to native IR in cases where the result's length is less than 128 bits. The resulting IR is folded into VPMOV instructions in https://reviews.llvm.org/D46957, with the exception of _mm_cvtepi64_epi8, where a PSHUFB instruction is currently produced instead. Repository: rC Clang https://reviews.llvm.org/D48712 Files: clang/lib/Headers/avx512vlbwintrin.h clang/lib/Headers/avx512vlintrin.h clang/test/CodeGen/avx512vl-builtins.c clang/test/CodeGen/avx512vlbw-builtins.c Index: clang/test/CodeGen/avx512vlbw-builtins.c === --- clang/test/CodeGen/avx512vlbw-builtins.c +++ clang/test/CodeGen/avx512vlbw-builtins.c @@ -1792,7 +1792,8 @@ __m128i test_mm_cvtepi16_epi8(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi16_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.wb.128 + // CHECK: trunc <8 x i16> %{{.*}} to <8 x i8> + // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> return _mm_cvtepi16_epi8(__A); } Index: clang/test/CodeGen/avx512vl-builtins.c === --- clang/test/CodeGen/avx512vl-builtins.c +++ clang/test/CodeGen/avx512vl-builtins.c @@ -6974,7 +6974,8 @@ __m128i test_mm_cvtepi32_epi8(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi32_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.db.128 + // CHECK: trunc <4 x i32> %{{.*}} to <4 x i8> + // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> return _mm_cvtepi32_epi8(__A); } @@ -6998,7 +6999,8 @@ __m128i test_mm256_cvtepi32_epi8(__m256i __A) { // CHECK-LABEL: @test_mm256_cvtepi32_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.db.256 + // CHECK: trunc <8 x i32> %{{.*}} to <8 x i8> + // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> return _mm256_cvtepi32_epi8(__A); } @@ -7022,7 +7024,8 @@ __m128i test_mm_cvtepi32_epi16(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi32_epi16 - // CHECK: @llvm.x86.avx512.mask.pmov.dw.128 + // CHECK: trunc <4 x i32> %{{.*}} to <4 x i16> + // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> return _mm_cvtepi32_epi16(__A); } @@ -7070,7 +7073,8 @@ __m128i test_mm_cvtepi64_epi8(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi64_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.qb.128 + // CHECK: trunc <2 x i64> %{{.*}} to <2 x i8> + // CHECK: shufflevector <2 x i8> %{{.*}}, <2 x i8> %{{.*}}, <16 x i32> return _mm_cvtepi64_epi8(__A); } @@ -7094,7 +7098,8 @@ __m128i test_mm256_cvtepi64_epi8(__m256i __A) { // CHECK-LABEL: @test_mm256_cvtepi64_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.qb.256 + // CHECK: trunc <4 x i64> %{{.*}} to <4 x i8> + // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> return _mm256_cvtepi64_epi8(__A); } @@ -7118,7 +7123,8 @@ __m128i test_mm_cvtepi64_epi32(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi64_epi32 - // CHECK: @llvm.x86.avx512.mask.pmov.qd.128 + // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32> + // CHECK: shufflevector <2 x i32> %{{.*}}, <2 x i32> %{{.*}}, <4 x i32> return _mm_cvtepi64_epi32(__A); } @@ -7168,7 +7174,8 @@ __m128i test_mm_cvtepi64_epi16(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi64_epi16 - // CHECK: @llvm.x86.avx512.mask.pmov.qw.128 + // CHECK: trunc <2 x i64> %{{.*}} to <2 x i16> + // CHECK: shufflevector <2 x i16> %{{.*}}, <2 x i16> %{{.*}}, <8 x i32> return _mm_cvtepi64_epi16(__A); } @@ -7192,7 +7199,8 @@ __m128i test_mm256_cvtepi64_epi16(__m256i __A) { // CHECK-LABEL: @test_mm256_cvtepi64_epi16 - // CHECK: @llvm.x86.avx512.mask.pmov.qw.256 + // CHECK: trunc <4 x i64> %{{.*}} to <4 x i16> + // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> return _mm256_cvtepi64_epi16(__A); } Index: clang/lib/Headers/avx512vlintrin.h === --- clang/lib/Headers/avx512vlintrin.h +++ clang/lib/Headers/avx512vlintrin.h @@ -30,6 +30,7 @@ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"))) +typedef short __v2hi __attribute__((__vector_size__(4))); typedef char __v4qi __attribute__((__vector_size__(4))); typedef char __v2qi __attribute__((__vector_size__(2))); @@ -7415,10 +7416,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi8 (__m128i __A) { - return __builtin_shufflevector(__builtin_convertvector((__v4si)__A, __v4qi), - (__v4qi) {0, 0, 0, 0}, - 0, 1, 2, 3, 4, 5, 6, 7, - 7, 7, 7, 7, 7, 7, 7, 7); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v4si)__A, __v4qi), (__v4q
[PATCH] D48712: [X86] Lowering integer truncation intrinsics to native IR
mike.dvoretsky updated this revision to Diff 153277. mike.dvoretsky added a comment. Uploaded the correct diff. https://reviews.llvm.org/D48712 Files: clang/lib/Headers/avx512vlbwintrin.h clang/lib/Headers/avx512vlintrin.h clang/test/CodeGen/avx512vl-builtins.c clang/test/CodeGen/avx512vlbw-builtins.c Index: clang/test/CodeGen/avx512vlbw-builtins.c === --- clang/test/CodeGen/avx512vlbw-builtins.c +++ clang/test/CodeGen/avx512vlbw-builtins.c @@ -1792,7 +1792,8 @@ __m128i test_mm_cvtepi16_epi8(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi16_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.wb.128 + // CHECK: trunc <8 x i16> %{{.*}} to <8 x i8> + // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> return _mm_cvtepi16_epi8(__A); } Index: clang/test/CodeGen/avx512vl-builtins.c === --- clang/test/CodeGen/avx512vl-builtins.c +++ clang/test/CodeGen/avx512vl-builtins.c @@ -6974,7 +6974,8 @@ __m128i test_mm_cvtepi32_epi8(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi32_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.db.128 + // CHECK: trunc <4 x i32> %{{.*}} to <4 x i8> + // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> return _mm_cvtepi32_epi8(__A); } @@ -6998,7 +6999,8 @@ __m128i test_mm256_cvtepi32_epi8(__m256i __A) { // CHECK-LABEL: @test_mm256_cvtepi32_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.db.256 + // CHECK: trunc <8 x i32> %{{.*}} to <8 x i8> + // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> return _mm256_cvtepi32_epi8(__A); } @@ -7022,7 +7024,8 @@ __m128i test_mm_cvtepi32_epi16(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi32_epi16 - // CHECK: @llvm.x86.avx512.mask.pmov.dw.128 + // CHECK: trunc <4 x i32> %{{.*}} to <4 x i16> + // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> return _mm_cvtepi32_epi16(__A); } @@ -7070,7 +7073,8 @@ __m128i test_mm_cvtepi64_epi8(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi64_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.qb.128 + // CHECK: trunc <2 x i64> %{{.*}} to <2 x i8> + // CHECK: shufflevector <2 x i8> %{{.*}}, <2 x i8> %{{.*}}, <16 x i32> return _mm_cvtepi64_epi8(__A); } @@ -7094,7 +7098,8 @@ __m128i test_mm256_cvtepi64_epi8(__m256i __A) { // CHECK-LABEL: @test_mm256_cvtepi64_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.qb.256 + // CHECK: trunc <4 x i64> %{{.*}} to <4 x i8> + // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> return _mm256_cvtepi64_epi8(__A); } @@ -7118,7 +7123,8 @@ __m128i test_mm_cvtepi64_epi32(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi64_epi32 - // CHECK: @llvm.x86.avx512.mask.pmov.qd.128 + // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32> + // CHECK: shufflevector <2 x i32> %{{.*}}, <2 x i32> %{{.*}}, <4 x i32> return _mm_cvtepi64_epi32(__A); } @@ -7168,7 +7174,8 @@ __m128i test_mm_cvtepi64_epi16(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi64_epi16 - // CHECK: @llvm.x86.avx512.mask.pmov.qw.128 + // CHECK: trunc <2 x i64> %{{.*}} to <2 x i16> + // CHECK: shufflevector <2 x i16> %{{.*}}, <2 x i16> %{{.*}}, <8 x i32> return _mm_cvtepi64_epi16(__A); } @@ -7192,7 +7199,8 @@ __m128i test_mm256_cvtepi64_epi16(__m256i __A) { // CHECK-LABEL: @test_mm256_cvtepi64_epi16 - // CHECK: @llvm.x86.avx512.mask.pmov.qw.256 + // CHECK: trunc <4 x i64> %{{.*}} to <4 x i16> + // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> return _mm256_cvtepi64_epi16(__A); } Index: clang/lib/Headers/avx512vlintrin.h === --- clang/lib/Headers/avx512vlintrin.h +++ clang/lib/Headers/avx512vlintrin.h @@ -30,6 +30,10 @@ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"))) +typedef short __v2hi __attribute__((__vector_size__(4))); +typedef char __v4qi __attribute__((__vector_size__(4))); +typedef char __v2qi __attribute__((__vector_size__(2))); + /* Integer compare */ #define _mm_cmpeq_epi32_mask(A, B) \ @@ -7412,9 +7416,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi8 (__m128i __A) { - return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, - (__v16qi)_mm_undefined_si128(), - (__mmask8) -1); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v4si)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1, + 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -7442,9 +7446,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm256_cvtepi32_epi8 (__m256i __A) { - return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, - (__v16qi)_mm_undefined_si128(), - (__mmask8) -1); + return (__m128i)__builtin_shufflevector( +
[PATCH] D48712: [X86] Lowering integer truncation intrinsics to native IR
mike.dvoretsky updated this revision to Diff 153471. mike.dvoretsky marked 2 inline comments as done. mike.dvoretsky added a comment. Updated per comments. Typedefs for intermediate short vectors moved into the bodies of the functions using them. https://reviews.llvm.org/D48712 Files: clang/lib/Headers/avx512vlbwintrin.h clang/lib/Headers/avx512vlintrin.h clang/test/CodeGen/avx512vl-builtins.c clang/test/CodeGen/avx512vlbw-builtins.c Index: clang/test/CodeGen/avx512vlbw-builtins.c === --- clang/test/CodeGen/avx512vlbw-builtins.c +++ clang/test/CodeGen/avx512vlbw-builtins.c @@ -1792,7 +1792,8 @@ __m128i test_mm_cvtepi16_epi8(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi16_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.wb.128 + // CHECK: trunc <8 x i16> %{{.*}} to <8 x i8> + // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> return _mm_cvtepi16_epi8(__A); } Index: clang/test/CodeGen/avx512vl-builtins.c === --- clang/test/CodeGen/avx512vl-builtins.c +++ clang/test/CodeGen/avx512vl-builtins.c @@ -6974,7 +6974,8 @@ __m128i test_mm_cvtepi32_epi8(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi32_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.db.128 + // CHECK: trunc <4 x i32> %{{.*}} to <4 x i8> + // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> return _mm_cvtepi32_epi8(__A); } @@ -6998,7 +6999,8 @@ __m128i test_mm256_cvtepi32_epi8(__m256i __A) { // CHECK-LABEL: @test_mm256_cvtepi32_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.db.256 + // CHECK: trunc <8 x i32> %{{.*}} to <8 x i8> + // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> return _mm256_cvtepi32_epi8(__A); } @@ -7022,7 +7024,8 @@ __m128i test_mm_cvtepi32_epi16(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi32_epi16 - // CHECK: @llvm.x86.avx512.mask.pmov.dw.128 + // CHECK: trunc <4 x i32> %{{.*}} to <4 x i16> + // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> return _mm_cvtepi32_epi16(__A); } @@ -7070,7 +7073,8 @@ __m128i test_mm_cvtepi64_epi8(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi64_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.qb.128 + // CHECK: trunc <2 x i64> %{{.*}} to <2 x i8> + // CHECK: shufflevector <2 x i8> %{{.*}}, <2 x i8> %{{.*}}, <16 x i32> return _mm_cvtepi64_epi8(__A); } @@ -7094,7 +7098,8 @@ __m128i test_mm256_cvtepi64_epi8(__m256i __A) { // CHECK-LABEL: @test_mm256_cvtepi64_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.qb.256 + // CHECK: trunc <4 x i64> %{{.*}} to <4 x i8> + // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> return _mm256_cvtepi64_epi8(__A); } @@ -7118,7 +7123,8 @@ __m128i test_mm_cvtepi64_epi32(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi64_epi32 - // CHECK: @llvm.x86.avx512.mask.pmov.qd.128 + // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32> + // CHECK: shufflevector <2 x i32> %{{.*}}, <2 x i32> %{{.*}}, <4 x i32> return _mm_cvtepi64_epi32(__A); } @@ -7168,7 +7174,8 @@ __m128i test_mm_cvtepi64_epi16(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi64_epi16 - // CHECK: @llvm.x86.avx512.mask.pmov.qw.128 + // CHECK: trunc <2 x i64> %{{.*}} to <2 x i16> + // CHECK: shufflevector <2 x i16> %{{.*}}, <2 x i16> %{{.*}}, <8 x i32> return _mm_cvtepi64_epi16(__A); } @@ -7192,7 +7199,8 @@ __m128i test_mm256_cvtepi64_epi16(__m256i __A) { // CHECK-LABEL: @test_mm256_cvtepi64_epi16 - // CHECK: @llvm.x86.avx512.mask.pmov.qw.256 + // CHECK: trunc <4 x i64> %{{.*}} to <4 x i16> + // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> return _mm256_cvtepi64_epi16(__A); } Index: clang/lib/Headers/avx512vlintrin.h === --- clang/lib/Headers/avx512vlintrin.h +++ clang/lib/Headers/avx512vlintrin.h @@ -7412,9 +7412,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi8 (__m128i __A) { - return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, - (__v16qi)_mm_undefined_si128(), - (__mmask8) -1); + typedef char __v4qi __attribute__((__vector_size__(4))); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v4si)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1, + 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -7442,9 +7443,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm256_cvtepi32_epi8 (__m256i __A) { - return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, - (__v16qi)_mm_undefined_si128(), - (__mmask8) -1); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v8si)__A, __v8qi), + (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); } static _
[PATCH] D48712: [X86] Lowering integer truncation intrinsics to native IR
mike.dvoretsky added inline comments. Comment at: clang/lib/Headers/avx512vlbwintrin.h:1501 + (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); } RKSimon wrote: > Are we happy with using illegal types like this? What about flipping the > shuffle and convert? > > ``` > return (__m128i)__builtin_convertvector( > __builtin_shufflevector((__v8hi)__A, > (__v8hi){0, 0, 0, 0, 0, 0, 0, 0}, > 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, > 11, 12, 13, 14, 15), __v16qi); > ``` This would bring its own issues, since in the cvtepi64_epi8 cases the inner shuffle would produce vectors of 16 64-bit values. There would be no extra typedef, but in the back-end these would be split in type legalization, making it harder to fold them into VPMOV instructions. https://reviews.llvm.org/D48712 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D48712: [X86] Lowering integer truncation intrinsics to native IR
This revision was automatically updated to reflect the committed changes. Closed by commit rC336643: [X86] Lowering integer truncation intrinsics to native IR (authored by mike.dvoretsky, committed by ). Changed prior to commit: https://reviews.llvm.org/D48712?vs=153471&id=154765#toc Repository: rC Clang https://reviews.llvm.org/D48712 Files: lib/Headers/avx512vlbwintrin.h lib/Headers/avx512vlintrin.h test/CodeGen/avx512vl-builtins.c test/CodeGen/avx512vlbw-builtins.c Index: test/CodeGen/avx512vl-builtins.c === --- test/CodeGen/avx512vl-builtins.c +++ test/CodeGen/avx512vl-builtins.c @@ -8503,7 +8503,8 @@ __m128i test_mm_cvtepi32_epi8(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi32_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.db.128 + // CHECK: trunc <4 x i32> %{{.*}} to <4 x i8> + // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> return _mm_cvtepi32_epi8(__A); } @@ -8527,7 +8528,8 @@ __m128i test_mm256_cvtepi32_epi8(__m256i __A) { // CHECK-LABEL: @test_mm256_cvtepi32_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.db.256 + // CHECK: trunc <8 x i32> %{{.*}} to <8 x i8> + // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> return _mm256_cvtepi32_epi8(__A); } @@ -8551,7 +8553,8 @@ __m128i test_mm_cvtepi32_epi16(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi32_epi16 - // CHECK: @llvm.x86.avx512.mask.pmov.dw.128 + // CHECK: trunc <4 x i32> %{{.*}} to <4 x i16> + // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> return _mm_cvtepi32_epi16(__A); } @@ -8599,7 +8602,8 @@ __m128i test_mm_cvtepi64_epi8(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi64_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.qb.128 + // CHECK: trunc <2 x i64> %{{.*}} to <2 x i8> + // CHECK: shufflevector <2 x i8> %{{.*}}, <2 x i8> %{{.*}}, <16 x i32> return _mm_cvtepi64_epi8(__A); } @@ -8623,7 +8627,8 @@ __m128i test_mm256_cvtepi64_epi8(__m256i __A) { // CHECK-LABEL: @test_mm256_cvtepi64_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.qb.256 + // CHECK: trunc <4 x i64> %{{.*}} to <4 x i8> + // CHECK: shufflevector <4 x i8> %{{.*}}, <4 x i8> %{{.*}}, <16 x i32> return _mm256_cvtepi64_epi8(__A); } @@ -8647,7 +8652,8 @@ __m128i test_mm_cvtepi64_epi32(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi64_epi32 - // CHECK: @llvm.x86.avx512.mask.pmov.qd.128 + // CHECK: trunc <2 x i64> %{{.*}} to <2 x i32> + // CHECK: shufflevector <2 x i32> %{{.*}}, <2 x i32> %{{.*}}, <4 x i32> return _mm_cvtepi64_epi32(__A); } @@ -8697,7 +8703,8 @@ __m128i test_mm_cvtepi64_epi16(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi64_epi16 - // CHECK: @llvm.x86.avx512.mask.pmov.qw.128 + // CHECK: trunc <2 x i64> %{{.*}} to <2 x i16> + // CHECK: shufflevector <2 x i16> %{{.*}}, <2 x i16> %{{.*}}, <8 x i32> return _mm_cvtepi64_epi16(__A); } @@ -8721,7 +8728,8 @@ __m128i test_mm256_cvtepi64_epi16(__m256i __A) { // CHECK-LABEL: @test_mm256_cvtepi64_epi16 - // CHECK: @llvm.x86.avx512.mask.pmov.qw.256 + // CHECK: trunc <4 x i64> %{{.*}} to <4 x i16> + // CHECK: shufflevector <4 x i16> %{{.*}}, <4 x i16> %{{.*}}, <8 x i32> return _mm256_cvtepi64_epi16(__A); } Index: test/CodeGen/avx512vlbw-builtins.c === --- test/CodeGen/avx512vlbw-builtins.c +++ test/CodeGen/avx512vlbw-builtins.c @@ -1792,7 +1792,8 @@ __m128i test_mm_cvtepi16_epi8(__m128i __A) { // CHECK-LABEL: @test_mm_cvtepi16_epi8 - // CHECK: @llvm.x86.avx512.mask.pmov.wb.128 + // CHECK: trunc <8 x i16> %{{.*}} to <8 x i8> + // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> return _mm_cvtepi16_epi8(__A); } Index: lib/Headers/avx512vlintrin.h === --- lib/Headers/avx512vlintrin.h +++ lib/Headers/avx512vlintrin.h @@ -31,6 +31,10 @@ #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(128))) #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(256))) +typedef short __v2hi __attribute__((__vector_size__(4))); +typedef char __v4qi __attribute__((__vector_size__(4))); +typedef char __v2qi __attribute__((__vector_size__(2))); + /* Integer compare */ #define _mm_cmpeq_epi32_mask(A, B) \ @@ -7341,9 +7345,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtepi32_epi8 (__m128i __A) { - return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, - (__v16qi)_mm_undefined_si128(), - (__mmask8) -1); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v4si)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1, + 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -7371,9 +73
[PATCH] D45202: [X86] Replacing X86-specific floor and ceil vector intrinsics with generic LLVM intrinsics
mike.dvoretsky created this revision. mike.dvoretsky added reviewers: craig.topper, spatel, RKSimon. Herald added a subscriber: cfe-commits. Currently, X86 floor and ceil intrinsics for vectors are implemented as target-specific intrinsics that use the generic rounding instruction of the corresponding vector processing feature (ROUND* or VRNDSCALE*). This patch replaces those specific cases with calls to target-independent @llvm.floor.* and @llvm.ceil.* intrinsics. This doesn't affect the resulting machine code, as those intrinsics are lowered to the same instructions, but exposes these specific rounding cases to generic optimizations. Repository: rC Clang https://reviews.llvm.org/D45202 Files: include/clang/Basic/BuiltinsX86.def lib/CodeGen/CGBuiltin.cpp lib/Headers/avx512fintrin.h lib/Headers/avxintrin.h lib/Headers/smmintrin.h test/CodeGen/avx-builtins.c test/CodeGen/avx512f-builtins.c test/CodeGen/sse41-builtins.c Index: test/CodeGen/sse41-builtins.c === --- test/CodeGen/sse41-builtins.c +++ test/CodeGen/sse41-builtins.c @@ -44,25 +44,29 @@ __m128d test_mm_ceil_pd(__m128d x) { // CHECK-LABEL: test_mm_ceil_pd - // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 2) + // CHECK: @llvm.ceil.v2f64 + // CHECK-NOT: select return _mm_ceil_pd(x); } __m128 test_mm_ceil_ps(__m128 x) { // CHECK-LABEL: test_mm_ceil_ps - // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2) + // CHECK: @llvm.ceil.v4f32 + // CHECK-NOT: select return _mm_ceil_ps(x); } __m128d test_mm_ceil_sd(__m128d x, __m128d y) { // CHECK-LABEL: test_mm_ceil_sd - // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 2) + // CHECK: @llvm.ceil.v2f64 + // CHECK: select return _mm_ceil_sd(x, y); } __m128 test_mm_ceil_ss(__m128 x, __m128 y) { // CHECK-LABEL: test_mm_ceil_ss - // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 2) + // CHECK: @llvm.ceil.v4f32 + // CHECK: select return _mm_ceil_ss(x, y); } @@ -196,25 +200,29 @@ __m128d test_mm_floor_pd(__m128d x) { // CHECK-LABEL: test_mm_floor_pd - // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 1) + // CHECK: @llvm.floor.v2f64 + // CHECK-NOT: select return _mm_floor_pd(x); } __m128 test_mm_floor_ps(__m128 x) { // CHECK-LABEL: test_mm_floor_ps - // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 1) + // CHECK: @llvm.floor.v4f32 + // CHECK-NOT: select return _mm_floor_ps(x); } __m128d test_mm_floor_sd(__m128d x, __m128d y) { // CHECK-LABEL: test_mm_floor_sd - // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 1) + // CHECK: @llvm.floor.v2f64 + // CHECK: select return _mm_floor_sd(x, y); } __m128 test_mm_floor_ss(__m128 x, __m128 y) { // CHECK-LABEL: test_mm_floor_ss - // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 1) + // CHECK: @llvm.floor.v4f32 + // CHECK: select return _mm_floor_ss(x, y); } Index: test/CodeGen/avx512f-builtins.c === --- test/CodeGen/avx512f-builtins.c +++ test/CodeGen/avx512f-builtins.c @@ -7485,31 +7485,67 @@ return _mm512_min_round_ps(__A,__B,_MM_FROUND_CUR_DIRECTION); } +__m512 test_mm512_floor_ps(__m512 __A) +{ + // CHECK-LABEL: @test_mm512_floor_ps + // CHECK: @llvm.floor.v16f32 + // CHECK-NOT: select + return _mm512_floor_ps(__A); +} + +__m512d test_mm512_floor_pd(__m512d __A) +{ + // CHECK-LABEL: @test_mm512_floor_pd + // CHECK: @llvm.floor.v8f64 + // CHECK-NOT: select + return _mm512_floor_pd(__A); +} + __m512 test_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A) { - // CHECK-LABEL: @test_mm512_mask_floor_ps - // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512 + // CHECK-LABEL: @test_mm512_mask_floor_ps + // CHECK: @llvm.floor.v16f32 + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_floor_ps (__W,__U,__A); } __m512d test_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A) { - // CHECK-LABEL: @test_mm512_mask_floor_pd - // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512 + // CHECK-LABEL: @test_mm512_mask_floor_pd + // CHECK: @llvm.floor.v8f64 + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_floor_pd (__W,__U,__A); } +__m512 test_mm512_ceil_ps(__m512 __A) +{ + // CHECK-LABEL: @test_mm512_ceil_ps + // CHECK: @llvm.ceil.v16f32 + // CHECK-NOT: select + return _mm512_ceil_ps(__A); +} + +__m512d test_mm512_ceil_pd(__m512d __A) +{ + // CHECK-LABEL: @test_mm512_ceil_pd + // CHECK: @llvm.ceil.v8f64 + // CHECK-NOT: select + return _mm512_ceil_pd(__A); +} + __m512 test_
[PATCH] D45202: [X86] Replacing X86-specific floor and ceil vector intrinsics with generic LLVM intrinsics
mike.dvoretsky updated this revision to Diff 140972. mike.dvoretsky edited the summary of this revision. mike.dvoretsky added a comment. On suggestion from @craig.topper moved all lowering to CGBuiltin.cpp with no new builtins added. Instead the existing builtins are lowered if their immediate values correspond to generic ceil and floor operations. https://reviews.llvm.org/D45203 is now required to enable transformations. https://reviews.llvm.org/D45202 Files: clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/avx-builtins.c clang/test/CodeGen/avx512f-builtins.c clang/test/CodeGen/sse41-builtins.c Index: clang/test/CodeGen/sse41-builtins.c === --- clang/test/CodeGen/sse41-builtins.c +++ clang/test/CodeGen/sse41-builtins.c @@ -44,25 +44,29 @@ __m128d test_mm_ceil_pd(__m128d x) { // CHECK-LABEL: test_mm_ceil_pd - // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 2) + // CHECK: @llvm.ceil.v2f64 + // CHECK-NOT: select return _mm_ceil_pd(x); } __m128 test_mm_ceil_ps(__m128 x) { // CHECK-LABEL: test_mm_ceil_ps - // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2) + // CHECK: @llvm.ceil.v4f32 + // CHECK-NOT: select return _mm_ceil_ps(x); } __m128d test_mm_ceil_sd(__m128d x, __m128d y) { // CHECK-LABEL: test_mm_ceil_sd - // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 2) + // CHECK: @llvm.ceil.v2f64 + // CHECK: select return _mm_ceil_sd(x, y); } __m128 test_mm_ceil_ss(__m128 x, __m128 y) { // CHECK-LABEL: test_mm_ceil_ss - // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 2) + // CHECK: @llvm.ceil.v4f32 + // CHECK: select return _mm_ceil_ss(x, y); } @@ -196,25 +200,29 @@ __m128d test_mm_floor_pd(__m128d x) { // CHECK-LABEL: test_mm_floor_pd - // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 1) + // CHECK: @llvm.floor.v2f64 + // CHECK-NOT: select return _mm_floor_pd(x); } __m128 test_mm_floor_ps(__m128 x) { // CHECK-LABEL: test_mm_floor_ps - // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 1) + // CHECK: @llvm.floor.v4f32 + // CHECK-NOT: select return _mm_floor_ps(x); } __m128d test_mm_floor_sd(__m128d x, __m128d y) { // CHECK-LABEL: test_mm_floor_sd - // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 1) + // CHECK: @llvm.floor.v2f64 + // CHECK: select return _mm_floor_sd(x, y); } __m128 test_mm_floor_ss(__m128 x, __m128 y) { // CHECK-LABEL: test_mm_floor_ss - // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 1) + // CHECK: @llvm.floor.v4f32 + // CHECK: select return _mm_floor_ss(x, y); } Index: clang/test/CodeGen/avx512f-builtins.c === --- clang/test/CodeGen/avx512f-builtins.c +++ clang/test/CodeGen/avx512f-builtins.c @@ -7485,46 +7485,98 @@ return _mm512_min_round_ps(__A,__B,_MM_FROUND_CUR_DIRECTION); } +__m512 test_mm512_floor_ps(__m512 __A) +{ + // CHECK-LABEL: @test_mm512_floor_ps + // CHECK: @llvm.floor.v16f32 + // CHECK-NOT: select + return _mm512_floor_ps(__A); +} + +__m512d test_mm512_floor_pd(__m512d __A) +{ + // CHECK-LABEL: @test_mm512_floor_pd + // CHECK: @llvm.floor.v8f64 + // CHECK-NOT: select + return _mm512_floor_pd(__A); +} + __m512 test_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A) { - // CHECK-LABEL: @test_mm512_mask_floor_ps - // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512 + // CHECK-LABEL: @test_mm512_mask_floor_ps + // CHECK: @llvm.floor.v16f32 + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_floor_ps (__W,__U,__A); } __m512d test_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A) { - // CHECK-LABEL: @test_mm512_mask_floor_pd - // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512 + // CHECK-LABEL: @test_mm512_mask_floor_pd + // CHECK: @llvm.floor.v8f64 + // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_floor_pd (__W,__U,__A); } +__m512 test_mm512_ceil_ps(__m512 __A) +{ + // CHECK-LABEL: @test_mm512_ceil_ps + // CHECK: @llvm.ceil.v16f32 + // CHECK-NOT: select + return _mm512_ceil_ps(__A); +} + +__m512d test_mm512_ceil_pd(__m512d __A) +{ + // CHECK-LABEL: @test_mm512_ceil_pd + // CHECK: @llvm.ceil.v8f64 + // CHECK-NOT: select + return _mm512_ceil_pd(__A); +} + __m512 test_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A) { - // CHECK-LABEL: @test_mm512_mask_ceil_ps - // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512 + // CHECK-LABEL: @test_mm512_mask_ceil_ps + // CHECK: @llvm.ceil.v16f32 + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float
[PATCH] D45202: [X86] Replacing X86-specific floor and ceil vector intrinsics with generic LLVM intrinsics
mike.dvoretsky added inline comments. Comment at: clang/lib/CodeGen/CGBuiltin.cpp:8307 +Dst = Ops[0]; +Mask = llvm::ConstantInt::get(CGF.Builder.getInt32Ty(), 1); + } else { craig.topper wrote: > I'm not sure we should even try to emit a mask for the legacy scalar > intrinsics. Does this get removed well by the middle or backend? The masking is done to represent all operations handled here in a uniform way. D45203 removes it in the backend. https://reviews.llvm.org/D45202 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits