Author: Rohit Aggarwal
Date: 2024-02-15T12:13:07+05:30
New Revision: 36adfec155de366d722f2bac8ff9162289dcf06c
URL: https://github.com/llvm/llvm-project/commit/36adfec155de366d722f2bac8ff9162289dcf06c
DIFF: https://github.com/llvm/llvm-project/commit/36adfec155de366d722f2bac8ff9162289dcf06c.diff

LOG: Add support for the AMDLIBM vector library (#78560)

AMD provides its own implementation of vectorized math functions. This
patch adds the changes needed to enable the use of AMD's math library
via -fveclib=AMDLIBM. For details, please refer to
https://github.com/amd/aocl-libm-ose.

---------

Co-authored-by: Rohit Aggarwal <rohit.aggar...@amd.com>

Added: 
    llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll
    llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll

Modified: 
    clang/include/clang/Driver/Options.td
    clang/test/Driver/autocomplete.c
    llvm/include/llvm/Analysis/TargetLibraryInfo.h
    llvm/include/llvm/Analysis/VecFuncs.def
    llvm/include/llvm/Frontend/Driver/CodeGenOptions.h
    llvm/lib/Analysis/TargetLibraryInfo.cpp
    llvm/lib/Frontend/Driver/CodeGenOptions.cpp
    llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll
    llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
    llvm/test/Transforms/Util/add-TLI-mappings.ll

Removed: 
    

################################################################################
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 95b464e7d61834..b302afd65e2811 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3204,10 +3204,10 @@ def fno_experimental_isel : Flag<["-"], "fno-experimental-isel">, Group<f_clang_
 def fveclib : Joined<["-"], "fveclib=">, Group<f_Group>,
   Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
   HelpText<"Use the given vector functions library">,
-  Values<"Accelerate,libmvec,MASSV,SVML,SLEEF,Darwin_libsystem_m,ArmPL,none">,
+  Values<"Accelerate,libmvec,MASSV,SVML,SLEEF,Darwin_libsystem_m,ArmPL,AMDLIBM,none">,
   NormalizedValuesScope<"llvm::driver::VectorLibrary">,
   NormalizedValues<["Accelerate", "LIBMVEC", "MASSV", "SVML", "SLEEF",
-                    "Darwin_libsystem_m", "ArmPL", "NoLibrary"]>,
+                    "Darwin_libsystem_m", "ArmPL", "AMDLIBM", "NoLibrary"]>,
   MarshallingInfoEnum<CodeGenOpts<"VecLib">, "NoLibrary">;
 def fno_lax_vector_conversions : Flag<["-"], "fno-lax-vector-conversions">,
   Group<f_Group>, Alias<flax_vector_conversions_EQ>, AliasArgs<["none"]>;

diff --git a/clang/test/Driver/autocomplete.c b/clang/test/Driver/autocomplete.c
index d6f57708b67eb6..c8ceaaf404672f 100644
--- a/clang/test/Driver/autocomplete.c
+++ b/clang/test/Driver/autocomplete.c
@@ -80,6 +80,7 @@
 // FLTOALL-NEXT: thin
 // RUN: %clang --autocomplete=-fveclib= | FileCheck %s -check-prefix=FVECLIBALL
 // FVECLIBALL: Accelerate
+// FVECLIBALL-NEXT: AMDLIBM
 // FVECLIBALL-NEXT: ArmPL
 // FVECLIBALL-NEXT: Darwin_libsystem_m
 // FVECLIBALL-NEXT: libmvec

diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
index daf1d8e2079f85..46f31f918e7b61 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
@@ -129,7 +129,8 @@ class TargetLibraryInfoImpl {
     MASSV,       // IBM MASS vector library.
     SVML,        // Intel short vector math library.
     SLEEFGNUABI, // SLEEF - SIMD Library for Evaluating Elementary Functions.
-    ArmPL        // Arm Performance Libraries.
+    ArmPL,       // Arm Performance Libraries.
+    AMDLIBM      // AMD Math Vector library.
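+    // Descriptive note (added for this write-up, not in the upstream
+    // header): this enumerator is chosen via clang's -fveclib=AMDLIBM or
+    // opt's -vector-library=AMDLIBM, and makes
+    // addVectorizableFunctionsFromVecLib() register the AMDLIBM entries
+    // from VecFuncs.def.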
   };

   TargetLibraryInfoImpl();

diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index 07edf68c667a27..394e4a05fbc0cf 100644
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -1067,6 +1067,199 @@ TLI_DEFINE_VECFUNC("tgammaf", "armpl_vtgammaq_f32", FIXED(4), NOMASK, "_ZGV_LLVM
 TLI_DEFINE_VECFUNC("tgamma", "armpl_svtgamma_f64_x", SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("tgammaf", "armpl_svtgamma_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
 
+#elif defined(TLI_DEFINE_AMDLIBM_VECFUNCS)
+TLI_DEFINE_VECFUNC("sinf", "amd_vrs16_sinf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("sinf", "amd_vrs8_sinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("sinf", "amd_vrs4_sinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("sin", "amd_vrd8_sin", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("sin", "amd_vrd4_sin", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("sin", "amd_vrd2_sin", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "amd_vrs16_sinf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "amd_vrs8_sinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "amd_vrs4_sinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "amd_vrd8_sin", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "amd_vrd4_sin", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "amd_vrd2_sin", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+
+TLI_DEFINE_VECFUNC("cosf", "amd_vrs16_cosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("cosf", "amd_vrs8_cosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("cosf", "amd_vrs4_cosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("cos", "amd_vrd8_cos", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("cos", "amd_vrd4_cos", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("cos", "amd_vrd2_cos", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "amd_vrs16_cosf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "amd_vrs8_cosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "amd_vrs4_cosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "amd_vrd8_cos", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "amd_vrd4_cos", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "amd_vrd2_cos", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+
+TLI_DEFINE_VECFUNC("expf", "amd_vrs16_expf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("expf", "amd_vrs8_expf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("expf", "amd_vrs4_expf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("exp", "amd_vrd2_exp", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("exp", "amd_vrd4_exp", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("exp", "amd_vrd8_exp", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
+TLI_DEFINE_VECFUNC("__expf_finite", "amd_vrs16_expf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("__expf_finite", "amd_vrs8_expf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__expf_finite", "amd_vrs4_expf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__exp_finite", "amd_vrd2_exp", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("__exp_finite", "amd_vrd4_exp", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__exp_finite", "amd_vrd8_exp", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "amd_vrs16_expf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "amd_vrs8_expf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "amd_vrs4_expf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "amd_vrd2_exp", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "amd_vrd4_exp", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "amd_vrd8_exp", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
+TLI_DEFINE_VECFUNC("exp2f", "amd_vrs16_exp2f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("exp2f", "amd_vrs8_exp2f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("exp2f", "amd_vrs4_exp2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("exp2", "amd_vrd2_exp2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("exp2", "amd_vrd4_exp2", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("exp2", "amd_vrd8_exp2", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
+TLI_DEFINE_VECFUNC("__exp2f_finite", "amd_vrs16_exp2f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("__exp2f_finite", "amd_vrs8_exp2f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__exp2f_finite", "amd_vrs4_exp2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__exp2_finite", "amd_vrd2_exp2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("__exp2_finite", "amd_vrd4_exp2", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__exp2_finite", "amd_vrd8_exp2", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "amd_vrs16_exp2f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "amd_vrs8_exp2f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "amd_vrs4_exp2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.exp2.f64", "amd_vrd2_exp2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.exp2.f64", "amd_vrd4_exp2", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.exp2.f64", "amd_vrd8_exp2", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
+TLI_DEFINE_VECFUNC("powf", "amd_vrs16_powf", FIXED(16), NOMASK, "_ZGV_LLVM_N16vv")
+TLI_DEFINE_VECFUNC("powf", "amd_vrs8_powf", FIXED(8), NOMASK, "_ZGV_LLVM_N8vv")
+TLI_DEFINE_VECFUNC("powf", "amd_vrs4_powf", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("pow", "amd_vrd2_pow", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("pow", "amd_vrd4_pow", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("pow", "amd_vrd8_pow", FIXED(8), NOMASK, "_ZGV_LLVM_N8vv")
+
+TLI_DEFINE_VECFUNC("__powf_finite", "amd_vrs16_powf", FIXED(16), NOMASK, "_ZGV_LLVM_N16vv")
+TLI_DEFINE_VECFUNC("__powf_finite", "amd_vrs8_powf", FIXED(8), NOMASK, "_ZGV_LLVM_N8vv")
+TLI_DEFINE_VECFUNC("__powf_finite", "amd_vrs4_powf", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("__pow_finite", "amd_vrd2_pow", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("__pow_finite", "amd_vrd4_pow", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("__pow_finite", "amd_vrd8_pow", FIXED(8), NOMASK, "_ZGV_LLVM_N8vv")
+
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "amd_vrs16_powf", FIXED(16), NOMASK, "_ZGV_LLVM_N16vv")
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "amd_vrs8_powf", FIXED(8), NOMASK, "_ZGV_LLVM_N8vv")
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "amd_vrs4_powf", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "amd_vrd2_pow", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "amd_vrd4_pow", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "amd_vrd8_pow", FIXED(8), NOMASK, "_ZGV_LLVM_N8vv")
+
+TLI_DEFINE_VECFUNC("logf", "amd_vrs16_logf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("logf", "amd_vrs8_logf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("logf", "amd_vrs4_logf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log", "amd_vrd2_log", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("log", "amd_vrd4_log", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log", "amd_vrd8_log", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
+TLI_DEFINE_VECFUNC("__logf_finite", "amd_vrs16_logf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("__logf_finite", "amd_vrs8_logf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__logf_finite", "amd_vrs4_logf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__log_finite", "amd_vrd2_log", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("__log_finite", "amd_vrd4_log", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__log_finite", "amd_vrd8_log", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
+TLI_DEFINE_VECFUNC("llvm.log.f32", "amd_vrs16_logf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("llvm.log.f32", "amd_vrs8_logf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.log.f32", "amd_vrs4_logf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log.f64", "amd_vrd2_log", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.log.f64", "amd_vrd4_log", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log.f64", "amd_vrd8_log", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
+TLI_DEFINE_VECFUNC("log2f", "amd_vrs16_log2f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("log2f", "amd_vrs8_log2f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("log2f", "amd_vrs4_log2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log2", "amd_vrd2_log2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("log2", "amd_vrd4_log2", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("log2", "amd_vrd8_log2", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
+TLI_DEFINE_VECFUNC("__log2f_finite", "amd_vrs16_log2f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("__log2f_finite", "amd_vrs8_log2f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__log2f_finite", "amd_vrs4_log2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__log2_finite", "amd_vrd2_log2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("__log2_finite", "amd_vrd4_log2", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("__log2_finite", "amd_vrd8_log2", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
+TLI_DEFINE_VECFUNC("llvm.log2.f32", "amd_vrs16_log2f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("llvm.log2.f32", "amd_vrs8_log2f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.log2.f32", "amd_vrs4_log2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log2.f64", "amd_vrd2_log2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.log2.f64", "amd_vrd4_log2", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.log2.f64", "amd_vrd8_log2", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
+TLI_DEFINE_VECFUNC("log10f", "amd_vrs16_log10f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("log10f", "amd_vrs8_log10f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("log10f", "amd_vrs4_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+
+TLI_DEFINE_VECFUNC("__log10f_finite", "amd_vrs16_log10f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("__log10f_finite", "amd_vrs8_log10f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("__log10f_finite", "amd_vrs4_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "amd_vrs16_log10f", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "amd_vrs8_log10f", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "amd_vrs4_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+
+TLI_DEFINE_VECFUNC("erff", "amd_vrs4_erff", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("erff", "amd_vrs8_erff", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("erff", "amd_vrs16_erff", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("erf", "amd_vrd2_erf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("erf", "amd_vrd4_erf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("erf", "amd_vrd8_erf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
+TLI_DEFINE_VECFUNC("exp10", "amd_vrd2_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("exp10f", "amd_vrs4_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+
+TLI_DEFINE_VECFUNC("expm1", "amd_vrd2_expm1", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("expm1f", "amd_vrs4_expm1f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+
+TLI_DEFINE_VECFUNC("log1p", "amd_vrd2_log1p", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("log1pf", "amd_vrs4_log1pf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+
+TLI_DEFINE_VECFUNC("tan", "amd_vrd2_tan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("tan", "amd_vrd4_tan", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("tan", "amd_vrd8_tan", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("tanf", "amd_vrs4_tanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("tanf", "amd_vrs8_tanf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("tanf", "amd_vrs16_tanf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+
+TLI_DEFINE_VECFUNC("asin", "amd_vrd8_asin", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("asinf", "amd_vrs4_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("asinf", "amd_vrs8_asinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("asinf", "amd_vrs16_asinf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+
+TLI_DEFINE_VECFUNC("acosf", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("acosf", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
+TLI_DEFINE_VECFUNC("atan", "amd_vrd2_atan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("atan", "amd_vrd4_atan", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("atan", "amd_vrd8_atan", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("atanf", "amd_vrs4_atanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("atanf", "amd_vrs8_atanf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("atanf", "amd_vrs16_atanf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+
+TLI_DEFINE_VECFUNC("coshf", "amd_vrs4_coshf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("coshf", "amd_vrs8_coshf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
+TLI_DEFINE_VECFUNC("tanhf", "amd_vrs4_tanhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("tanhf", "amd_vrs8_tanhf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
+TLI_DEFINE_VECFUNC("cbrt", "amd_vrd2_cbrt", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("cbrtf", "amd_vrs4_cbrtf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+
 #else
 #error "Must choose which vector library functions are to be defined."
 #endif
@@ -1087,3 +1280,4 @@ TLI_DEFINE_VECFUNC("tgammaf", "armpl_svtgamma_f32_x", SCALABLE(4), MASKED, "_ZGV
 #undef TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS
 #undef TLI_DEFINE_MASSV_VECFUNCS_NAMES
 #undef TLI_DEFINE_ARMPL_VECFUNCS
+#undef TLI_DEFINE_AMDLIBM_VECFUNCS

diff --git a/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h b/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h
index 0b1d924a26b2de..0180670c4c6991 100644
--- a/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h
+++ b/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h
@@ -29,7 +29,8 @@ enum class VectorLibrary {
   SVML,               // Intel short vector math library.
   SLEEF,              // SLEEF SIMD Library for Evaluating Elementary Functions.
   Darwin_libsystem_m, // Use Darwin's libsystem_m vector functions.
-  ArmPL               // Arm Performance Libraries.
+  ArmPL,              // Arm Performance Libraries.
+  AMDLIBM             // AMD vector math library.
 };
 
 TargetLibraryInfoImpl *createTLII(llvm::Triple &TargetTriple,

diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index 25951d2a7fe63c..710762a6c0ad1d 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -37,7 +37,9 @@ static cl::opt<TargetLibraryInfoImpl::VectorLibrary> ClVectorLibrary(
                clEnumValN(TargetLibraryInfoImpl::SLEEFGNUABI, "sleefgnuabi",
                           "SIMD Library for Evaluating Elementary Functions"),
                clEnumValN(TargetLibraryInfoImpl::ArmPL, "ArmPL",
-                          "Arm Performance Libraries")));
+                          "Arm Performance Libraries"),
+               clEnumValN(TargetLibraryInfoImpl::AMDLIBM, "AMDLIBM",
+                          "AMD vector math library")));
 
 StringLiteral const TargetLibraryInfoImpl::StandardNames[LibFunc::NumLibFuncs] =
     {
@@ -1273,6 +1275,16 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
     }
     break;
   }
+  case AMDLIBM: {
+    const VecDesc VecFuncs[] = {
+#define TLI_DEFINE_AMDLIBM_VECFUNCS
+#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX)                   \
+  {SCAL, VEC, VF, MASK, VABI_PREFIX},
+#include "llvm/Analysis/VecFuncs.def"
+    };
+    addVectorizableFunctions(VecFuncs);
+    break;
+  }
   case NoLibrary:
     break;
   }

diff --git a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp
index 96c5b19a4a5913..2d74a91f62dc07 100644
--- a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp
+++ b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp
@@ -46,6 +46,10 @@ TargetLibraryInfoImpl *createTLII(llvm::Triple &TargetTriple,
     TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::ArmPL,
                                              TargetTriple);
     break;
+  case VectorLibrary::AMDLIBM:
+    TLII->addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::AMDLIBM,
+                                             TargetTriple);
+    break;
   default:
     break;
   }

diff --git a/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll b/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll
index df8b7c498bd002..fde6cb788b46f9 100644
--- a/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll
+++ b/llvm/test/CodeGen/Generic/replace-intrinsics-with-veclib.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes
 ; RUN: opt -vector-library=SVML -replace-with-veclib -S < %s | FileCheck %s --check-prefixes=COMMON,SVML
+; RUN: opt -vector-library=AMDLIBM -replace-with-veclib -S < %s | FileCheck %s --check-prefixes=COMMON,AMDLIBM
 ; RUN: opt -vector-library=LIBMVEC-X86 -replace-with-veclib -S < %s | FileCheck %s --check-prefixes=COMMON,LIBMVEC-X86
 ; RUN: opt -vector-library=MASSV -replace-with-veclib -S < %s |
FileCheck %s --check-prefixes=COMMON,MASSV ; RUN: opt -vector-library=Accelerate -replace-with-veclib -S < %s | FileCheck %s --check-prefixes=COMMON,ACCELERATE @@ -13,6 +14,11 @@ define <4 x double> @exp_v4(<4 x double> %in) { ; SVML-NEXT: [[TMP1:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[IN]]) ; SVML-NEXT: ret <4 x double> [[TMP1]] ; +; AMDLIBM-LABEL: define {{[^@]+}}@exp_v4 +; AMDLIBM-SAME: (<4 x double> [[IN:%.*]]) { +; AMDLIBM-NEXT: [[TMP1:%.*]] = call <4 x double> @amd_vrd4_exp(<4 x double> [[IN]]) +; AMDLIBM-NEXT: ret <4 x double> [[TMP1]] +; ; LIBMVEC-X86-LABEL: define {{[^@]+}}@exp_v4 ; LIBMVEC-X86-SAME: (<4 x double> [[IN:%.*]]) { ; LIBMVEC-X86-NEXT: [[TMP1:%.*]] = call <4 x double> @_ZGVdN4v_exp(<4 x double> [[IN]]) @@ -40,6 +46,11 @@ define <4 x float> @exp_f32(<4 x float> %in) { ; SVML-NEXT: [[TMP1:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[IN]]) ; SVML-NEXT: ret <4 x float> [[TMP1]] ; +; AMDLIBM-LABEL: define {{[^@]+}}@exp_f32 +; AMDLIBM-SAME: (<4 x float> [[IN:%.*]]) { +; AMDLIBM-NEXT: [[TMP1:%.*]] = call <4 x float> @amd_vrs4_expf(<4 x float> [[IN]]) +; AMDLIBM-NEXT: ret <4 x float> [[TMP1]] +; ; LIBMVEC-X86-LABEL: define {{[^@]+}}@exp_f32 ; LIBMVEC-X86-SAME: (<4 x float> [[IN:%.*]]) { ; LIBMVEC-X86-NEXT: [[TMP1:%.*]] = call <4 x float> @_ZGVbN4v_expf(<4 x float> [[IN]]) diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll new file mode 100644 index 00000000000000..54bb9352f3c89c --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls-finite.ll @@ -0,0 +1,332 @@ +; RUN: opt -vector-library=AMDLIBM -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s + +; Test to verify that when math headers are built with +; __FINITE_MATH_ONLY__ enabled, causing use of __<func>_finite +; function versions, vectorization can map these to vector versions. 
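+;
+; For illustration (comment added for this write-up, not in the original
+; test): under __FINITE_MATH_ONLY__, a scalar C call such as
+;
+;   float y = expf(x);
+;
+; is emitted as a call to @__expf_finite, and the TLI mappings added in
+; this patch let the loop vectorizer widen it to e.g. @amd_vrs4_expf.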
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare float @__expf_finite(float) #0 + +; CHECK-LABEL: @exp_f32 +; CHECK: <4 x float> @amd_vrs4_expf +; CHECK: ret +define void @exp_f32(ptr nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call fast float @__expf_finite(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %indvars.iv + store float %call, ptr %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1 + +for.end: ; preds = %for.body + ret void +} + +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 4} +!3 = !{!"llvm.loop.vectorize.enable", i1 true} + + +declare double @__exp_finite(double) #0 + +; CHECK-LABEL: @exp_f64 +; CHECK: <4 x double> @amd_vrd4_exp +; CHECK: ret +define void @exp_f64(ptr nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call fast double @__exp_finite(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %indvars.iv + store double %call, ptr %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !11 + +for.end: ; preds = %for.body + ret void +} + +!11 = distinct !{!11, !12, !13} +!12 = !{!"llvm.loop.vectorize.width", i32 4} +!13 = !{!"llvm.loop.vectorize.enable", i1 true} + + + + +declare float @__logf_finite(float) #0 + +; CHECK-LABEL: @log_f32 +; CHECK: <4 x float> @amd_vrs4_logf +; CHECK: ret +define void @log_f32(ptr nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call fast float @__logf_finite(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %indvars.iv + store float %call, ptr %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21 + +for.end: ; preds = %for.body + ret void +} + +!21 = distinct !{!21, !22, !23} +!22 = !{!"llvm.loop.vectorize.width", i32 4} +!23 = !{!"llvm.loop.vectorize.enable", i1 true} + + +declare double @__log_finite(double) #0 + +; CHECK-LABEL: @log_f64 +; CHECK: <4 x double> @amd_vrd4_log +; CHECK: ret +define void @log_f64(ptr nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call fast double @__log_finite(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %indvars.iv + store double %call, ptr %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31 + 
+for.end: ; preds = %for.body + ret void +} + +!31 = distinct !{!31, !32, !33} +!32 = !{!"llvm.loop.vectorize.width", i32 4} +!33 = !{!"llvm.loop.vectorize.enable", i1 true} + + +declare float @__powf_finite(float, float) #0 + +; CHECK-LABEL: @pow_f32 +; CHECK: <4 x float> @amd_vrs4_powf +; CHECK: ret +define void @pow_f32(ptr nocapture %varray, ptr nocapture readonly %exp) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to float + %arrayidx = getelementptr inbounds float, ptr %exp, i64 %indvars.iv + %tmp1 = load float, ptr %arrayidx, align 4 + %tmp2 = tail call fast float @__powf_finite(float %conv, float %tmp1) + %arrayidx2 = getelementptr inbounds float, ptr %varray, i64 %indvars.iv + store float %tmp2, ptr %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !41 + +for.end: ; preds = %for.body + ret void +} + +!41 = distinct !{!41, !42, !43} +!42 = !{!"llvm.loop.vectorize.width", i32 4} +!43 = !{!"llvm.loop.vectorize.enable", i1 true} + + +declare double @__pow_finite(double, double) #0 + +; CHECK-LABEL: @pow_f64 +; CHECK: <4 x double> @amd_vrd4_pow +; CHECK: ret +define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to double + %arrayidx = getelementptr inbounds double, ptr %exp, i64 %indvars.iv + %tmp1 = load double, ptr %arrayidx, align 4 + %tmp2 = tail call fast double @__pow_finite(double %conv, double %tmp1) + %arrayidx2 = getelementptr inbounds double, ptr %varray, i64 %indvars.iv + store double %tmp2, ptr %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !51 + +for.end: ; preds = %for.body + ret void +} + +!51 = distinct !{!51, !52, !53} +!52 = !{!"llvm.loop.vectorize.width", i32 4} +!53 = !{!"llvm.loop.vectorize.enable", i1 true} + +declare float @__exp2f_finite(float) #0 + +define void @exp2f_finite(ptr nocapture %varray) { +; CHECK-LABEL: @exp2f_finite( +; CHECK: call <4 x float> @amd_vrs4_exp2f(<4 x float> %{{.*}}) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @__exp2f_finite(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !61 + +for.end: + ret void +} + +!61 = distinct !{!61, !62, !63} +!62 = !{!"llvm.loop.vectorize.width", i32 4} +!63 = !{!"llvm.loop.vectorize.enable", i1 true} + +declare double @__exp2_finite(double) #0 + +define void @exp2_finite(ptr nocapture %varray) { +; CHECK-LABEL: @exp2_finite( +; CHECK: call <4 x double> @amd_vrd4_exp2(<4 x double> {{.*}}) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail 
call double @__exp2_finite(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !71 + +for.end: + ret void +} + +!71 = distinct !{!71, !72, !73} +!72 = !{!"llvm.loop.vectorize.width", i32 4} +!73 = !{!"llvm.loop.vectorize.enable", i1 true} + +declare float @__log2f_finite(float) #0 + +; CHECK-LABEL: @log2_f32 +; CHECK: <4 x float> @amd_vrs4_log2f +; CHECK: ret +define void @log2_f32(ptr nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call fast float @__log2f_finite(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %indvars.iv + store float %call, ptr %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21 + +for.end: ; preds = %for.body + ret void +} + +!81 = distinct !{!21, !22, !23} +!82 = !{!"llvm.loop.vectorize.width", i32 4} +!83 = !{!"llvm.loop.vectorize.enable", i1 true} + + +declare double @__log2_finite(double) #0 + +; CHECK-LABEL: @log2_f64 +; CHECK: <4 x double> @amd_vrd4_log2 +; CHECK: ret +define void @log2_f64(ptr nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call fast double @__log2_finite(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %indvars.iv + store double %call, ptr %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31 + +for.end: ; preds = %for.body + ret void +} + +!91 = distinct !{!31, !32, !33} +!92 = !{!"llvm.loop.vectorize.width", i32 4} +!93 = !{!"llvm.loop.vectorize.enable", i1 true} + +declare float @__log10f_finite(float) #0 + +; CHECK-LABEL: @log10_f32 +; CHECK: <4 x float> @amd_vrs4_log10f +; CHECK: ret +define void @log10_f32(ptr nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call fast float @__log10f_finite(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %indvars.iv + store float %call, ptr %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21 + +for.end: ; preds = %for.body + ret void +} + +!101 = distinct !{!21, !22, !23} +!102 = !{!"llvm.loop.vectorize.width", i32 4} +!103 = !{!"llvm.loop.vectorize.enable", i1 true} + + diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll new file mode 100644 index 00000000000000..8d2820a245d952 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll @@ -0,0 +1,869 @@ +; RUN: opt -vector-library=AMDLIBM -passes=inject-tli-mappings,loop-vectorize 
-force-vector-width=4 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s +; RUN: opt -vector-library=AMDLIBM -passes=inject-tli-mappings,loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -mattr=+avx512f -S < %s | FileCheck %s --check-prefix=CHECK-AVX512-VF8 +; RUN: opt -vector-library=AMDLIBM -passes=inject-tli-mappings,loop-vectorize -force-vector-width=16 -force-vector-interleave=1 -mattr=+avx512f -S < %s | FileCheck %s --check-prefix=CHECK-AVX512-VF16 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare double @sin(double) #0 +declare float @sinf(float) #0 +declare double @llvm.sin.f64(double) #0 +declare float @llvm.sin.f32(float) #0 + +declare double @cos(double) #0 +declare float @cosf(float) #0 +declare double @llvm.cos.f64(double) #0 +declare float @llvm.cos.f32(float) #0 + +declare double @pow(double, double) #0 +declare float @powf(float, float) #0 +declare double @llvm.pow.f64(double, double) #0 +declare float @llvm.pow.f32(float, float) #0 + +declare double @exp(double) #0 +declare float @expf(float) #0 +declare double @llvm.exp.f64(double) #0 +declare float @llvm.exp.f32(float) #0 + +declare double @log(double) #0 +declare float @logf(float) #0 +declare double @llvm.log.f64(double) #0 +declare float @llvm.log.f32(float) #0 + +declare double @log2(double) #0 +declare float @log2f(float) #0 +declare double @llvm.log2.f64(double) #0 +declare float @llvm.log2.f32(float) #0 + +declare double @log10(double) #0 +declare float @log10f(float) #0 +declare double @llvm.log10.f64(double) #0 +declare float @llvm.log10.f32(float) #0 + +declare double @sqrt(double) #0 +declare float @sqrtf(float) #0 + +declare double @exp2(double) #0 +declare float @exp2f(float) #0 +declare double @llvm.exp2.f64(double) #0 +declare float @llvm.exp2.f32(float) #0 + +define void @sin_f64(ptr nocapture %varray) { +; CHECK-LABEL: @sin_f64( +; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_sin(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF8-LABEL: @sin_f64( +; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_sin(<8 x double> [[TMP4:%.*]]) +; CHECK-AVX512-VF8: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @sin(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sin_f32(ptr nocapture %varray) { +; CHECK-LABEL: @sin_f32( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_sinf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF16-LABEL: @sin_f32( +; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_sinf(<16 x float> [[TMP4:%.*]]) +; CHECK-AVX512-VF16: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @sinf(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sin_f64_intrinsic(ptr nocapture 
%varray) { +; CHECK-LABEL: @sin_f64_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_sin(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF8-LABEL: @sin_f64_intrinsic( +; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_sin(<8 x double> [[TMP4:%.*]]) +; CHECK-AVX512-VF8: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.sin.f64(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sin_f32_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @sin_f32_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_sinf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF16-LABEL: @sin_f32_intrinsic( +; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_sinf(<16 x float> [[TMP4:%.*]]) +; CHECK-AVX512-VF16: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.sin.f32(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f64(ptr nocapture %varray) { +; CHECK-LABEL: @cos_f64( +; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_cos(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF8-LABEL: @cos_f64( +; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_cos(<8 x double> [[TMP4:%.*]]) +; CHECK-AVX512-VF8: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @cos(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f32(ptr nocapture %varray) { +; CHECK-LABEL: @cos_f32( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_cosf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF16-LABEL: @cos_f32( +; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_cosf(<16 x float> [[TMP4:%.*]]) +; CHECK-AVX512-VF16: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @cosf(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f64_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @cos_f64_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_cos(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF8-LABEL: 
@cos_f64_intrinsic( +; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_cos(<8 x double> [[TMP4:%.*]]) +; CHECK-AVX512-VF8: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.cos.f64(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f32_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @cos_f32_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_cosf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF16-LABEL: @cos_f32_intrinsic( +; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_cosf(<16 x float> [[TMP4:%.*]]) +; CHECK-AVX512-VF16: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.cos.f32(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) { +; CHECK-LABEL: @pow_f64( +; CHECK: [[TMP8:%.*]] = call <4 x double> @amd_vrd4_pow(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF8-LABEL: @pow_f64( +; CHECK-AVX512-VF8: [[TMP8:%.*]] = call <8 x double> @amd_vrd8_pow(<8 x double> [[TMP4:%.*]], <8 x double> [[WIDE_LOAD:%.*]]) +; CHECK-AVX512-VF8: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %arrayidx = getelementptr inbounds double, ptr %exp, i64 %iv + %tmp1 = load double, ptr %arrayidx, align 4 + %tmp2 = tail call double @pow(double %conv, double %tmp1) + %arrayidx2 = getelementptr inbounds double, ptr %varray, i64 %iv + store double %tmp2, ptr %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f64_intrinsic(ptr nocapture %varray, ptr nocapture readonly %exp) { +; CHECK-LABEL: @pow_f64_intrinsic( +; CHECK: [[TMP8:%.*]] = call <4 x double> @amd_vrd4_pow(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF8-LABEL: @pow_f64_intrinsic( +; CHECK-AVX512-VF8: [[TMP8:%.*]] = call <8 x double> @amd_vrd8_pow(<8 x double> [[TMP4:%.*]], <8 x double> [[WIDE_LOAD:%.*]]) +; CHECK-AVX512-VF8: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %arrayidx = getelementptr inbounds double, ptr %exp, i64 %iv + %tmp1 = load double, ptr %arrayidx, align 4 + %tmp2 = tail call double @llvm.pow.f64(double %conv, double %tmp1) + %arrayidx2 = getelementptr inbounds double, ptr %varray, i64 %iv + store double %tmp2, ptr %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label 
%for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f32(ptr nocapture %varray, ptr nocapture readonly %exp) { +; CHECK-LABEL: @pow_f32( +; CHECK: [[TMP8:%.*]] = call <4 x float> @amd_vrs4_powf(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF16-LABEL: @pow_f32( +; CHECK-AVX512-VF16: [[TMP8:%.*]] = call <16 x float> @amd_vrs16_powf(<16 x float> [[TMP4:%.*]], <16 x float> [[WIDE_LOAD:%.*]]) +; CHECK-AVX512-VF16: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %arrayidx = getelementptr inbounds float, ptr %exp, i64 %iv + %tmp1 = load float, ptr %arrayidx, align 4 + %tmp2 = tail call float @powf(float %conv, float %tmp1) + %arrayidx2 = getelementptr inbounds float, ptr %varray, i64 %iv + store float %tmp2, ptr %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f32_intrinsic(ptr nocapture %varray, ptr nocapture readonly %exp) { +; CHECK-LABEL: @pow_f32_intrinsic( +; CHECK: [[TMP8:%.*]] = call <4 x float> @amd_vrs4_powf(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF16-LABEL: @pow_f32_intrinsic( +; CHECK-AVX512-VF16: [[TMP8:%.*]] = call <16 x float> @amd_vrs16_powf(<16 x float> [[TMP4:%.*]], <16 x float> [[WIDE_LOAD:%.*]]) +; CHECK-AVX512-VF16: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %arrayidx = getelementptr inbounds float, ptr %exp, i64 %iv + %tmp1 = load float, ptr %arrayidx, align 4 + %tmp2 = tail call float @llvm.pow.f32(float %conv, float %tmp1) + %arrayidx2 = getelementptr inbounds float, ptr %varray, i64 %iv + store float %tmp2, ptr %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f64(ptr nocapture %varray) { +; CHECK-LABEL: @exp_f64( +; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_exp(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF8-LABEL: @exp_f64( +; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_exp(<8 x double> [[TMP4:%.*]]) +; CHECK-AVX512-VF8: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @exp(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f32(ptr nocapture %varray) { +; CHECK-LABEL: @exp_f32( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_expf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF16-LABEL: @exp_f32( +; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_expf(<16 x float> [[TMP4:%.*]]) +; CHECK-AVX512-VF16: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @expf(float %conv) + %arrayidx = 
getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f64_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @exp_f64_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_exp(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF8-LABEL: @exp_f64_intrinsic( +; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_exp(<8 x double> [[TMP4:%.*]]) +; CHECK-AVX512-VF8: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.exp.f64(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f32_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @exp_f32_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_expf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF16-LABEL: @exp_f32_intrinsic( +; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_expf(<16 x float> [[TMP4:%.*]]) +; CHECK-AVX512-VF16: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.exp.f32(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f64(ptr nocapture %varray) { +; CHECK-LABEL: @log_f64( +; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_log(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF8-LABEL: @log_f64( +; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_log(<8 x double> [[TMP4:%.*]]) +; CHECK-AVX512-VF8: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @log(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f32(ptr nocapture %varray) { +; CHECK-LABEL: @log_f32( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_logf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF16-LABEL: @log_f32( +; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_logf(<16 x float> [[TMP4:%.*]]) +; CHECK-AVX512-VF16: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @logf(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label 
%for.end, label %for.body + +for.end: + ret void +} + +define void @log_f64_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @log_f64_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_log(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF8-LABEL: @log_f64_intrinsic( +; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_log(<8 x double> [[TMP4:%.*]]) +; CHECK-AVX512-VF8: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.log.f64(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f32_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @log_f32_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_logf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF16-LABEL: @log_f32_intrinsic( +; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_logf(<16 x float> [[TMP4:%.*]]) +; CHECK-AVX512-VF16: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.log.f32(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log2_f64(ptr nocapture %varray) { +; CHECK-LABEL: @log2_f64( +; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_log2(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF8-LABEL: @log2_f64( +; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_log2(<8 x double> [[TMP4:%.*]]) +; CHECK-AVX512-VF8: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @log2(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log2_f32(ptr nocapture %varray) { +; CHECK-LABEL: @log2_f32( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_log2f(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF16-LABEL: @log2_f32( +; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_log2f(<16 x float> [[TMP4:%.*]]) +; CHECK-AVX512-VF16: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @log2f(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log2_f64_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @log2_f64_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x 
double> @amd_vrd4_log2(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+; CHECK-AVX512-VF8-LABEL: @log2_f64_intrinsic(
+; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_log2(<8 x double> [[TMP4:%.*]])
+; CHECK-AVX512-VF8: ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @llvm.log2.f64(double %conv)
+  %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+  store double %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @log2_f32_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @log2_f32_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_log2f(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+; CHECK-AVX512-VF16-LABEL: @log2_f32_intrinsic(
+; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_log2f(<16 x float> [[TMP4:%.*]])
+; CHECK-AVX512-VF16: ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @llvm.log2.f32(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+  store float %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @log10_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @log10_f32(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_log10f(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+; CHECK-AVX512-VF16-LABEL: @log10_f32(
+; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_log10f(<16 x float> [[TMP4:%.*]])
+; CHECK-AVX512-VF16: ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @log10f(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+  store float %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @log10_f32_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @log10_f32_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_log10f(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+; CHECK-AVX512-VF16-LABEL: @log10_f32_intrinsic(
+; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_log10f(<16 x float> [[TMP4:%.*]])
+; CHECK-AVX512-VF16: ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @llvm.log10.f32(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+  store float %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @exp2_f64(ptr nocapture %varray) {
+; CHECK-LABEL: @exp2_f64(
+; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_exp2(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+; CHECK-AVX512-VF8-LABEL: @exp2_f64(
+; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_exp2(<8 x double> [[TMP4:%.*]])
+; CHECK-AVX512-VF8: ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @exp2(double %conv)
+  %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+  store double %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @exp2_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @exp2_f32(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_exp2f(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+; CHECK-AVX512-VF16-LABEL: @exp2_f32(
+; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_exp2f(<16 x float> [[TMP4:%.*]])
+; CHECK-AVX512-VF16: ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @exp2f(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+  store float %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @exp2_f64_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @exp2_f64_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_exp2(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+; CHECK-AVX512-VF8-LABEL: @exp2_f64_intrinsic(
+; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_exp2(<8 x double> [[TMP4:%.*]])
+; CHECK-AVX512-VF8: ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to double
+  %call = tail call double @llvm.exp2.f64(double %conv)
+  %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+  store double %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @exp2_f32_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @exp2_f32_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_exp2f(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+; CHECK-AVX512-VF16-LABEL: @exp2_f32_intrinsic(
+; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_exp2f(<16 x float> [[TMP4:%.*]])
+; CHECK-AVX512-VF16: ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @llvm.exp2.f32(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+  store float %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
index c02b031c39839b..9810d50beea736
--- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skylake-avx512 -passes=slp-vectorizer -S | FileCheck %s
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skylake-avx512 -passes=inject-tli-mappings,slp-vectorizer -vector-library=SVML -S | FileCheck %s --check-prefix=VECLIB
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skylake-avx512 -passes=inject-tli-mappings,slp-vectorizer -vector-library=AMDLIBM -S | FileCheck %s --check-prefix=AMDLIBM

 @src = common global [8 x double] zeroinitializer, align 64
 @dst = common global [8 x double] zeroinitializer, align 64
@@ -63,7 +64,33 @@ define void @test() {
 ; VECLIB-NEXT: store <2 x double> [[TMP15]], ptr @dst, align 8
 ; VECLIB-NEXT: ret void
 ;
-
+; AMDLIBM-LABEL: @test(
+; AMDLIBM-NEXT: [[A0:%.*]] = load double, ptr @src, align 8
+; AMDLIBM-NEXT: [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8
+; AMDLIBM-NEXT: [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8
+; AMDLIBM-NEXT: [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8
+; AMDLIBM-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8
+; AMDLIBM-NEXT: [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8
+; AMDLIBM-NEXT: [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8
+; AMDLIBM-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
+; AMDLIBM-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
+; AMDLIBM-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1
+; AMDLIBM-NEXT: [[TMP3:%.*]] = call fast <2 x double> @amd_vrd2_sin(<2 x double> [[TMP2]])
+; AMDLIBM-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0
+; AMDLIBM-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1
+; AMDLIBM-NEXT: [[TMP6:%.*]] = call fast <2 x double> @amd_vrd2_sin(<2 x double> [[TMP5]])
+; AMDLIBM-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
+; AMDLIBM-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A4]], i32 1
+; AMDLIBM-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]])
+; AMDLIBM-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0
+; AMDLIBM-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1
+; AMDLIBM-NEXT: [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]])
+; AMDLIBM-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
+; AMDLIBM-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP12]]
+; AMDLIBM-NEXT: [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]]
+; AMDLIBM-NEXT: store <2 x double> [[TMP15]], ptr @dst, align 8
+; AMDLIBM-NEXT: ret void
+;
   %a0 = load double, ptr @src, align 8
   %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8
   %a2 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8
diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll
index 7b12de90319012..d86e44f199b391
--- a/llvm/test/Transforms/Util/add-TLI-mappings.ll
+++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=SVML -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,SVML
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=AMDLIBM -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,AMDLIBM
 ; RUN: opt -mtriple=powerpc64-unknown-linux-gnu -vector-library=MASSV -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,MASSV
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=LIBMVEC-X86 -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,LIBMVEC-X86
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=Accelerate -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,ACCELERATE
@@ -13,6 +14,13 @@
 ; SVML-SAME: ptr @__svml_log10f4,
 ; SVML-SAME: ptr @__svml_log10f8,
 ; SVML-SAME: ptr @__svml_log10f16
+; AMDLIBM-SAME: [6 x ptr] [
+; AMDLIBM-SAME: ptr @amd_vrd2_sin,
+; AMDLIBM-SAME: ptr @amd_vrd4_sin,
+; AMDLIBM-SAME: ptr @amd_vrd8_sin,
+; AMDLIBM-SAME: ptr @amd_vrs4_log10f,
+; AMDLIBM-SAME: ptr @amd_vrs8_log10f,
+; AMDLIBM-SAME: ptr @amd_vrs16_log10f
 ; MASSV-SAME: [2 x ptr] [
 ; MASSV-SAME: ptr @__sind2,
 ; MASSV-SAME: ptr @__log10f4
@@ -74,6 +82,7 @@ declare float @modff(float, ptr) #0
 define double @sin_f64(double %in) {
 ; COMMON-LABEL: @sin_f64(
 ; SVML: call double @sin(double %{{.*}}) #[[SIN:[0-9]+]]
+; AMDLIBM: call double @sin(double %{{.*}}) #[[SIN:[0-9]+]]
 ; MASSV: call double @sin(double %{{.*}}) #[[SIN:[0-9]+]]
 ; ACCELERATE: call double @sin(double %{{.*}})
 ; LIBMVEC-X86: call double @sin(double %{{.*}}) #[[SIN:[0-9]+]]
@@ -130,6 +139,7 @@ declare void @sincospif(float, ptr, ptr) #0
 define float @call_llvm.log10.f32(float %in) {
 ; COMMON-LABEL: @call_llvm.log10.f32(
 ; SVML: call float @llvm.log10.f32(float %{{.*}})
+; AMDLIBM: call float @llvm.log10.f32(float %{{.*}})
 ; LIBMVEC-X86: call float @llvm.log10.f32(float %{{.*}})
 ; MASSV: call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]]
 ; ACCELERATE: call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]]
@@ -137,6 +147,7 @@ define float @call_llvm.log10.f32(float %in) {
 ; ARMPL: call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]]
 ; No mapping of "llvm.log10.f32" to a vector function for SVML.
 ; SVML-NOT: _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}})
+; AMDLIBM-NOT: _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}})
 ; LIBMVEC-X86-NOT: _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}})
   %call = tail call float @llvm.log10.f32(float %in)
   ret float %call
@@ -151,6 +162,13 @@ declare float @llvm.log10.f32(float) #0
 ; SVML: declare <8 x float> @__svml_log10f8(<8 x float>)
 ; SVML: declare <16 x float> @__svml_log10f16(<16 x float>)

+; AMDLIBM: declare <2 x double> @amd_vrd2_sin(<2 x double>)
+; AMDLIBM: declare <4 x double> @amd_vrd4_sin(<4 x double>)
+; AMDLIBM: declare <8 x double> @amd_vrd8_sin(<8 x double>)
+; AMDLIBM: declare <4 x float> @amd_vrs4_log10f(<4 x float>)
+; AMDLIBM: declare <8 x float> @amd_vrs8_log10f(<8 x float>)
+; AMDLIBM: declare <16 x float> @amd_vrs16_log10f(<16 x float>)
+
 ; MASSV: declare <2 x double> @__sind2(<2 x double>)
 ; MASSV: declare <4 x float> @__log10f4(<4 x float>)
@@ -194,6 +212,11 @@ attributes #0 = { nounwind readnone }
 ; SVML-SAME: _ZGV_LLVM_N4v_sin(__svml_sin4),
 ; SVML-SAME: _ZGV_LLVM_N8v_sin(__svml_sin8)" }
+
+; AMDLIBM: attributes #[[SIN]] = { "vector-function-abi-variant"=
+; AMDLIBM-SAME: "_ZGV_LLVM_N2v_sin(amd_vrd2_sin),
+; AMDLIBM-SAME: _ZGV_LLVM_N4v_sin(amd_vrd4_sin),
+; AMDLIBM-SAME: _ZGV_LLVM_N8v_sin(amd_vrd8_sin)" }
+
 ; MASSV: attributes #[[SIN]] = { "vector-function-abi-variant"=
 ; MASSV-SAME: "_ZGV_LLVM_N2v_sin(__sind2)" }
 ; MASSV: attributes #[[LOG10]] = { "vector-function-abi-variant"=
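
For reference, the new AMDLIBM TLI mappings can be exercised by hand with the same flags the RUN lines above use. A minimal sketch follows; the file name "demo.ll" and the function body are illustrative and not part of the patch, but the declarations mirror sin_f64 from add-TLI-mappings.ll:

; demo.ll -- illustrative input, not part of the patch
define double @sin_f64(double %in) {
  %call = tail call double @sin(double %in)
  ret double %call
}
declare double @sin(double) #0
attributes #0 = { nounwind readnone }

Running "opt -mtriple=x86_64-unknown-linux-gnu -vector-library=AMDLIBM -passes=inject-tli-mappings -S < demo.ll" should annotate the call to @sin with a "vector-function-abi-variant" attribute naming amd_vrd2_sin, amd_vrd4_sin, and amd_vrd8_sin, matching the AMDLIBM checks above.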