[clang] [clang][AArch64] Add getHostCPUFeatures to query for enabled features in cpu info (PR #97749)
davemgreen wrote: It is that bit of code, yeah. I don't know of a way to reproduce this without logging into different machines with different sets of options and trying it. If we had a way to test/mock that various /proc/cpuinfo files gave us the correct results, that would be helpful in giving us more confidence it was working as intended. https://github.com/llvm/llvm-project/pull/97749 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Implement NEON vamin/vamax intrinsics (PR #99041)
https://github.com/davemgreen commented: Did you consider emitting `llvm.fmin(llvm.fabs(x), llvm.fabs(y))`? https://github.com/llvm/llvm-project/pull/99041 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [AArch64] Add getHostCPUFeatures to query for enabled features in cpu… (PR #97749)
davemgreen wrote: Hi this sounds like a good idea to me. Note that the implementation of getHostCPUFeatures isn't amazing for AArch64 at the moment, there was an attempt to fix it up in #95694 (but that has gone a bit quiet). One point we noticed is that it could end up turning "aes+sha2" into "crypto" and "crypto" back into "sha2+aes+sha3+sm4", as it uses the old meaning of "crypto" https://github.com/llvm/llvm-project/pull/97749 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64][NEON] Add intrinsics for LUTI (PR #96883)
https://github.com/davemgreen commented: Thanks this looks great. I've not checked the C / ACLE intrinsics though - I will defer to @CarolineConcatto and @momchil-velikov for those parts if that is OK. https://github.com/llvm/llvm-project/pull/96883 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64][NEON] Add intrinsics for LUTI (PR #96883)
@@ -6420,6 +6420,76 @@ def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd), let Predicates = [HasLUT] in { defm LUT2 : BaseSIMDTableLookupIndexed2<"luti2">; defm LUT4 : BaseSIMDTableLookupIndexed4<"luti4">; + + def : Pat<(v16i8 (int_aarch64_neon_vluti2_lane (v8i8 V64:$Rn), + (v8i8 V64:$Rm), (i32 VectorIndexS32b_timm:$idx))), + (LUT2_B (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexS32b_timm:$idx)>; + def : Pat<(v16i8 (int_aarch64_neon_vluti2_lane (v8i8 V64:$Rn), + (v16i8 V128:$Rm), (i32 VectorIndexS32b_timm:$idx))), + (LUT2_B (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), V128:$Rm, VectorIndexS32b_timm:$idx)>; + def : Pat<(v16i8 (int_aarch64_neon_vluti2_lane (v16i8 V128:$Rn), + (v8i8 V64:$Rm), (i32 VectorIndexS32b_timm:$idx))), + (LUT2_B V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexS32b_timm:$idx)>; + def : Pat<(v16i8 (int_aarch64_neon_vluti2_lane (v16i8 V128:$Rn), + (v16i8 V128:$Rm), (i32 VectorIndexS32b_timm:$idx))), + (LUT2_B V128:$Rn, V128:$Rm, VectorIndexS32b_timm:$idx)>; + def : Pat<(v8i16 (int_aarch64_neon_vluti2_lane (v4i16 V64:$Rn), +(v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))), +(LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexH32b_timm:$idx)>; + def : Pat<(v8f16 (int_aarch64_neon_vluti2_lane (v4f16 V64:$Rn), +(v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))), +(LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexH32b_timm:$idx)>; + def : Pat<(v8i16 (int_aarch64_neon_vluti2_lane (v4i16 V64:$Rn), +(v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))), +(LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), V128:$Rm, VectorIndexH32b_timm:$idx)>; + def : Pat<(v8f16 (int_aarch64_neon_vluti2_lane (v4f16 V64:$Rn), +(v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))), +(LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), V128:$Rm, 
VectorIndexH32b_timm:$idx)>; + def : Pat<(v8i16 (int_aarch64_neon_vluti2_lane (v8i16 V128:$Rn), +(v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))), +(LUT2_H V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexH32b_timm:$idx)>; + def : Pat<(v8f16 (int_aarch64_neon_vluti2_lane (v8f16 V128:$Rn), +(v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))), +(LUT2_H V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexH32b_timm:$idx)>; + def : Pat<(v8i16 (int_aarch64_neon_vluti2_lane (v8i16 VecListOne8h:$Rn), +(v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))), +(LUT2_H VecListOne8h:$Rn, V128:$Rm, VectorIndexH32b_timm:$idx)>; + def : Pat<(v8f16 (int_aarch64_neon_vluti2_lane (v8f16 VecListOne8h:$Rn), +(v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))), +(LUT2_H VecListOne8h:$Rn, V128:$Rm, VectorIndexH32b_timm:$idx)>; + + def : Pat<(v16i8 (int_aarch64_neon_vluti4q_laneq (v16i8 VecListOne16b:$Rn), +(v16i8 V128:$Rm), (i32 VectorIndexD32b_timm:$idx))), +(LUT4_B VecListOne16b:$Rn, V128:$Rm, VectorIndexD32b_timm:$idx)>; + + def : Pat<(v8i16 (int_aarch64_neon_vluti4q_laneq_x2 (v8i16 VecListOne8h:$Rn1), +(v8i16 VecListOne8h:$Rn2), (v16i8 V128:$Rm), +(i32 VectorIndexS32b_timm:$idx))), +(LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1), V128:$Rm, VectorIndexS32b_timm:$idx)>; + def : Pat<(v8f16 (int_aarch64_neon_vluti4q_laneq_x2 (v8f16 VecListOne8h:$Rn1), +(v8f16 VecListOne8h:$Rn2), (v16i8 V128:$Rm), +(i32 VectorIndexS32b_timm:$idx))), +(LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1), V128:$Rm, VectorIndexS32b_timm:$idx)>; +} + +let Predicates = [HasLUT, HasBF16] in { davemgreen wrote: I think you can make this HasLUT only without needing HasBF16, like the fp16 versions above. Unless that doesn't work? It should only really be dependent on the size of the register (and HasLUT, obviously). 
You might be able to make a multiclass too for the Pats with a parameter for the type, if they could share a lot of the same code. https://github.com/llvm/llvm-project/pull/96883 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64][NEON] Add intrinsics for LUTI (PR #96883)
@@ -2096,3 +2096,19 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "r def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl">; def VSTL1_LANE : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">; } + +//Lookup table read with 2-bit/4-bit indices davemgreen wrote: // Lookup https://github.com/llvm/llvm-project/pull/96883 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [Clang] Bring initFeatureMap back to AArch64TargetInfo. (PR #96832)
davemgreen wrote: Could you explain more about what broke? Are you using target(..) attributes? https://github.com/llvm/llvm-project/pull/96832 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] b635d69 - [NFC] Fix laod -> load typos. NFC
Author: David Green Date: 2024-06-21T09:26:44+01:00 New Revision: b635d690ed1e3fbebab9dee1b157fa380d3e9eba URL: https://github.com/llvm/llvm-project/commit/b635d690ed1e3fbebab9dee1b157fa380d3e9eba DIFF: https://github.com/llvm/llvm-project/commit/b635d690ed1e3fbebab9dee1b157fa380d3e9eba.diff LOG: [NFC] Fix laod -> load typos. NFC Added: Modified: clang/lib/Sema/SemaDeclCXX.cpp clang/test/SemaHLSL/ScalarOverloadResolution.hlsl clang/test/SemaHLSL/VectorElementOverloadResolution.hlsl llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll llvm/test/CodeGen/X86/load-partial-dot-product.ll llvm/test/tools/yaml2obj/COFF/load-config.yaml Removed: diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index d38700d56e4ff..c1189e6935dc9 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -16111,7 +16111,7 @@ ExprResult Sema::BuildCXXConstructExpr( CXXConstructionKind ConstructKind, SourceRange ParenRange) { if (auto *Shadow = dyn_cast(FoundDecl)) { Constructor = findInheritingConstructor(ConstructLoc, Constructor, Shadow); -// The only way to get here is if we did overlaod resolution to find the +// The only way to get here is if we did overload resolution to find the // shadow decl, so we don't need to worry about re-checking the trailing // requires clause. if (DiagnoseUseOfOverloadedDecl(Constructor, ConstructLoc)) diff --git a/clang/test/SemaHLSL/ScalarOverloadResolution.hlsl b/clang/test/SemaHLSL/ScalarOverloadResolution.hlsl index 41702ef175320..d1a47af228e24 100644 --- a/clang/test/SemaHLSL/ScalarOverloadResolution.hlsl +++ b/clang/test/SemaHLSL/ScalarOverloadResolution.hlsl @@ -72,7 +72,7 @@ void Case1(half H, float F, double D) { HalfFloatDouble(D); } -// Case 2: A function declared with double and float overlaods. +// Case 2: A function declared with double and float overloads. 
// (a) When called with half, it will resolve to float because float is lower // ranked than double. // (b) When called with float it will resolve to float because float is an diff --git a/clang/test/SemaHLSL/VectorElementOverloadResolution.hlsl b/clang/test/SemaHLSL/VectorElementOverloadResolution.hlsl index 12575084ead2b..bbf8d3b5e102c 100644 --- a/clang/test/SemaHLSL/VectorElementOverloadResolution.hlsl +++ b/clang/test/SemaHLSL/VectorElementOverloadResolution.hlsl @@ -71,7 +71,7 @@ void Case1(half2 H, float2 F, double2 D) { HalfFloatDouble(D); } -// Case 2: A function declared with double and float overlaods. +// Case 2: A function declared with double and float overloads. // (a) When called with half, it will resolve to float because float is lower // ranked than double. // (b) When called with float it will resolve to float because float is an diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index 46d44704af5a7..cfe9f33efc91b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -100,7 +100,7 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner { bool matchRemoveFcanonicalize(MachineInstr , Register ) const; // Combine unsigned buffer load and signed extension instructions to generate - // signed buffer laod instructions. + // signed buffer load instructions. 
bool matchCombineSignExtendInReg( MachineInstr , std::pair ) const; void applyCombineSignExtendInReg( diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 471c7ca4d7356..4e515e05c842a 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -57,7 +57,7 @@ // // base = gep a, 0, x, y // load base -// laod base + 1 * sizeof(float) +// load base + 1 * sizeof(float) // load base + 32 * sizeof(float) // load base + 33 * sizeof(float) // diff --git a/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll b/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll index 03b41db2291a2..6a95859c7692d 100644 --- a/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll +++ b/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll @@ -22,7 +22,7 @@ define double @ld_double(ptr %p) speculative_load_hardening { entry: %0 = load double, ptr %p, align 8 ret double %0 -; Checking that the address laoded from is masked for a floating point load. +; Checking that the address loaded from is masked for a floating point load. ; CHECK-LABEL: ld_double ; CHECK: cmp sp, #0 ; CHECK-NEXT: csetm x16, ne @@ -43,7 +43,7 @@ entry: %iszero = icmp eq
[clang] [llvm] [AArch64] Add ability to list extensions enabled for a target (PR #95805)
@@ -19,3 +19,19 @@ // RUN: %clang --target=arm64 -mlittle-endian -march=armv8.1a -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-GENERICV81A %s // RUN: %clang --target=arm64 -mlittle-endian -march=armv8.1-a -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-GENERICV81A %s // ARM64-GENERICV81A: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "generic"{{.*}} "-target-feature" "+v8.1a"{{.*}} "-target-feature" "+neon" + +// = Architecture extensions = + +// RUN: %clang -target aarch64 -march=armv8.1-a --print-enabled-extensions 2>&1 | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_LOR +// ARCH-EXTENSION: FEAT_TRBE +// ARCH-EXTENSION: FEAT_VHE +// ARCH-EXTENSION: FEAT_PAN +// ARCH-EXTENSION: FEAT_CRC32 +// FIXME: FEAT_FP is optional from v8.0a +// ARCH-EXTENSION: FEAT_FP +// ARCH-EXTENSION: FEAT_LSE +// ARCH-EXTENSION: FEAT_RDM +// FIXME: FEAT_AdvSIMD is optional from v8.0a davemgreen wrote: There is a difference between what the Arm architecture technically describes as optional and what the platform enables by default (i.e. what we enable by default for the compiler in Armv8-a). We don't want everyone in the world to lose out on Neon vectorization and require soft-fp just because it is technically optional in the architecture. The same idea has been applied to SVE2 in Armv9 (although that one has become more debatable). Some of the other optional extensions seem like they might be system-register extensions, I imagine that might be why they are enabled. There might be a better way to handle that. So I'm not sure if there is anything that needs to be "fixed" here, and think I would recommend removing them (especially from fp/asimd, the others are more debatable and could stay if you think they are useful). https://github.com/llvm/llvm-project/pull/95805 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Add ability to list extensions enabled for a target (PR #95805)
@@ -140,89 +152,480 @@ def FeatureAES : Extension< // compatibility, and now imply features SHA2 and AES, which was the // "traditional" meaning of Crypto. let FMVDependencies = "+aes,+sha2" in -def FeatureCrypto : Extension<"crypto", "Crypto", +def FeatureCrypto : ExtensionWithMArch<"crypto", "Crypto", "FEAT_Crypto", "Enable cryptographic instructions", [FeatureNEON, FeatureSHA2, FeatureAES]>; -def FeatureCRC : Extension<"crc", "CRC", - "Enable ARMv8 CRC-32 checksum instructions (FEAT_CRC32)", [], +def FeatureCRC : ExtensionWithMArch<"crc", "CRC", "FEAT_CRC32", + "Enable ARMv8 CRC-32 checksum instructions", [], "FEAT_CRC", "+crc", 110>; -def FeatureRAS : Extension<"ras", "RAS", - "Enable ARMv8 Reliability, Availability and Serviceability Extensions (FEAT_RAS, FEAT_RASv1p1)">; - -def FeatureRASv2 : Extension<"rasv2", "RASv2", - "Enable ARMv8.9-A Reliability, Availability and Serviceability Extensions (FEAT_RASv2)", - [FeatureRAS]>; - -def FeatureLSE : Extension<"lse", "LSE", - "Enable ARMv8.1 Large System Extension (LSE) atomic instructions (FEAT_LSE)", [], - "FEAT_LSE", "+lse", 80>; +// This SubtargetFeature is special. It controls only whether codegen will turn +// `llvm.readcyclecounter()` into an access to a PMUv3 System Register. The +// `FEAT_PMUv3*` system registers are always available for assembly/disassembly. 
+let MArchName = "pmuv3" in +def FeaturePerfMon : ExtensionWithMArch<"perfmon", "PerfMon", "FEAT_PMUv3", + "Enable Code Generation for ARMv8 PMUv3 Performance Monitors extension">; -def FeatureLSE2 : SubtargetFeature<"lse2", "HasLSE2", "true", - "Enable ARMv8.4 Large System Extension 2 (LSE2) atomicity rules (FEAT_LSE2)">; +def FeatureSpecRestrict : Extension<"specrestrict", "SpecRestrict", "FEAT_CSV2_2", + "Enable architectural speculation restriction">; -def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true", - "Enable out of line atomics to support LSE instructions">; +//===--===// +// Armv8.1 Architecture Extensions +//===--===// -def FeatureFMV : SubtargetFeature<"fmv", "HasFMV", "true", - "Enable Function Multi Versioning support.">; +def FeatureLSE : ExtensionWithMArch<"lse", "LSE", "FEAT_LSE", + "Enable ARMv8.1 Large System Extension (LSE) atomic instructions", [], + "FEAT_LSE", "+lse", 80>; let MArchAlias = "rdma" in -def FeatureRDM : Extension<"rdm", "RDM", - "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions (FEAT_RDM)", +def FeatureRDM : ExtensionWithMArch<"rdm", "RDM", "FEAT_RDM", + "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions", [FeatureNEON], "FEAT_RDM", "+rdm,+fp-armv8,+neon", 108>; -def FeaturePAN : SubtargetFeature< -"pan", "HasPAN", "true", -"Enables ARM v8.1 Privileged Access-Never extension (FEAT_PAN)">; +def FeaturePAN : Extension<"pan", "PAN", "FEAT_PAN", + "Enables ARM v8.1 Privileged Access-Never extension">; -def FeatureLOR : SubtargetFeature< -"lor", "HasLOR", "true", -"Enables ARM v8.1 Limited Ordering Regions extension (FEAT_LOR)">; +def FeatureLOR : Extension<"lor", "LOR", "FEAT_LOR", + "Enables ARM v8.1 Limited Ordering Regions extension">; def FeatureCONTEXTIDREL2 : SubtargetFeature<"CONTEXTIDREL2", "HasCONTEXTIDREL2", "true", "Enable RW operand CONTEXTIDR_EL2" >; -def FeatureVH : SubtargetFeature<"vh", "HasVH", "true", -"Enables ARM v8.1 Virtual Host extension 
(FEAT_VHE)", [FeatureCONTEXTIDREL2] >; +def FeatureVH : Extension<"vh", "VH", "FEAT_VHE", + "Enables ARM v8.1 Virtual Host extension", [FeatureCONTEXTIDREL2] >; -// This SubtargetFeature is special. It controls only whether codegen will turn -// `llvm.readcyclecounter()` into an access to a PMUv3 System Register. The -// `FEAT_PMUv3*` system registers are always available for assembly/disassembly. -let MArchName = "pmuv3" in -def FeaturePerfMon : Extension<"perfmon", "PerfMon", - "Enable Code Generation for ARMv8 PMUv3 Performance Monitors extension (FEAT_PMUv3)">; +//===--===// +// Armv8.2 Architecture Extensions +//===--===// + +def FeatureSM4 : ExtensionWithMArch<"sm4", "SM4", "FEAT_SM4, FEAT_SM3", + "Enable SM3 and SM4 support", [FeatureNEON], + "FEAT_SM4", "+sm4,+fp-armv8,+neon", 106>; + +def FeatureSHA3 : ExtensionWithMArch<"sha3", "SHA3", "FEAT_SHA3, FEAT_SHA512", + "Enable SHA512 and SHA3 support", [FeatureNEON, FeatureSHA2], + "FEAT_SHA3", "+sha3,+sha2,+fp-armv8,+neon", 140>; + +def FeatureRAS : ExtensionWithMArch<"ras", "RAS", "FEAT_RAS, FEAT_RASv1p1", + "Enable ARMv8 Reliability, Availability and Serviceability Extensions">; let ArchExtKindSpelling = "AEK_FP16", MArchName = "fp16" in -def FeatureFullFP16 : Extension<"fullfp16",
[clang] [llvm] [AArch64] Add ability to list extensions enabled for a target (PR #95805)
@@ -19,3 +19,19 @@ // RUN: %clang --target=arm64 -mlittle-endian -march=armv8.1a -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-GENERICV81A %s // RUN: %clang --target=arm64 -mlittle-endian -march=armv8.1-a -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-GENERICV81A %s // ARM64-GENERICV81A: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "generic"{{.*}} "-target-feature" "+v8.1a"{{.*}} "-target-feature" "+neon" + +// = Architecture extensions = + +// RUN: %clang -target aarch64 -march=armv8.1-a --print-enabled-extensions 2>&1 | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s +// ARCH-EXTENSION: FEAT_ETE +// ARCH-EXTENSION: FEAT_LOR +// ARCH-EXTENSION: FEAT_TRBE +// ARCH-EXTENSION: FEAT_VHE +// ARCH-EXTENSION: FEAT_PAN +// ARCH-EXTENSION: FEAT_CRC32 +// FIXME: FEAT_FP is optional from v8.0a +// ARCH-EXTENSION: FEAT_FP +// ARCH-EXTENSION: FEAT_LSE +// ARCH-EXTENSION: FEAT_RDM +// FIXME: FEAT_AdvSIMD is optional from v8.0a davemgreen wrote: Why do these have FIXMEs? https://github.com/llvm/llvm-project/pull/95805 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang] Reland Add tanf16 builtin and support for tan constrained intrinsic (PR #94559)
davemgreen wrote: I believe they were added so long ago that the default Expanding wasn't done at the time. @efriedma-quic do you have more of an idea than that? https://github.com/llvm/llvm-project/pull/94559 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang] Reland Add tanf16 builtin and support for tan constrained intrinsic (PR #94559)
davemgreen wrote: Usually when new ISD nodes are added they are expanded for all types, so that every backend will get at least working code even if it is not optimal. The targets can then come along and override the defaults for the types they are interested in, to get better results. For tan I would expect most vector types would want to scalarize, so marking them as expand would make sense. If more types than are necessary get marked as Expand that shouldn't be an issue, it looks like we already do that for a number of other nodes. https://github.com/llvm/llvm-project/pull/94559 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang] Reland Add tanf16 builtin and support for tan constrained intrinsic (PR #94559)
davemgreen wrote: If you remove tan from isTriviallyVectorizable it should prevent vectorization in the short term. It might be better to default FTAN to expand in https://github.com/llvm/llvm-project/blob/64c9a1e1266ec7bc4c4896b2df116fa12dbacf15/llvm/lib/CodeGen/TargetLoweringBase.cpp#L960, which seems to only be done for f32/f64/f128 at the moment. https://github.com/llvm/llvm-project/pull/94559 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Add support for Cortex-A725 and Cortex-X925 (PR #95214)
https://github.com/davemgreen approved this pull request. Thanks! https://github.com/llvm/llvm-project/pull/95214 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Add support for Cortex-A725 and Cortex-X925 (PR #95214)
@@ -723,6 +746,9 @@ def ProcessorFeatures { FeaturePerfMon, FeatureETE, FeatureTRBE, FeatureSPE, FeatureMTE, FeatureSVE2BitPerm, FeatureFP16FML, FeatureSPE_EEF]; + list X925 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML, davemgreen wrote: Should this include FeatureSVE2BitPerm? It is included in the AEK_ list, and X4 the features. Same for the A725 features. https://github.com/llvm/llvm-project/pull/95214 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [ARM] Add support for Cortex-R52+ (PR #94633)
@@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-r52 | FileCheck %s --check-prefix=CHECK --check-prefix=USEAA +; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-r52plus | FileCheck %s --check-prefix=CHECK --check-prefix=USEAA davemgreen wrote: Adding these run lines for some of these tests may not be necessary for both r52 and r52+. I think I would drop it from the scheduling/useaa tests if it just uses the same scheduling model. https://github.com/llvm/llvm-project/pull/94633 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [ARM] Add support for Cortex-R52+ (PR #94633)
@@ -90,6 +90,8 @@ def ProcR7 : SubtargetFeature<"r7", "ARMProcFamily", "CortexR7", "Cortex-R7 ARM processors", []>; def ProcR52 : SubtargetFeature<"r52", "ARMProcFamily", "CortexR52", "Cortex-R52 ARM processors", []>; +def ProcR52plus : SubtargetFeature<"r52plus", "ARMProcFamily", "CortexR52plus", davemgreen wrote: This could maybe just use ProcR52 https://github.com/llvm/llvm-project/pull/94633 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [ARM] Add support for Cortex-R52+ (PR #94633)
https://github.com/davemgreen approved this pull request. LGTM, thanks https://github.com/llvm/llvm-project/pull/94633 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [ARM] Add support for Cortex-R52+ (PR #94633)
https://github.com/davemgreen edited https://github.com/llvm/llvm-project/pull/94633 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Decouple feature dependency expansion. (PR #94279)
davemgreen wrote: Yeah I had just seen that error message before you edited your comment. There are some examples of neon I found in a quick search, which were presumably added for AArch32: https://github.com/aaru-dps/Aaru.Checksums.Native/blob/bd5051ce181b225a7662bfb764ebcc5cbe7542b2/simd.h#L112 https://github.com/mooch443/commons/blob/30dc797430968831959d77d7f2503cec3518a13a/common/misc/PVBlob.cpp#L385 I'm not sure if that is reason enough to still support it. But like I said, if I try this patch locally then `target("neon")` seems to be accepted fine (no errors). It is the same for other features like `target("fullfp16")`, which seem to enable `+fullfp16` in the backend. `"noneon"` is no longer accepted, which might be fine as I don't believe negative features are commonly used. (For aarch64 from a baseline of armv8 they are mostly additive. They are likely to become more common going forward but new users can use the "right" attribute names). https://github.com/llvm/llvm-project/pull/94279 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Decouple feature dependency expansion. (PR #94279)
https://github.com/davemgreen commented: > LGTM. The main change to point out is that the target attribute will no > longer accept internal feature names. I don't think it should ever have done > so, but we should get input from others. @davemgreen? There are references to > existing code in [D137617](https://reviews.llvm.org/D137617) but no details. > If this has been used for e.g. intrinsics definitions, I am surprised there > are not more test failures. Hi - It was intentional to support older versions of clang. The target attributes already had users before I fixed them to support the same formats as GCC for AArch64, and was aiming at not breaking the existing code. IIRC there are quite a few uses of things like `target("crypto")` out there (without the + that gcc wants to include). I'm not sure if that extends to internal feature names a lot. Not supporting "neon" as a name would seem like a mistake if it was removed, but I don't believe this patch does that. If it only affects negative features those have never worked particularly well. https://github.com/llvm/llvm-project/pull/94279 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Add support for Cortex-A725 and Cortex-X925 (PR #93978)
@@ -863,6 +889,8 @@ def : ProcessorModel<"cortex-a720", NeoverseN2Model, ProcessorFeatures.A720, [TuneA720]>; def : ProcessorModel<"cortex-a720ae", NeoverseN2Model, ProcessorFeatures.A720AE, [TuneA720AE]>; +def : ProcessorModel<"cortex-a725", CortexA57Model, ProcessorFeatures.A725, davemgreen wrote: NeoverseN2Model https://github.com/llvm/llvm-project/pull/93978 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Add support for Cortex-A725 and Cortex-X925 (PR #93978)
@@ -877,6 +905,8 @@ def : ProcessorModel<"cortex-x3", NeoverseN2Model, ProcessorFeatures.X3, [TuneX3]>; def : ProcessorModel<"cortex-x4", NeoverseN2Model, ProcessorFeatures.X4, [TuneX4]>; +def : ProcessorModel<"cortex-x925", NeoverseN2Model, ProcessorFeatures.X925, davemgreen wrote: Maybe NeoverseV2Model, as the best fit even if it is quite different? (Some of the older cortex-x cores look like they could be using a newer model too, but that is a different issue). https://github.com/llvm/llvm-project/pull/93978 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [flang] [llvm] [mlir] [polly] [test]: fix filecheck annotation typos (PR #91854)
@@ -189,15 +189,15 @@ define i32 @shr(i32 %a, i32 %b) { define i1 @outer_and1(i1 %a) { -; check-label: @outer_and1( -; check-not: call i1 @and1 +; check-LABEL: @outer_and1( davemgreen wrote: I've regenerated the check lines in 220756f1f92b335cbafdff67c570d096a6925d87. https://github.com/llvm/llvm-project/pull/91854 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [flang] [llvm] [mlir] [polly] [test]: fix filecheck annotation typos (PR #91854)
@@ -121,7 +121,7 @@ define i32 @test_orr_extract_from_mul_1(i32 %x, i32 %y) { ; CHECK-THUMB-NEXT:orrs r0, r1 ; CHECK-THUMB-NEXT:bx lr entry: -; CHECk-THUMB: orrs r0, r1 davemgreen wrote: I believe we can delete this line, it was left in from the old checks. https://github.com/llvm/llvm-project/pull/91854 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [flang] [llvm] [mlir] [polly] [test]: fix filecheck annotation typos (PR #91854)
@@ -189,15 +189,15 @@ define i32 @shr(i32 %a, i32 %b) { define i1 @outer_and1(i1 %a) { -; check-label: @outer_and1( -; check-not: call i1 @and1 +; check-LABEL: @outer_and1( davemgreen wrote: Should all these be "CHECK"? https://github.com/llvm/llvm-project/pull/91854 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [flang] [llvm] [mlir] [polly] [test]: fix filecheck annotation typos (PR #91854)
@@ -217,42 +217,42 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) { } define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src) { -; check-label: volatile_load_v3i8_to_4xi32: +; check-LABEL: volatile_load_v3i8_to_4xi32: davemgreen wrote: I think we can delete these. The real check lines are below. https://github.com/llvm/llvm-project/pull/91854 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [flang] [llvm] [mlir] [polly] [test]: fix filecheck annotation typos (PR #91854)
@@ -22,7 +22,7 @@ define signext i8 @test1(i32 %A) { ; CHECK-V7: @ %bb.0: ; CHECK-V7-NEXT:sbfx r0, r0, #8, #8 ; CHECK-V7-NEXT:bx lr -; CHECk-V7: sbfx r0, r0, #8, #8 +; CHECK-V7: sbfx r0, r0, #8, #8 davemgreen wrote: I believe we can delete this line, it was left in from the old checks. https://github.com/llvm/llvm-project/pull/91854 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [flang] [llvm] [mlir] [polly] [test]: fix filecheck annotation typos (PR #91854)
https://github.com/davemgreen commented: The Arm/AArch64 tests look OK for the most part. I might be able to help with some of them if that is easier than trying to sort them all here. https://github.com/llvm/llvm-project/pull/91854 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [flang] [llvm] [mlir] [polly] [test]: fix filecheck annotation typos (PR #91854)
https://github.com/davemgreen edited https://github.com/llvm/llvm-project/pull/91854 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [lldb] [llvm] [AArch64][TargetParser] autogen ArchExtKind enum - renaming (PR #90320)
davemgreen wrote: Rust (https://github.com/rust-lang/rust/blob/79734f1db8dbe322192dea32c0f6b80ab14c4c1d/compiler/rustc_codegen_llvm/src/llvm_util.rs#L229) and zig (https://github.com/ziglang/zig/blob/44db92d1ca90c9cfdfb29fe46f04ff8f11c80901/lib/std/Target/aarch64.zig#L43) are two examples of what I meant by external dependencies. They can adapt, especially if there are release notes, but there may be many more projects out there using the old names and it would be good if they kept working, if we can. https://github.com/llvm/llvm-project/pull/90320 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [lldb] [llvm] [AArch64][TargetParser] autogen ArchExtKind enum - renaming (PR #90320)
davemgreen wrote: @tmatheson-arm reached out and we had a bit of a conversation internally. I do think that there is too much going on in this one pr to be sensible to review, but from what I've looked at my main points I think are: - Some AEK names get renamed in ways I would not expect them to, like AEK_FP16 being changed to AEK_FULLFP16. The command-line names or FEAT_ names should probably be what we are aiming for if we are changing them one-way or the other. - Some of the backend features have been renamed in ways that could cause breakages for external users, like +complxnum becoming +fcma. The new name is better, but ideally the old name would continue to work (maybe by adding an alias/extra target feature dependent on the new name? That might not work with negative features). - Some of the changes do look mechanical and a good change (P1->p1, V2->v2 etc), and if they were separate they would be easy to get in and out of the way of the contentious stuff that remains. - The ones that have changed should have release notes. https://github.com/llvm/llvm-project/pull/90320 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [ARM] Armv8-R does not require fp64 or neon. (PR #88287)
davemgreen wrote: > which change? Specifying -mcpu=cortex-r52 will behave the same way as before. > The original manual for the R52 provided for a no-neon sp-only variant, and > they exist in the wild, and this lets "architecture-generic" builds > automatically support both. I just meant the break in functionality for existing users of -march=armv8r, and the drop in performance from less R52 tuning features/scheduling and the lack of Neon. I believe this is more correct, but users in the past could still get the same results by specifying `-march=armv8r -mfpu=fpv5-sp-d16`. I'm hoping someone else can come along and have an opinion on it one way or another. https://github.com/llvm/llvm-project/pull/88287 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [lldb] [llvm] [AArch64][TargetParser] autogen ArchExtKind enum - renaming (PR #90320)
davemgreen wrote: > This is already split into 18 commits, I don't think there's any reason to > split it into 18 PRs, since comments on one of them likely apply to the > others. I disagree. This is going to be awkward for a lot of users of llvm and contains at least some details I don't agree with. I think it can cause a lot of subtle bugs and can end up wasting a lot of people's time. It should at least be awkward for us too. https://github.com/llvm/llvm-project/pull/90320 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [lldb] [llvm] [AArch64][TargetParser] autogen ArchExtKind enum - renaming (PR #90320)
davemgreen wrote: IMO This patch looks far too large to sensibly review and needs to be split up. A lot of the changes don't really look like mechanical renamings, and it is hard to see how they would not break existing uses of LLVM AArch64 target features? https://github.com/llvm/llvm-project/pull/90320 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] Reapply "[Clang][Sema] Diagnose class member access expressions naming non-existent members of the current instantiation prior to instantiation in the absence of dependent
davemgreen wrote: Hi - We've run into a couple of places where this causes problems, one of them in running Spec as above. Is it possible to turn off this error for older codebases with a flag, turning it into a warning? It doesn't seem like a very useful error if it applies to code that is never used. https://godbolt.org/z/Pf83EW7vE https://github.com/llvm/llvm-project/pull/90152 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Add support for Cortex-R82AE and improve Cortex-R82 (PR #90440)
https://github.com/davemgreen approved this pull request. Thanks. LGTM https://github.com/llvm/llvm-project/pull/90440 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Add support for Cortex-R82AE and improve Cortex-R82 (PR #90440)
@@ -632,7 +632,18 @@ inline constexpr CpuInfo CpuInfos[] = { AArch64::AEK_PAUTH, AArch64::AEK_SVE2BITPERM, AArch64::AEK_FLAGM, AArch64::AEK_PERFMON, AArch64::AEK_PREDRES, AArch64::AEK_PROFILE})}, -{"cortex-r82", ARMV8R, AArch64::ExtensionBitset({AArch64::AEK_LSE})}, +{"cortex-r82", ARMV8R, + AArch64::ExtensionBitset({AArch64::AEK_CRC, AArch64::AEK_DOTPROD, davemgreen wrote: Can you check if ARMV8R enables AEK_LSE? There is a flip(), which might be turning it off? https://github.com/llvm/llvm-project/pull/90440 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Add support for Cortex-R82AE and improve Cortex-R82 (PR #90440)
@@ -143,6 +143,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { case CortexA78AE: case CortexA78C: case CortexR82: + case CortexR82AE: davemgreen wrote: Can you move both of these into the block with CortexA55? It has always been in the wrong place compared to the other cpus around it and the alignments it sets. https://github.com/llvm/llvm-project/pull/90440 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Add support for Cortex-R82AE and improve Cortex-R82 (PR #90440)
@@ -632,7 +632,18 @@ inline constexpr CpuInfo CpuInfos[] = { AArch64::AEK_PAUTH, AArch64::AEK_SVE2BITPERM, AArch64::AEK_FLAGM, AArch64::AEK_PERFMON, AArch64::AEK_PREDRES, AArch64::AEK_PROFILE})}, -{"cortex-r82", ARMV8R, AArch64::ExtensionBitset({AArch64::AEK_LSE})}, +{"cortex-r82", ARMV8R, + AArch64::ExtensionBitset({AArch64::AEK_CRC, AArch64::AEK_DOTPROD, davemgreen wrote: Are these implied by ARMV8R? Is there an advantage to having them specified separately too? https://github.com/llvm/llvm-project/pull/90440 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Add support for Neoverse-N3, Neoverse-V3 and Neoverse-V3AE (PR #90143)
https://github.com/davemgreen approved this pull request. Thanks. I didn't check the enabled features but the tunings look good to me. https://github.com/llvm/llvm-project/pull/90143 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Add support for Neoverse-N3, Neoverse-V3 and Neoverse-V3AE (PR #90143)
@@ -447,6 +447,16 @@ def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2 FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; +def TuneNeoverseN3 : SubtargetFeature<"neoversen3", "ARMProcFamily", "NeoverseN3", + "Neoverse N3 ARM processors", [ + FeatureFuseAES, + FeaturePostRAScheduler, + FeatureCmpBccFusion, davemgreen wrote: Hi - Should FeatureCmpBccFusion be enabled over what is in N2? The core might well be able to fuse them, but I don't think that is new and from what I remember the performance was sometimes worse with it enabled. (I think llvm's implementation of FeatureCmpBccFusion might be a bit more aggressive than is helpful). https://github.com/llvm/llvm-project/pull/90143 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [ARM] Armv8-R does not require fp64 or neon. (PR #88287)
davemgreen wrote: I'm not sure I would make this change, mostly due to it potentially causing a break for existing users and making performance worse, but can see the reasoning. I am willing to defer to others if they have an opinion. https://github.com/llvm/llvm-project/pull/88287 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [ARM] Armv8-R does not require fp64 or neon. (PR #88287)
davemgreen wrote: As far as I understand this will remove the tuning we do for cortex-r52 when using armv8r, which will mean a little less performance but the tuning features in the Arm backend are not handled as well as they could be. Can you add a release note explaining what will change? Thanks. https://github.com/llvm/llvm-project/pull/88287 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [ARM] Armv8-R does not require fp64 or neon. (PR #88287)
davemgreen wrote: Does this disable neon by default for cortex-r52? If so I don't think it should be doing that, it would be a break in the existing behaviour, and should at least be in the release notes. The general rule for -mcpu options is that they should (roughly) enable the maximum set of features available, and from there can be disabled with -mfpu or +nofp options. Essentially the default is -mfpu=auto. Keeping to this keeps things consistent between cores, so long as people understand the rule. For architectures the default should be without, but as far as I understand we still default to -mfpu=auto. https://github.com/llvm/llvm-project/pull/88287 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Add support for Cortex-A520AE and Cortex-A720AE CPUs (PR #85401)
https://github.com/davemgreen approved this pull request. Thanks. LGTM https://github.com/llvm/llvm-project/pull/85401 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Add support for Cortex-A520AE and Cortex-A720AE CPUs (PR #85401)
@@ -58,6 +58,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { CortexA55, CortexA510, CortexA520, +CortexA520AE, davemgreen wrote: These might not be worth adding, considering they should be the same as CortexA520, and could reuse the same enum. https://github.com/llvm/llvm-project/pull/85401 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64] Add support for Cortex-A520AE and Cortex-A720AE CPUs (PR #85401)
@@ -67,6 +67,8 @@ Changes to Interprocedural Optimizations Changes to the AArch64 Backend -- +* Added support for Cortex-A520AE and Cortex-A720AE CPUs. davemgreen wrote: Could this have Cortex-A78AE too? https://github.com/llvm/llvm-project/pull/85401 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][AArch64] Enable fp128 for aarch64 linux target (PR #85070)
https://github.com/davemgreen commented: Hi - I think this looks sensible, considering that long double == fp128. Should we be doing the same for other OS's in this file too? https://github.com/llvm/llvm-project/pull/85070 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][driver] Allow unaligned access on ARMv7 and higher by default (PR #82400)
davemgreen wrote: I would also personally add Armv6 too for consistency, but don't have a strong opinion. https://github.com/llvm/llvm-project/pull/82400 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][driver] Allow unaligned access on ARMv7 and higher by default (PR #82400)
@@ -305,6 +305,17 @@ X86 Support Arm and AArch64 Support ^^^ +- ARMv7+ targets now default to allowing unaligned access, except Armv6-M, and + Armv8-M without the Main Extension. Baremetal targets should check that the + new default will work with their system configurations, since it requires + that SCTLR.A is 0, SCTLR.U is 1, and that the memory in question is + configured as "normal" memory. We've made the value judgment that the + performance gains here outweigh breakages, since it is difficult to identify + performance loss from disabling unaligned access, but incorrect enabling + unaligned access will generate an obvious alignment fault on ARMv7+. This is + also the default setting for ARM's downstream compilers. We have not changed + the default behavior for ARMv6, but may revisit that decision in the future. davemgreen wrote: I would up-play the compatibility argument and downplay the judgement call a little. And mention the way to disable it. Maybe something like ``` - ARMv7+ targets now default to allowing unaligned access, except Armv6-M, and Armv8-M without the Main Extension. Baremetal targets should check that the new default will work with their system configurations, since it requires that SCTLR.A is 0, SCTLR.U is 1, and that the memory in question is configured as "normal" memory. This brings clang in-line with the default settings for GCC and Arm Compiler. The old behaviour can be restored with -mno-unaligned-access. ``` But you might want to re-add the reasoning about the performance/codesize loss. https://github.com/llvm/llvm-project/pull/82400 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][driver] Allow unaligned access on ARMv7 and higher by default (PR #82400)
@@ -22,6 +22,12 @@ // RUN: %clang -target armv7-windows -### %s 2> %t // RUN: FileCheck --check-prefix=CHECK-UNALIGNED-ARM < %t %s +// RUN: %clang --target=armv6 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-ALIGNED-ARM < %t %s + +// RUN: %clang --target=armv7 -### %s 2> %t davemgreen wrote: Can you add some extra tests for things like 8-m.main, 8-m.base and 6-m, to make sure we have coverage? https://github.com/llvm/llvm-project/pull/82400 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][driver] Allow unaligned access on ARMv7 and higher by default (PR #82400)
https://github.com/davemgreen edited https://github.com/llvm/llvm-project/pull/82400 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][driver] Allow unaligned access on ARMv7 and higher by default (PR #82400)
https://github.com/davemgreen commented: > Unaligned accesses require that SCTLR.A is 0, SCTLR.U is 1, and that the > memory in question is configured as "normal" memory. Almost all operating > systems do in fact configure their registers/memory this way, but on > baremetal it's not really a safe assumption. Changing the default here is > basically guaranteed to break someone's code. > > We could make the value judgement that the performance gain outweighs > breaking someone's code. Disabled unaligned access is a performance hit users > have a hard time discovering; incorrectly enabled unaligned access will > generate an obvious alignment fault on v7 and up. > > On pre-v7 processors, you can get the old "rotate" behavior instead of a > fault. This is controlled by SCTLR.U on v6, but I don't think there's any > reason to expect the bit is configured the "right" way on baremetal. So > changing the default for v6 is a bit more dubious. > > The tradeoffs might be a bit different for M-class processors; I think > unaligned access works by default there (except for v6m and v8m.baseline). > > This change needs a release note. The issue that we have found is that as both GCC and Arm Compiler default to -munaligned-access, users expect it to be the default. They only notice the much bigger codesize/worse performance and don't understand the reason without a lot of digging. You are certainly right that someone who has been using clang bare metal in the past might hit problems with the new default, but there is a high chance they are just using the wrong option without noticing. And I believe aligning with GCC/Arm Compiler is a better default going forward, as more people start using Clang in bare metal. Hopefully the release note can at least make it clear. M-Profile needs a bit set in the bootcode, IIRC. https://github.com/llvm/llvm-project/pull/82400 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][driver] Allow unaligned access on ARMv7 and higher by default (PR #82400)
@@ -895,19 +895,17 @@ llvm::ARM::FPUKind arm::getARMTargetFeatures(const Driver , // defaults this bit to 0 and handles it as a system-wide (not // per-process) setting. It is therefore safe to assume that ARMv7+ // Linux targets support unaligned accesses. The same goes for NaCl -// and Windows. -// -// The above behavior is consistent with GCC. +// and Windows. However, ARM's forks of GCC and Clang both allow +// unaligned accesses by default for all targets. We follow this +// behavior and enable unaligned accesses by default for ARMv7+ targets. +// Users can disable behavior via compiler options (-mno-unaliged-access). +// See https://github.com/llvm/llvm-project/issues/59560 for more info. davemgreen wrote: Also, I would drop the reference to the github issue from the comment. People can inspect the git history if they need to go looking for a reason behind it. https://github.com/llvm/llvm-project/pull/82400 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang][driver] Allow unaligned access on ARMv7 and higher by default (PR #82400)
davemgreen wrote: Hi - I like the change. We have this code in the downstream compiler, which also enables this for Armv6, but specifically disables it for v6m and v8m.baseline. ``` if (VersionNum < 6 || Triple.getSubArch() == llvm::Triple::SubArchType::ARMSubArch_v6m || Triple.getSubArch() == llvm::Triple::SubArchType::ARMSubArch_v8m_baseline) { Features.push_back("+strict-align"); } ``` I don't have a strong opinion about what happens with ARMv6, but this deserves a release note. https://github.com/llvm/llvm-project/pull/82400 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [ARM] __ARM_ARCH macro definition fix (PR #81493)
davemgreen wrote: I'm a little worried people might be relying on the existing behaviour, with both clang and GCC having this wrong for a while. If we are going to do it can you add a release note to clang explaining the new behaviour? https://github.com/llvm/llvm-project/pull/81493 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang-tools-extra] [clang] [AArch64] Implement -fno-plt for SelectionDAG/GlobalISel (PR #78890)
@@ -201,17 +201,27 @@ define dso_local void @rv_marker_3() personality ptr @__gxx_personality_v0 { ; GISEL-NEXT:bl _objc_object ; GISEL-NEXT: Ltmp1: ; GISEL-NEXT: ; %bb.1: ; %invoke.cont -; GISEL-NEXT:ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; GISEL-NEXT: Lloh0: +; GISEL-NEXT:adrp x1, _objc_release@GOTPAGE ; GISEL-NEXT:mov x0, x19 +; GISEL-NEXT: Lloh1: +; GISEL-NEXT:ldr x1, [x1, _objc_release@GOTPAGEOFF] +; GISEL-NEXT:ldp x29, x30, [sp, #16] ; 16-byte Folded Reload ; GISEL-NEXT:ldp x20, x19, [sp], #32 ; 16-byte Folded Reload -; GISEL-NEXT:b _objc_release +; GISEL-NEXT:br x1 davemgreen wrote: @fhahn, @TNorthover do these sound OK to you? https://github.com/llvm/llvm-project/pull/78890 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [clang-tools-extra] [compiler-rt] [llvm] [flang] [TTI]Fallback to SingleSrcPermute shuffle kind, if no direct estimation for (PR #79837)
davemgreen wrote: I think this is probably OK for Arm & AArch64. In the long run we should ideally be adding better extract subvector costs, but this patch moves the cost in that direction. https://github.com/llvm/llvm-project/pull/79837 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[flang] [compiler-rt] [clang] [llvm] [mlir] [clang-tools-extra] [lldb] [libc] [libcxx] [AArch64] add intrinsic to generate a bfi instruction (PR #79672)
davemgreen wrote: I see. The issue is that the opposite is often true as well - if we add a target specific intrinsic for this then, whilst we get a single instruction being emitted, we don't see all the other optimizations that the compiler can and should be performing. Things like constant folding, combining into other instructions, known-bits analysis or any form of vectorization will all be blocked by the intrinsic. It can take quite some work to add all those features in (if they are possible), and without them can potentially lead to worse results. Plus more things to maintain. BFI isn't a trivial instruction to match as it involves certain masks and shifts. There might certainly be advantages to having an intrinsic. I would like to try and see what the problems would be with generated code using normal operations first though, if we can. If there are optimizations we can make based on the existing code then that would help in all cases (c, mlir, rust, etc), not just frontends that are producing the intrinsics. https://github.com/llvm/llvm-project/pull/79672 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AArch64][TargetParser] Add mcpu alias for Microsoft Azure Cobalt 100. (PR #79614)
https://github.com/davemgreen approved this pull request. Thanks. LGTM too. https://github.com/llvm/llvm-project/pull/79614 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [compiler-rt] [libc] [flang] [mlir] [libcxx] [lldb] [llvm] [clang] [AArch64] add intrinsic to generate a bfi instruction (PR #79672)
davemgreen wrote: OK. We would not usually add intrinsics like this without a strong motivating case, that could not be optimized in some other way. It is better to use target independent options when available, and inline assembly is available as a fallback if it is really needed. But I would recommend that they use normal and/or/shift operations and let us know about places the compiler isn't optimizing them as well as it could be. https://github.com/llvm/llvm-project/pull/79672 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[libc] [clang-tools-extra] [lldb] [flang] [mlir] [llvm] [clang] [compiler-rt] [libcxx] [AArch64] add intrinsic to generate a bfi instruction (PR #79672)
davemgreen wrote: Hello. Can you explain why this is needed, as opposed to using the equivalent shift/and/ors? https://github.com/llvm/llvm-project/pull/79672 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AArch64][TargetParser] Add mcpu alias for Microsoft Azure Cobalt 100. (PR #79614)
davemgreen wrote: It looks like this needs to update testAArch64CPUArchList too. Otherwise it LGTM https://github.com/llvm/llvm-project/pull/79614 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [TargetParser] Define AEK_FCMA and AEK_JSCVT for tsv110 (PR #75516)
https://github.com/davemgreen approved this pull request. https://github.com/ARM-software/acle/pull/279 was committed recently, where I think this lines up with the final version of it. I think this LGTM in that case. https://github.com/llvm/llvm-project/pull/75516 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [llvm] [clang] [AArch64][SVE2] Lower OR to SLI/SRI (PR #77555)
https://github.com/davemgreen approved this pull request. LGTM https://github.com/llvm/llvm-project/pull/77555 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [ARM] arm_acle.h add Coprocessor Instrinsics (PR #75440)
https://github.com/davemgreen approved this pull request. Thanks. LGTM https://github.com/llvm/llvm-project/pull/75440 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,816 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This pass implements a pass that recognizes certain loop idioms and +// transforms them into more optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win. +// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +// while (++i != n) { +//if (a[i] != b[i]) +// break; +// } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. However, even with these checks it is still profitable to do the +// transformation. +// +//===--===// +// +// TODO List: +// +// * When optimizing for code size we may want to avoid some transformations. +// * We can also support the inverse case where we scan for a matching element. 
+// +//===--===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt +DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt DisableByteCmp( +"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), +cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +static cl::opt VerifyLoops( +"aarch64-lit-verify", cl::Hidden, cl::init(false), +cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl ); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA, +GetElementPtrInst *GEPB, Instruction *Index, +Value *Start, Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, 
+PHINode *IndPhi, Value *MaxLen, Instruction *Index, +Value *Start, bool IncIdx, BasicBlock *FoundBB, +BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { +initializeAArch64LoopIdiomTransformLegacyPassPass( +*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { +return "Transform AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage ) const override { +AU.addRequired(); +AU.addRequired(); +AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager ) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, +LPPassManager ) { + + if (skipLoop(L)) +return false; + + auto *DT = ().getDomTree(); + auto *LI = ().getLoopInfo(); + auto = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, , >getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char
[llvm] [clang] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
https://github.com/davemgreen approved this pull request. Thanks for the updates. From what I can tell this LGTM, but it will need a rebase. You might want to commit it with the option disabled, and then flip the switch in a followup to avoid the commit-revert cycles in case there are any issues. https://github.com/llvm/llvm-project/pull/72273 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] [clang] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
https://github.com/davemgreen edited https://github.com/llvm/llvm-project/pull/72273 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [ARM] arm_acle.h add Coprocessor Instrinsics (PR #75440)
davemgreen wrote: If you can make armv9-a work the same as armv8-a and add some tests for it then this LGTM https://github.com/llvm/llvm-project/pull/75440 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [ARM] arm_acle.h add Coprocessor Instrinsics (PR #75440)
@@ -836,6 +837,70 @@ void ARMTargetInfo::getTargetDefines(const LangOptions , if (Opts.RWPI) Builder.defineMacro("__ARM_RWPI", "1"); + // Macros for enabling co-proc intrinsics + uint64_t FeatureCoprocBF = 0; + switch (ArchKind) { + default: +break; + case llvm::ARM::ArchKind::ARMV4: +// Filter __arm_ldcl and __arm_stcl in acle.h +FeatureCoprocBF = FEATURE_COPROC_B1; +break; + case llvm::ARM::ArchKind::ARM5T: +FeatureCoprocBF = isThumb() ? 0 : FEATURE_COPROC_B1; +break; + case llvm::ARM::ArchKind::ARMV5TE: + case llvm::ARM::ArchKind::ARMV5TEJ: +if (!isThumb()) + FeatureCoprocBF = + FEATURE_COPROC_B1 | FEATURE_COPROC_B2 | FEATURE_COPROC_B3; +break; + case llvm::ARM::ArchKind::ARMV6: + case llvm::ARM::ArchKind::ARMV6K: + case llvm::ARM::ArchKind::ARMV6KZ: + case llvm::ARM::ArchKind::ARMV6T2: +if (!isThumb() || ArchKind == llvm::ARM::ArchKind::ARMV6T2) + FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B2 | +FEATURE_COPROC_B3 | FEATURE_COPROC_B4; +break; + case llvm::ARM::ArchKind::ARMV7A: + case llvm::ARM::ArchKind::ARMV7R: + case llvm::ARM::ArchKind::ARMV7M: + case llvm::ARM::ArchKind::ARMV7S: + case llvm::ARM::ArchKind::ARMV7EM: +FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B2 | + FEATURE_COPROC_B3 | FEATURE_COPROC_B4; +break; + case llvm::ARM::ArchKind::ARMV8A: + case llvm::ARM::ArchKind::ARMV8R: + case llvm::ARM::ArchKind::ARMV8_1A: + case llvm::ARM::ArchKind::ARMV8_2A: + case llvm::ARM::ArchKind::ARMV8_3A: + case llvm::ARM::ArchKind::ARMV8_4A: + case llvm::ARM::ArchKind::ARMV8_5A: + case llvm::ARM::ArchKind::ARMV8_6A: + case llvm::ARM::ArchKind::ARMV8_7A: + case llvm::ARM::ArchKind::ARMV8_8A: + case llvm::ARM::ArchKind::ARMV8_9A: +// Filter __arm_cdp, __arm_ldcl, __arm_stcl in arm_acle.h +FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B3; +break; + case llvm::ARM::ArchKind::ARMV8MMainline: +FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B2 | + FEATURE_COPROC_B3 | FEATURE_COPROC_B4; +break; + case llvm::ARM::ArchKind::ARMV9A: + case 
llvm::ARM::ArchKind::ARMV9_1A: + case llvm::ARM::ArchKind::ARMV9_2A: + case llvm::ARM::ArchKind::ARMV9_3A: + case llvm::ARM::ArchKind::ARMV9_4A: davemgreen wrote: Oh right, ARMV9_5A is AArch64 only. That's OK then. I would expect the other ArmV9-A cases to be the same as ArmV8-A for AArch32, and wouldn't have expected a change in coprocessor instructions. The reference manual is at https://developer.arm.com/documentation/ddi0487/ja/?lang=en and doesn't seem to mention cdp. https://github.com/llvm/llvm-project/pull/75440 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [ARM] arm_acle.h add Coprocessor Intrinsics (PR #75440)
https://github.com/davemgreen edited https://github.com/llvm/llvm-project/pull/75440 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [ARM] arm_acle.h add Coprocessor Intrinsics (PR #75440)
@@ -756,6 +756,58 @@ __arm_st64bv0(void *__addr, data512_t __value) { __builtin_arm_mops_memset_tag(__tagged_address, __value, __size) #endif +/* Coprocessor Intrinsics */ +#if defined(__ARM_FEATURE_COPROC) + +#if (__ARM_FEATURE_COPROC & 0x1) + +#if (__ARM_ARCH != 8) davemgreen wrote: Could this be < 8? This doesn't apply to 8-m.main, right? The test looks OK. https://github.com/llvm/llvm-project/pull/75440 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [ARM] arm_acle.h add Coprocessor Intrinsics (PR #75440)
@@ -836,6 +837,70 @@ void ARMTargetInfo::getTargetDefines(const LangOptions , if (Opts.RWPI) Builder.defineMacro("__ARM_RWPI", "1"); + // Macros for enabling co-proc intrinsics + uint64_t FeatureCoprocBF = 0; + switch (ArchKind) { + default: +break; + case llvm::ARM::ArchKind::ARMV4: +// Filter __arm_ldcl and __arm_stcl in acle.h +FeatureCoprocBF = FEATURE_COPROC_B1; +break; + case llvm::ARM::ArchKind::ARM5T: +FeatureCoprocBF = isThumb() ? 0 : FEATURE_COPROC_B1; +break; + case llvm::ARM::ArchKind::ARMV5TE: + case llvm::ARM::ArchKind::ARMV5TEJ: +if (!isThumb()) + FeatureCoprocBF = + FEATURE_COPROC_B1 | FEATURE_COPROC_B2 | FEATURE_COPROC_B3; +break; + case llvm::ARM::ArchKind::ARMV6: + case llvm::ARM::ArchKind::ARMV6K: + case llvm::ARM::ArchKind::ARMV6KZ: + case llvm::ARM::ArchKind::ARMV6T2: +if (!isThumb() || ArchKind == llvm::ARM::ArchKind::ARMV6T2) + FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B2 | +FEATURE_COPROC_B3 | FEATURE_COPROC_B4; +break; + case llvm::ARM::ArchKind::ARMV7A: + case llvm::ARM::ArchKind::ARMV7R: + case llvm::ARM::ArchKind::ARMV7M: + case llvm::ARM::ArchKind::ARMV7S: + case llvm::ARM::ArchKind::ARMV7EM: +FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B2 | + FEATURE_COPROC_B3 | FEATURE_COPROC_B4; +break; + case llvm::ARM::ArchKind::ARMV8A: + case llvm::ARM::ArchKind::ARMV8R: + case llvm::ARM::ArchKind::ARMV8_1A: + case llvm::ARM::ArchKind::ARMV8_2A: + case llvm::ARM::ArchKind::ARMV8_3A: + case llvm::ARM::ArchKind::ARMV8_4A: + case llvm::ARM::ArchKind::ARMV8_5A: + case llvm::ARM::ArchKind::ARMV8_6A: + case llvm::ARM::ArchKind::ARMV8_7A: + case llvm::ARM::ArchKind::ARMV8_8A: + case llvm::ARM::ArchKind::ARMV8_9A: +// Filter __arm_cdp, __arm_ldcl, __arm_stcl in arm_acle.h +FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B3; +break; + case llvm::ARM::ArchKind::ARMV8MMainline: +FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B2 | + FEATURE_COPROC_B3 | FEATURE_COPROC_B4; +break; + case llvm::ARM::ArchKind::ARMV9A: + case 
llvm::ARM::ArchKind::ARMV9_1A: + case llvm::ARM::ArchKind::ARMV9_2A: + case llvm::ARM::ArchKind::ARMV9_3A: + case llvm::ARM::ArchKind::ARMV9_4A: davemgreen wrote: There is a ARMV9_5A now too. I think I would expect these to be the same as ARMV8. Is this switch statement exhaustive? Could the default case be made the same as ARMV8 so we don't need to extend it every time an architecture is added? https://github.com/llvm/llvm-project/pull/75440 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [ARM] arm_acle.h add Coprocessor Intrinsics (PR #75440)
@@ -836,6 +837,70 @@ void ARMTargetInfo::getTargetDefines(const LangOptions , if (Opts.RWPI) Builder.defineMacro("__ARM_RWPI", "1"); + // Macros for enabling co-proc intrinsics + uint64_t FeatureCoprocBF = 0; + switch (ArchKind) { + default: +break; + case llvm::ARM::ArchKind::ARMV4: +// Filter __arm_ldcl and __arm_stcl in acle.h +FeatureCoprocBF = FEATURE_COPROC_B1; +break; + case llvm::ARM::ArchKind::ARM5T: +FeatureCoprocBF = isThumb() ? 0 : FEATURE_COPROC_B1; +break; + case llvm::ARM::ArchKind::ARMV5TE: + case llvm::ARM::ArchKind::ARMV5TEJ: +if (!isThumb()) + FeatureCoprocBF = + FEATURE_COPROC_B1 | FEATURE_COPROC_B2 | FEATURE_COPROC_B3; +break; + case llvm::ARM::ArchKind::ARMV6: + case llvm::ARM::ArchKind::ARMV6K: + case llvm::ARM::ArchKind::ARMV6KZ: + case llvm::ARM::ArchKind::ARMV6T2: +if (!isThumb() || ArchKind == llvm::ARM::ArchKind::ARMV6T2) + FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B2 | +FEATURE_COPROC_B3 | FEATURE_COPROC_B4; +break; + case llvm::ARM::ArchKind::ARMV7A: + case llvm::ARM::ArchKind::ARMV7R: + case llvm::ARM::ArchKind::ARMV7M: + case llvm::ARM::ArchKind::ARMV7S: + case llvm::ARM::ArchKind::ARMV7EM: +FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B2 | + FEATURE_COPROC_B3 | FEATURE_COPROC_B4; +break; + case llvm::ARM::ArchKind::ARMV8A: + case llvm::ARM::ArchKind::ARMV8R: + case llvm::ARM::ArchKind::ARMV8_1A: + case llvm::ARM::ArchKind::ARMV8_2A: + case llvm::ARM::ArchKind::ARMV8_3A: + case llvm::ARM::ArchKind::ARMV8_4A: + case llvm::ARM::ArchKind::ARMV8_5A: + case llvm::ARM::ArchKind::ARMV8_6A: + case llvm::ARM::ArchKind::ARMV8_7A: + case llvm::ARM::ArchKind::ARMV8_8A: + case llvm::ARM::ArchKind::ARMV8_9A: +// Filter __arm_cdp, __arm_ldcl, __arm_stcl in arm_acle.h +FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B3; +break; + case llvm::ARM::ArchKind::ARMV8MMainline: davemgreen wrote: Add ARMV8_1MMainline too. 
https://github.com/llvm/llvm-project/pull/75440 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [ARM] arm_acle.h add Coprocessor Intrinsics (PR #75440)
https://github.com/davemgreen commented: Thanks. This is looking good to me. I just have a few comments about different architecture revisions. https://github.com/llvm/llvm-project/pull/75440 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [ARM] arm_acle.h add Coprocessor Intrinsics (PR #75440)
davemgreen wrote: Thanks for doing this. I think that __ARM_FEATURE_COPROC should be a bitfield, as defined in https://arm-software.github.io/acle/main/acle.html#coprocessor-intrinsics. That would remove the need for the other macros. https://github.com/llvm/llvm-project/pull/75440 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [ARM] arm_acle.h add Coprocessor Intrinsics (PR #75440)
davemgreen wrote: This is the downstream code we have: https://gist.github.com/davemgreen/e7ade833274a60e975e67a66eda7cb44 Note that the __ARM_TARGET_COPROC_XYZ macros are probably wrong. They should be __ARM_FEATURE_COPROC bitfield macros according to the ACLE. Can you make use of some of that? It would be good to add the macro definition at the same time as the intrinsics (they can be used to control when the intrinsics are available), and the test should be useful for checking they are available at the right times. https://github.com/llvm/llvm-project/pull/75440 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [ARM] arm_acle.h add Coprocessor Intrinsics (PR #75440)
davemgreen wrote: Let me try and get the downstream version, you might be able to pick up some things from it. A test at least should probably be present. https://github.com/llvm/llvm-project/pull/75440 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang-tools-extra] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,816 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This pass implements a pass that recognizes certain loop idioms and +// transforms them into more optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win. +// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +// while (++i != n) { +//if (a[i] != b[i]) +// break; +// } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. However, even with these checks it is still profitable to do the +// transformation. +// +//===--===// +// +// TODO List: +// +// * When optimizing for code size we may want to avoid some transformations. +// * We can also support the inverse case where we scan for a matching element. 
+// +//===--===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt +DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt DisableByteCmp( +"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), +cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +static cl::opt VerifyLoops( +"aarch64-lit-verify", cl::Hidden, cl::init(false), +cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl ); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA, +GetElementPtrInst *GEPB, Instruction *Index, +Value *Start, Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, 
+PHINode *IndPhi, Value *MaxLen, Instruction *Index, +Value *Start, bool IncIdx, BasicBlock *FoundBB, +BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { +initializeAArch64LoopIdiomTransformLegacyPassPass( +*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { +return "Transform AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage ) const override { +AU.addRequired(); +AU.addRequired(); +AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager ) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, +LPPassManager ) { + + if (skipLoop(L)) +return false; + + auto *DT = ().getDomTree(); + auto *LI = ().getLoopInfo(); + auto = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, , >getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char
[llvm] [clang-tools-extra] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,816 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This pass implements a pass that recognizes certain loop idioms and +// transforms them into more optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win. +// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +// while (++i != n) { +//if (a[i] != b[i]) +// break; +// } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. However, even with these checks it is still profitable to do the +// transformation. +// +//===--===// +// +// TODO List: +// +// * When optimizing for code size we may want to avoid some transformations. +// * We can also support the inverse case where we scan for a matching element. 
+// +//===--===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt +DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt DisableByteCmp( +"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), +cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +static cl::opt VerifyLoops( +"aarch64-lit-verify", cl::Hidden, cl::init(false), +cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl ); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA, +GetElementPtrInst *GEPB, Instruction *Index, +Value *Start, Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, 
+PHINode *IndPhi, Value *MaxLen, Instruction *Index, +Value *Start, bool IncIdx, BasicBlock *FoundBB, +BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { +initializeAArch64LoopIdiomTransformLegacyPassPass( +*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { +return "Transform AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage ) const override { +AU.addRequired(); +AU.addRequired(); +AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager ) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, +LPPassManager ) { + + if (skipLoop(L)) +return false; + + auto *DT = ().getDomTree(); + auto *LI = ().getLoopInfo(); + auto = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, , >getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char
[clang] [clang-tools-extra] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,816 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This pass implements a pass that recognizes certain loop idioms and +// transforms them into more optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win. +// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +// while (++i != n) { +//if (a[i] != b[i]) +// break; +// } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. However, even with these checks it is still profitable to do the +// transformation. +// +//===--===// +// +// TODO List: +// +// * When optimizing for code size we may want to avoid some transformations. +// * We can also support the inverse case where we scan for a matching element. 
+// +//===--===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt +DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt DisableByteCmp( +"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), +cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +static cl::opt VerifyLoops( +"aarch64-lit-verify", cl::Hidden, cl::init(false), +cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl ); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA, +GetElementPtrInst *GEPB, Instruction *Index, +Value *Start, Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, 
+PHINode *IndPhi, Value *MaxLen, Instruction *Index, +Value *Start, bool IncIdx, BasicBlock *FoundBB, +BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { +initializeAArch64LoopIdiomTransformLegacyPassPass( +*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { +return "Transform AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage ) const override { +AU.addRequired(); +AU.addRequired(); +AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager ) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, +LPPassManager ) { + + if (skipLoop(L)) +return false; + + auto *DT = ().getDomTree(); + auto *LI = ().getLoopInfo(); + auto = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, , >getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char
[llvm] [clang-tools-extra] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,816 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This pass implements a pass that recognizes certain loop idioms and +// transforms them into more optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win. +// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +// while (++i != n) { +//if (a[i] != b[i]) +// break; +// } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. However, even with these checks it is still profitable to do the +// transformation. +// +//===--===// +// +// TODO List: +// +// * When optimizing for code size we may want to avoid some transformations. +// * We can also support the inverse case where we scan for a matching element. 
+// +//===--===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt +DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt DisableByteCmp( +"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), +cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +static cl::opt VerifyLoops( +"aarch64-lit-verify", cl::Hidden, cl::init(false), +cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl ); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA, +GetElementPtrInst *GEPB, Instruction *Index, +Value *Start, Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, 
+PHINode *IndPhi, Value *MaxLen, Instruction *Index, +Value *Start, bool IncIdx, BasicBlock *FoundBB, +BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { +initializeAArch64LoopIdiomTransformLegacyPassPass( +*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { +return "Transform AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage ) const override { +AU.addRequired(); +AU.addRequired(); +AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager ) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, +LPPassManager ) { + + if (skipLoop(L)) +return false; + + auto *DT = ().getDomTree(); + auto *LI = ().getLoopInfo(); + auto = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, , >getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char
[clang] [clang-tools-extra] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,816 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This pass implements a pass that recognizes certain loop idioms and +// transforms them into more optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win. +// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +// while (++i != n) { +//if (a[i] != b[i]) +// break; +// } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. However, even with these checks it is still profitable to do the +// transformation. +// +//===--===// +// +// TODO List: +// +// * When optimizing for code size we may want to avoid some transformations. +// * We can also support the inverse case where we scan for a matching element. 
+// +//===--===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt +DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt DisableByteCmp( +"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), +cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +static cl::opt VerifyLoops( +"aarch64-lit-verify", cl::Hidden, cl::init(false), +cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl ); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA, +GetElementPtrInst *GEPB, Instruction *Index, +Value *Start, Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, 
+PHINode *IndPhi, Value *MaxLen, Instruction *Index, +Value *Start, bool IncIdx, BasicBlock *FoundBB, +BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { +initializeAArch64LoopIdiomTransformLegacyPassPass( +*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { +return "Transform AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage ) const override { +AU.addRequired(); +AU.addRequired(); +AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager ) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, +LPPassManager ) { + + if (skipLoop(L)) +return false; + + auto *DT = ().getDomTree(); + auto *LI = ().getLoopInfo(); + auto = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, , >getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char
[clang] [llvm] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,816 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This pass implements a pass that recognizes certain loop idioms and +// transforms them into more optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win. +// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +// while (++i != n) { +//if (a[i] != b[i]) +// break; +// } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. However, even with these checks it is still profitable to do the +// transformation. +// +//===--===// +// +// TODO List: +// +// * When optimizing for code size we may want to avoid some transformations. +// * We can also support the inverse case where we scan for a matching element. 
+// +//===--===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt +DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt DisableByteCmp( +"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), +cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +static cl::opt VerifyLoops( +"aarch64-lit-verify", cl::Hidden, cl::init(false), +cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl ); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA, +GetElementPtrInst *GEPB, Instruction *Index, +Value *Start, Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, 
+PHINode *IndPhi, Value *MaxLen, Instruction *Index, +Value *Start, bool IncIdx, BasicBlock *FoundBB, +BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { +initializeAArch64LoopIdiomTransformLegacyPassPass( +*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { +return "Transform AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage ) const override { +AU.addRequired(); +AU.addRequired(); +AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager ) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, +LPPassManager ) { + + if (skipLoop(L)) +return false; + + auto *DT = ().getDomTree(); + auto *LI = ().getLoopInfo(); + auto = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, , >getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char
[clang-tools-extra] [llvm] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,816 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This pass implements a pass that recognizes certain loop idioms and +// transforms them into more optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win. +// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +// while (++i != n) { +//if (a[i] != b[i]) +// break; +// } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. However, even with these checks it is still profitable to do the +// transformation. +// +//===--===// +// +// TODO List: +// +// * When optimizing for code size we may want to avoid some transformations. +// * We can also support the inverse case where we scan for a matching element. 
+// +//===--===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt +DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt DisableByteCmp( +"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), +cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +static cl::opt VerifyLoops( +"aarch64-lit-verify", cl::Hidden, cl::init(false), +cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl ); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA, +GetElementPtrInst *GEPB, Instruction *Index, +Value *Start, Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, 
+PHINode *IndPhi, Value *MaxLen, Instruction *Index, +Value *Start, bool IncIdx, BasicBlock *FoundBB, +BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { +initializeAArch64LoopIdiomTransformLegacyPassPass( +*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { +return "Transform AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage ) const override { +AU.addRequired(); +AU.addRequired(); +AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager ) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, +LPPassManager ) { + + if (skipLoop(L)) +return false; + + auto *DT = ().getDomTree(); + auto *LI = ().getLoopInfo(); + auto = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, , >getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char
[clang] [clang-tools-extra] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,816 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This pass implements a pass that recognizes certain loop idioms and +// transforms them into more optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win. +// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +// while (++i != n) { +//if (a[i] != b[i]) +// break; +// } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. However, even with these checks it is still profitable to do the +// transformation. +// +//===--===// +// +// TODO List: +// +// * When optimizing for code size we may want to avoid some transformations. +// * We can also support the inverse case where we scan for a matching element. 
+// +//===--===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt +DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt DisableByteCmp( +"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), +cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +static cl::opt VerifyLoops( +"aarch64-lit-verify", cl::Hidden, cl::init(false), +cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl ); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA, +GetElementPtrInst *GEPB, Instruction *Index, +Value *Start, Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, 
+PHINode *IndPhi, Value *MaxLen, Instruction *Index, +Value *Start, bool IncIdx, BasicBlock *FoundBB, +BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { +initializeAArch64LoopIdiomTransformLegacyPassPass( +*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { +return "Transform AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage ) const override { +AU.addRequired(); +AU.addRequired(); +AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager ) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, +LPPassManager ) { + + if (skipLoop(L)) +return false; + + auto *DT = ().getDomTree(); + auto *LI = ().getLoopInfo(); + auto = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, , >getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char
[llvm] [clang-tools-extra] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,816 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This pass implements a pass that recognizes certain loop idioms and +// transforms them into more optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win. +// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +// while (++i != n) { +//if (a[i] != b[i]) +// break; +// } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. However, even with these checks it is still profitable to do the +// transformation. +// +//===--===// +// +// TODO List: +// +// * When optimizing for code size we may want to avoid some transformations. +// * We can also support the inverse case where we scan for a matching element. 
+// +//===--===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt +DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt DisableByteCmp( +"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), +cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +static cl::opt VerifyLoops( +"aarch64-lit-verify", cl::Hidden, cl::init(false), +cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl ); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA, +GetElementPtrInst *GEPB, Instruction *Index, +Value *Start, Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, 
+PHINode *IndPhi, Value *MaxLen, Instruction *Index, +Value *Start, bool IncIdx, BasicBlock *FoundBB, +BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { +initializeAArch64LoopIdiomTransformLegacyPassPass( +*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { +return "Transform AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage ) const override { +AU.addRequired(); +AU.addRequired(); +AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager ) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, +LPPassManager ) { + + if (skipLoop(L)) +return false; + + auto *DT = ().getDomTree(); + auto *LI = ().getLoopInfo(); + auto = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, , >getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char
[clang] [clang-tools-extra] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
https://github.com/davemgreen commented: Thanks. I think it is worth trying to get this in. I already see it triggering in a number of places, it might be worth working on making it a little more generic in followup patches if we can, but there is already quite a bit going on. https://github.com/llvm/llvm-project/pull/72273 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang-tools-extra] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,816 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This pass implements a pass that recognizes certain loop idioms and +// transforms them into more optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win. +// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +// while (++i != n) { +//if (a[i] != b[i]) +// break; +// } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. However, even with these checks it is still profitable to do the +// transformation. +// +//===--===// +// +// TODO List: +// +// * When optimizing for code size we may want to avoid some transformations. +// * We can also support the inverse case where we scan for a matching element. 
+// +//===--===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt +DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt DisableByteCmp( +"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), +cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +static cl::opt VerifyLoops( +"aarch64-lit-verify", cl::Hidden, cl::init(false), +cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl ); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA, +GetElementPtrInst *GEPB, Instruction *Index, +Value *Start, Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, 
+PHINode *IndPhi, Value *MaxLen, Instruction *Index, +Value *Start, bool IncIdx, BasicBlock *FoundBB, +BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { +initializeAArch64LoopIdiomTransformLegacyPassPass( +*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { +return "Transform AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage ) const override { +AU.addRequired(); +AU.addRequired(); +AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager ) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, +LPPassManager ) { + + if (skipLoop(L)) +return false; + + auto *DT = ().getDomTree(); + auto *LI = ().getLoopInfo(); + auto = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, , >getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char
[clang-tools-extra] [clang] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,816 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This pass implements a pass that recognizes certain loop idioms and +// transforms them into more optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win. +// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +// while (++i != n) { +//if (a[i] != b[i]) +// break; +// } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. However, even with these checks it is still profitable to do the +// transformation. +// +//===--===// +// +// TODO List: +// +// * When optimizing for code size we may want to avoid some transformations. +// * We can also support the inverse case where we scan for a matching element. 
+// +//===--===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt +DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt DisableByteCmp( +"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), +cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +static cl::opt VerifyLoops( +"aarch64-lit-verify", cl::Hidden, cl::init(false), +cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl ); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA, +GetElementPtrInst *GEPB, Instruction *Index, +Value *Start, Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, 
+PHINode *IndPhi, Value *MaxLen, Instruction *Index, +Value *Start, bool IncIdx, BasicBlock *FoundBB, +BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { +initializeAArch64LoopIdiomTransformLegacyPassPass( +*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { +return "Transform AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage ) const override { +AU.addRequired(); +AU.addRequired(); +AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager ) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, +LPPassManager ) { + + if (skipLoop(L)) +return false; + + auto *DT = ().getDomTree(); + auto *LI = ().getLoopInfo(); + auto = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, , >getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char
[clang] [llvm] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
https://github.com/davemgreen edited https://github.com/llvm/llvm-project/pull/72273 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [ARM] arm_acle.h add Coprocessor Instrinsics (PR #75440)
davemgreen wrote: It looks like there is a downstream implementation of this that was never upstreamed. Perhaps someone can fish it out for you to show how it looked? It might be using the wrong predefined macro, but does have some tests. https://github.com/llvm/llvm-project/pull/75440 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [TargetParser] Define AEK_FCMA and AEK_JSCVT for tsv110 (PR #75516)
@@ -81,6 +81,15 @@ static bool DecodeAArch64Features(const Driver , StringRef text, else return false; +// +jsconv and +complxnum implies +neon and +fp-armv8 davemgreen wrote: I believe this ideally would not be in the driver, as it does not apply to target attributes, only -march options. https://github.com/llvm/llvm-project/pull/75516 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang-tools-extra] [LoopVectorize] Enable hoisting of runtime checks by default (PR #71538)
https://github.com/davemgreen approved this pull request. With that fixed, and from the perf Ive seen, this LGTM. Thanks https://github.com/llvm/llvm-project/pull/71538 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[openmp] [clang-tools-extra] [libcxx] [mlir] [clang] [compiler-rt] [lldb] [llvm] [libcxxabi] [flang] [MachineCopyPropagation] When the source of PreviousCopy is undef, we cannot replace sub register (
davemgreen wrote: Thanks. It sounds like there are not a lot of code changes, which is a good sign. I didn't expect the debug problems though. I'll try and take a look at the patch. Perhaps you are right that we need a new method for the debug info to use. https://github.com/llvm/llvm-project/pull/74682 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,839 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This pass implements a pass that recognizes certain loop idioms and +// transforms them into more optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win. +// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +// while (++i != n) { +//if (a[i] != b[i]) +// break; +// } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. However, even with these checks it is still profitable to do the +// transformation. +// +//===--===// +// +// TODO List: +// +// * When optimizing for code size we may want to avoid some transformations. +// * We can also support the inverse case where we scan for a matching element. 
+// +//===--===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt +DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt DisableByteCmp( +"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), +cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +static cl::opt VerifyLoops( +"aarch64-lit-verify", cl::Hidden, cl::init(false), +cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl ); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA, +GetElementPtrInst *GEPB, Value *Start, +Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, +Value *MaxLen, Value 
*Index, Value *Start, +bool IncIdx, BasicBlock *FoundBB, +BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { +initializeAArch64LoopIdiomTransformLegacyPassPass( +*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { +return "Transform AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage ) const override { +AU.addRequired(); +AU.addRequired(); +AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager ) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, +LPPassManager ) { + + if (skipLoop(L)) +return false; + + auto *DT = ().getDomTree(); + auto *LI = ().getLoopInfo(); + auto = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, , >getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char AArch64LoopIdiomTransformLegacyPass::ID = 0;
[clang-tools-extra] [clang] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)
@@ -0,0 +1,839 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +// This pass implements a pass that recognizes certain loop idioms and +// transforms them into more optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win. +// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +// while (++i != n) { +//if (a[i] != b[i]) +// break; +// } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. However, even with these checks it is still profitable to do the +// transformation. +// +//===--===// +// +// TODO List: +// +// * When optimizing for code size we may want to avoid some transformations. +// * We can also support the inverse case where we scan for a matching element. 
+// +//===--===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt +DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt DisableByteCmp( +"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), +cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +static cl::opt VerifyLoops( +"aarch64-lit-verify", cl::Hidden, cl::init(false), +cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl ); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA, +GetElementPtrInst *GEPB, Value *Start, +Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, +Value *MaxLen, Value 
*Index, Value *Start, +bool IncIdx, BasicBlock *FoundBB, +BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { +initializeAArch64LoopIdiomTransformLegacyPassPass( +*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { +return "Transform AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage ) const override { +AU.addRequired(); +AU.addRequired(); +AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager ) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, +LPPassManager ) { + + if (skipLoop(L)) +return false; + + auto *DT = ().getDomTree(); + auto *LI = ().getLoopInfo(); + auto = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, , >getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char AArch64LoopIdiomTransformLegacyPass::ID = 0;