[clang] [clang][AArch64] Add getHostCPUFeatures to query for enabled features in cpu info (PR #97749)

2024-07-24 Thread David Green via cfe-commits

davemgreen wrote:

It is that bit of code, yeah. I don't know of a way to reproduce this without 
logging into different machines with different sets of options and trying it.

If we had a way to test/mock that various /proc/cpuinfo files gave us the 
correct results, that would be helpful in giving us more confidence it was 
working as intended.

https://github.com/llvm/llvm-project/pull/97749
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Implement NEON vamin/vamax intrinsics (PR #99041)

2024-07-16 Thread David Green via cfe-commits

https://github.com/davemgreen commented:

Did you consider emitting `llvm.fmin(llvm.fabs(x), llvm.fabs(y))`?

https://github.com/llvm/llvm-project/pull/99041
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AArch64] Add getHostCPUFeatures to query for enabled features in cpu… (PR #97749)

2024-07-05 Thread David Green via cfe-commits

davemgreen wrote:

Hi this sounds like a good idea to me. Note that the implementation of 
getHostCPUFeatures isn't amazing for AArch64 at the moment, there was an 
attempt to fix it up in #95694 (but that has gone a bit quiet). One point we 
noticed is that it could end up turning "aes+sha2" into "crypto" and "crypto" 
back into "sha2+aes+sha3+sm4", as it uses the old meaning of "crypto".

https://github.com/llvm/llvm-project/pull/97749
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64][NEON] Add intrinsics for LUTI (PR #96883)

2024-06-28 Thread David Green via cfe-commits

https://github.com/davemgreen commented:

Thanks this looks great. I've not checked the C / ACLE intrinsics though - I 
will defer to @CarolineConcatto and @momchil-velikov for those parts if that is 
OK.

https://github.com/llvm/llvm-project/pull/96883
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64][NEON] Add intrinsics for LUTI (PR #96883)

2024-06-27 Thread David Green via cfe-commits


@@ -6420,6 +6420,76 @@ def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd),
 let Predicates = [HasLUT] in {
   defm LUT2 : BaseSIMDTableLookupIndexed2<"luti2">;
   defm LUT4 : BaseSIMDTableLookupIndexed4<"luti4">;
+
+  def : Pat<(v16i8 (int_aarch64_neon_vluti2_lane (v8i8 V64:$Rn), 
+  (v8i8 V64:$Rm), (i32 VectorIndexS32b_timm:$idx))),
+  (LUT2_B (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub),  VectorIndexS32b_timm:$idx)>;
+  def : Pat<(v16i8 (int_aarch64_neon_vluti2_lane (v8i8 V64:$Rn), 
+  (v16i8 V128:$Rm), (i32 VectorIndexS32b_timm:$idx))),
+  (LUT2_B (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), V128:$Rm,  
VectorIndexS32b_timm:$idx)>;
+  def : Pat<(v16i8 (int_aarch64_neon_vluti2_lane (v16i8 V128:$Rn), 
+  (v8i8 V64:$Rm), (i32 VectorIndexS32b_timm:$idx))),
+  (LUT2_B V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub),  
VectorIndexS32b_timm:$idx)>;
+  def : Pat<(v16i8 (int_aarch64_neon_vluti2_lane (v16i8 V128:$Rn), 
+  (v16i8 V128:$Rm), (i32 VectorIndexS32b_timm:$idx))),
+  (LUT2_B V128:$Rn, V128:$Rm,  VectorIndexS32b_timm:$idx)>;
+  def : Pat<(v8i16 (int_aarch64_neon_vluti2_lane (v4i16 V64:$Rn), 
+(v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))),
+(LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub),  VectorIndexH32b_timm:$idx)>;
+  def : Pat<(v8f16 (int_aarch64_neon_vluti2_lane (v4f16 V64:$Rn), 
+(v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))),
+(LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub),  VectorIndexH32b_timm:$idx)>;
+  def : Pat<(v8i16 (int_aarch64_neon_vluti2_lane (v4i16 V64:$Rn), 
+(v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))),
+(LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), V128:$Rm,  
VectorIndexH32b_timm:$idx)>;
+  def : Pat<(v8f16 (int_aarch64_neon_vluti2_lane (v4f16 V64:$Rn), 
+(v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))),
+(LUT2_H (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), V128:$Rm,  
VectorIndexH32b_timm:$idx)>;
+  def : Pat<(v8i16 (int_aarch64_neon_vluti2_lane (v8i16 V128:$Rn), 
+(v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))),
+(LUT2_H V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub),  
VectorIndexH32b_timm:$idx)>;
+  def : Pat<(v8f16 (int_aarch64_neon_vluti2_lane (v8f16 V128:$Rn), 
+(v8i8 V64:$Rm), (i32 VectorIndexH32b_timm:$idx))),
+(LUT2_H V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub),  
VectorIndexH32b_timm:$idx)>;
+  def : Pat<(v8i16 (int_aarch64_neon_vluti2_lane (v8i16 VecListOne8h:$Rn), 
+(v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))),
+(LUT2_H VecListOne8h:$Rn, V128:$Rm,  VectorIndexH32b_timm:$idx)>;
+  def : Pat<(v8f16 (int_aarch64_neon_vluti2_lane (v8f16 VecListOne8h:$Rn), 
+(v16i8 V128:$Rm), (i32 VectorIndexH32b_timm:$idx))),
+(LUT2_H VecListOne8h:$Rn, V128:$Rm,  VectorIndexH32b_timm:$idx)>;
+
+  def : Pat<(v16i8 (int_aarch64_neon_vluti4q_laneq (v16i8 VecListOne16b:$Rn), 
+(v16i8 V128:$Rm), (i32 VectorIndexD32b_timm:$idx))),
+(LUT4_B VecListOne16b:$Rn, V128:$Rm,  VectorIndexD32b_timm:$idx)>;
+
+  def : Pat<(v8i16 (int_aarch64_neon_vluti4q_laneq_x2 (v8i16 
VecListOne8h:$Rn1), 
+(v8i16 VecListOne8h:$Rn2), (v16i8 V128:$Rm), 
+(i32 VectorIndexS32b_timm:$idx))),
+(LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, 
VecListOne8h:$Rn2, qsub1), V128:$Rm,  VectorIndexS32b_timm:$idx)>;
+  def : Pat<(v8f16 (int_aarch64_neon_vluti4q_laneq_x2 (v8f16 
VecListOne8h:$Rn1), 
+(v8f16 VecListOne8h:$Rn2), (v16i8 V128:$Rm), 
+(i32 VectorIndexS32b_timm:$idx))),
+(LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, 
VecListOne8h:$Rn2, qsub1), V128:$Rm,  VectorIndexS32b_timm:$idx)>;
+}
+
+let Predicates = [HasLUT, HasBF16] in {

davemgreen wrote:

I think you can make this HasLUT only without needing HasBF16, like the fp16 
versions above. Unless that doesn't work? It should only really be dependent on 
the size of the register (and HasLUT, obviously).

You might be able to make a multiclass too for the Pats with a parameter for 
the type, if they could share a lot of the same code.

https://github.com/llvm/llvm-project/pull/96883
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64][NEON] Add intrinsics for LUTI (PR #96883)

2024-06-27 Thread David Green via cfe-commits


@@ -2096,3 +2096,19 @@ let ArchGuard = "defined(__aarch64__) || 
defined(__arm64ec__)", TargetGuard = "r
   def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl">;
   def VSTL1_LANE  : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">;
 }
+
+//Lookup table read with 2-bit/4-bit indices

davemgreen wrote:

// Lookup

https://github.com/llvm/llvm-project/pull/96883
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [Clang] Bring initFeatureMap back to AArch64TargetInfo. (PR #96832)

2024-06-27 Thread David Green via cfe-commits

davemgreen wrote:

Could you explain more about what broke? Are you using target(..) attributes?

https://github.com/llvm/llvm-project/pull/96832
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] b635d69 - [NFC] Fix laod -> load typos. NFC

2024-06-21 Thread David Green via cfe-commits

Author: David Green
Date: 2024-06-21T09:26:44+01:00
New Revision: b635d690ed1e3fbebab9dee1b157fa380d3e9eba

URL: 
https://github.com/llvm/llvm-project/commit/b635d690ed1e3fbebab9dee1b157fa380d3e9eba
DIFF: 
https://github.com/llvm/llvm-project/commit/b635d690ed1e3fbebab9dee1b157fa380d3e9eba.diff

LOG: [NFC] Fix laod -> load typos. NFC

Added: 


Modified: 
clang/lib/Sema/SemaDeclCXX.cpp
clang/test/SemaHLSL/ScalarOverloadResolution.hlsl
clang/test/SemaHLSL/VectorElementOverloadResolution.hlsl
llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll
llvm/test/CodeGen/X86/load-partial-dot-product.ll
llvm/test/tools/yaml2obj/COFF/load-config.yaml

Removed: 




diff  --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index d38700d56e4ff..c1189e6935dc9 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -16111,7 +16111,7 @@ ExprResult Sema::BuildCXXConstructExpr(
 CXXConstructionKind ConstructKind, SourceRange ParenRange) {
   if (auto *Shadow = dyn_cast(FoundDecl)) {
 Constructor = findInheritingConstructor(ConstructLoc, Constructor, Shadow);
-// The only way to get here is if we did overlaod resolution to find the
+// The only way to get here is if we did overload resolution to find the
 // shadow decl, so we don't need to worry about re-checking the trailing
 // requires clause.
 if (DiagnoseUseOfOverloadedDecl(Constructor, ConstructLoc))

diff  --git a/clang/test/SemaHLSL/ScalarOverloadResolution.hlsl 
b/clang/test/SemaHLSL/ScalarOverloadResolution.hlsl
index 41702ef175320..d1a47af228e24 100644
--- a/clang/test/SemaHLSL/ScalarOverloadResolution.hlsl
+++ b/clang/test/SemaHLSL/ScalarOverloadResolution.hlsl
@@ -72,7 +72,7 @@ void Case1(half H, float F, double D) {
   HalfFloatDouble(D);
 }
 
-// Case 2: A function declared with double and float overlaods.
+// Case 2: A function declared with double and float overloads.
 //   (a) When called with half, it will resolve to float because float is lower
 //   ranked than double.
 //   (b) When called with float it will resolve to float because float is an

diff  --git a/clang/test/SemaHLSL/VectorElementOverloadResolution.hlsl 
b/clang/test/SemaHLSL/VectorElementOverloadResolution.hlsl
index 12575084ead2b..bbf8d3b5e102c 100644
--- a/clang/test/SemaHLSL/VectorElementOverloadResolution.hlsl
+++ b/clang/test/SemaHLSL/VectorElementOverloadResolution.hlsl
@@ -71,7 +71,7 @@ void Case1(half2 H, float2 F, double2 D) {
   HalfFloatDouble(D);
 }
 
-// Case 2: A function declared with double and float overlaods.
+// Case 2: A function declared with double and float overloads.
 //   (a) When called with half, it will resolve to float because float is lower
 //   ranked than double.
 //   (b) When called with float it will resolve to float because float is an

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 46d44704af5a7..cfe9f33efc91b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -100,7 +100,7 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner {
   bool matchRemoveFcanonicalize(MachineInstr , Register ) const;
 
   // Combine unsigned buffer load and signed extension instructions to generate
-  // signed buffer laod instructions.
+  // signed buffer load instructions.
   bool matchCombineSignExtendInReg(
   MachineInstr , std::pair ) const;
   void applyCombineSignExtendInReg(

diff  --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp 
b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 471c7ca4d7356..4e515e05c842a 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -57,7 +57,7 @@
 //
 // base = gep a, 0, x, y
 // load base
-// laod base + 1  * sizeof(float)
+// load base + 1  * sizeof(float)
 // load base + 32 * sizeof(float)
 // load base + 33 * sizeof(float)
 //

diff  --git a/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll 
b/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll
index 03b41db2291a2..6a95859c7692d 100644
--- a/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll
+++ b/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll
@@ -22,7 +22,7 @@ define double @ld_double(ptr %p) speculative_load_hardening {
 entry:
   %0 = load double, ptr %p, align 8
   ret double %0
-; Checking that the address laoded from is masked for a floating point load.
+; Checking that the address loaded from is masked for a floating point load.
 ; CHECK-LABEL: ld_double
 ; CHECK:  cmp sp, #0
 ; CHECK-NEXT: csetm x16, ne
@@ -43,7 +43,7 @@ entry:
   %iszero = icmp eq 

[clang] [llvm] [AArch64] Add ability to list extensions enabled for a target (PR #95805)

2024-06-18 Thread David Green via cfe-commits


@@ -19,3 +19,19 @@
 // RUN: %clang --target=arm64 -mlittle-endian -march=armv8.1a -### -c %s 2>&1 
| FileCheck -check-prefix=ARM64-GENERICV81A %s
 // RUN: %clang --target=arm64 -mlittle-endian -march=armv8.1-a -### -c %s 2>&1 
| FileCheck -check-prefix=ARM64-GENERICV81A %s
 // ARM64-GENERICV81A: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" 
"generic"{{.*}} "-target-feature" "+v8.1a"{{.*}} "-target-feature" "+neon"
+
+// = Architecture extensions =
+
+// RUN: %clang -target aarch64 -march=armv8.1-a --print-enabled-extensions 
2>&1 | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s
+// ARCH-EXTENSION: FEAT_ETE
+// ARCH-EXTENSION: FEAT_LOR
+// ARCH-EXTENSION: FEAT_TRBE
+// ARCH-EXTENSION: FEAT_VHE
+// ARCH-EXTENSION: FEAT_PAN
+// ARCH-EXTENSION: FEAT_CRC32
+// FIXME: FEAT_FP is optional from v8.0a
+// ARCH-EXTENSION: FEAT_FP
+// ARCH-EXTENSION: FEAT_LSE
+// ARCH-EXTENSION: FEAT_RDM
+// FIXME: FEAT_AdvSIMD is optional from v8.0a

davemgreen wrote:

There is a difference between what the Arm architecture technically describes 
as optional and what the platform enables by default (i.e. what we enable by 
default for the compiler in Armv8-a). We don't want everyone in the world to 
lose out on Neon vectorization and require soft-fp just because it is 
technically optional in the architecture. The same idea has been applied to 
SVE2 in Armv9 (although that one has become more debatable). Some of the other 
optional extensions seem like they might be system-register extensions, I 
imagine that might be why they are enabled. There might be a better way to 
handle that.

So I'm not sure if there is anything that needs to be "fixed" here, and think I 
would recommend removing them (especially from fp/asimd, the others are more 
debatable and could stay if you think they are useful).

https://github.com/llvm/llvm-project/pull/95805
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Add ability to list extensions enabled for a target (PR #95805)

2024-06-18 Thread David Green via cfe-commits


@@ -140,89 +152,480 @@ def FeatureAES : Extension<
 // compatibility, and now imply features SHA2 and AES, which was the
 // "traditional" meaning of Crypto.
 let FMVDependencies = "+aes,+sha2" in
-def FeatureCrypto : Extension<"crypto", "Crypto",
+def FeatureCrypto : ExtensionWithMArch<"crypto", "Crypto", "FEAT_Crypto",
   "Enable cryptographic instructions", [FeatureNEON, FeatureSHA2, FeatureAES]>;
 
-def FeatureCRC : Extension<"crc", "CRC",
-  "Enable ARMv8 CRC-32 checksum instructions (FEAT_CRC32)", [],
+def FeatureCRC : ExtensionWithMArch<"crc", "CRC", "FEAT_CRC32",
+  "Enable ARMv8 CRC-32 checksum instructions", [],
   "FEAT_CRC", "+crc", 110>;
 
-def FeatureRAS : Extension<"ras", "RAS",
-  "Enable ARMv8 Reliability, Availability and Serviceability Extensions 
(FEAT_RAS, FEAT_RASv1p1)">;
-
-def FeatureRASv2 : Extension<"rasv2", "RASv2",
-  "Enable ARMv8.9-A Reliability, Availability and Serviceability Extensions 
(FEAT_RASv2)",
-  [FeatureRAS]>;
-
-def FeatureLSE : Extension<"lse", "LSE",
-  "Enable ARMv8.1 Large System Extension (LSE) atomic instructions 
(FEAT_LSE)", [],
-  "FEAT_LSE", "+lse", 80>;
+// This SubtargetFeature is special. It controls only whether codegen will turn
+// `llvm.readcyclecounter()` into an access to a PMUv3 System Register. The
+// `FEAT_PMUv3*` system registers are always available for 
assembly/disassembly.
+let MArchName = "pmuv3" in
+def FeaturePerfMon : ExtensionWithMArch<"perfmon", "PerfMon", "FEAT_PMUv3",
+  "Enable Code Generation for ARMv8 PMUv3 Performance Monitors extension">;
 
-def FeatureLSE2 : SubtargetFeature<"lse2", "HasLSE2", "true",
-  "Enable ARMv8.4 Large System Extension 2 (LSE2) atomicity rules 
(FEAT_LSE2)">;
+def FeatureSpecRestrict : Extension<"specrestrict", "SpecRestrict", 
"FEAT_CSV2_2",
+  "Enable architectural speculation restriction">;
 
-def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", 
"OutlineAtomics", "true",
-  "Enable out of line atomics to support LSE instructions">;
+//===--===//
+//  Armv8.1 Architecture Extensions
+//===--===//
 
-def FeatureFMV : SubtargetFeature<"fmv", "HasFMV", "true",
-  "Enable Function Multi Versioning support.">;
+def FeatureLSE : ExtensionWithMArch<"lse", "LSE", "FEAT_LSE",
+  "Enable ARMv8.1 Large System Extension (LSE) atomic instructions", [],
+  "FEAT_LSE", "+lse", 80>;
 
 let MArchAlias = "rdma" in
-def FeatureRDM : Extension<"rdm", "RDM",
-  "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions 
(FEAT_RDM)",
+def FeatureRDM : ExtensionWithMArch<"rdm", "RDM", "FEAT_RDM",
+  "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions",
   [FeatureNEON],
   "FEAT_RDM", "+rdm,+fp-armv8,+neon", 108>;
 
-def FeaturePAN : SubtargetFeature<
-"pan", "HasPAN", "true",
-"Enables ARM v8.1 Privileged Access-Never extension (FEAT_PAN)">;
+def FeaturePAN : Extension<"pan", "PAN", "FEAT_PAN",
+  "Enables ARM v8.1 Privileged Access-Never extension">;
 
-def FeatureLOR : SubtargetFeature<
-"lor", "HasLOR", "true",
-"Enables ARM v8.1 Limited Ordering Regions extension (FEAT_LOR)">;
+def FeatureLOR : Extension<"lor", "LOR", "FEAT_LOR",
+  "Enables ARM v8.1 Limited Ordering Regions extension">;
 
 def FeatureCONTEXTIDREL2 : SubtargetFeature<"CONTEXTIDREL2", 
"HasCONTEXTIDREL2",
 "true", "Enable RW operand CONTEXTIDR_EL2" >;
 
-def FeatureVH : SubtargetFeature<"vh", "HasVH", "true",
-"Enables ARM v8.1 Virtual Host extension (FEAT_VHE)", 
[FeatureCONTEXTIDREL2] >;
+def FeatureVH : Extension<"vh", "VH", "FEAT_VHE",
+  "Enables ARM v8.1 Virtual Host extension", [FeatureCONTEXTIDREL2] >;
 
-// This SubtargetFeature is special. It controls only whether codegen will turn
-// `llvm.readcyclecounter()` into an access to a PMUv3 System Register. The
-// `FEAT_PMUv3*` system registers are always available for 
assembly/disassembly.
-let MArchName = "pmuv3" in
-def FeaturePerfMon : Extension<"perfmon", "PerfMon",
-  "Enable Code Generation for ARMv8 PMUv3 Performance Monitors extension 
(FEAT_PMUv3)">;
+//===--===//
+//  Armv8.2 Architecture Extensions
+//===--===//
+
+def FeatureSM4 : ExtensionWithMArch<"sm4", "SM4", "FEAT_SM4, FEAT_SM3",
+  "Enable SM3 and SM4 support", [FeatureNEON],
+  "FEAT_SM4", "+sm4,+fp-armv8,+neon", 106>;
+
+def FeatureSHA3 : ExtensionWithMArch<"sha3", "SHA3", "FEAT_SHA3, FEAT_SHA512",
+  "Enable SHA512 and SHA3 support", [FeatureNEON, FeatureSHA2],
+  "FEAT_SHA3", "+sha3,+sha2,+fp-armv8,+neon", 140>;
+
+def FeatureRAS : ExtensionWithMArch<"ras", "RAS", "FEAT_RAS, FEAT_RASv1p1",
+  "Enable ARMv8 Reliability, Availability and Serviceability Extensions">;
 
 let ArchExtKindSpelling = "AEK_FP16", MArchName = "fp16" in
-def FeatureFullFP16 : Extension<"fullfp16", 

[clang] [llvm] [AArch64] Add ability to list extensions enabled for a target (PR #95805)

2024-06-18 Thread David Green via cfe-commits


@@ -19,3 +19,19 @@
 // RUN: %clang --target=arm64 -mlittle-endian -march=armv8.1a -### -c %s 2>&1 
| FileCheck -check-prefix=ARM64-GENERICV81A %s
 // RUN: %clang --target=arm64 -mlittle-endian -march=armv8.1-a -### -c %s 2>&1 
| FileCheck -check-prefix=ARM64-GENERICV81A %s
 // ARM64-GENERICV81A: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" 
"generic"{{.*}} "-target-feature" "+v8.1a"{{.*}} "-target-feature" "+neon"
+
+// = Architecture extensions =
+
+// RUN: %clang -target aarch64 -march=armv8.1-a --print-enabled-extensions 
2>&1 | FileCheck -check-prefix=ARCH-EXTENSION --implicit-check-not FEAT_ %s
+// ARCH-EXTENSION: FEAT_ETE
+// ARCH-EXTENSION: FEAT_LOR
+// ARCH-EXTENSION: FEAT_TRBE
+// ARCH-EXTENSION: FEAT_VHE
+// ARCH-EXTENSION: FEAT_PAN
+// ARCH-EXTENSION: FEAT_CRC32
+// FIXME: FEAT_FP is optional from v8.0a
+// ARCH-EXTENSION: FEAT_FP
+// ARCH-EXTENSION: FEAT_LSE
+// ARCH-EXTENSION: FEAT_RDM
+// FIXME: FEAT_AdvSIMD is optional from v8.0a

davemgreen wrote:

Why do these have FIXMEs?

https://github.com/llvm/llvm-project/pull/95805
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang] Reland Add tanf16 builtin and support for tan constrained intrinsic (PR #94559)

2024-06-13 Thread David Green via cfe-commits

davemgreen wrote:

I believe they were added so long ago that the default Expanding wasn't done at 
the time. @efriedma-quic do you have more of an idea than that?

https://github.com/llvm/llvm-project/pull/94559
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang] Reland Add tanf16 builtin and support for tan constrained intrinsic (PR #94559)

2024-06-13 Thread David Green via cfe-commits

davemgreen wrote:

Usually when new ISD nodes are added they are expanded for all types, so that 
every backend will get at least working code even if it is not optimal. The 
targets can then come along and override the defaults for the types they are 
interested in, to get better results.

For tan I would expect most vector types would want to scalarize, so marking 
them as expand would make sense. If more types than are necessary get marked as 
Expand that shouldn't be an issue, it looks like we already do that for a 
number of other nodes.

https://github.com/llvm/llvm-project/pull/94559
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang] Reland Add tanf16 builtin and support for tan constrained intrinsic (PR #94559)

2024-06-12 Thread David Green via cfe-commits

davemgreen wrote:

If you remove tan from isTriviallyVectorizable it should prevent vectorization 
in the short term.

It might be better to default FTAN to expand in 
https://github.com/llvm/llvm-project/blob/64c9a1e1266ec7bc4c4896b2df116fa12dbacf15/llvm/lib/CodeGen/TargetLoweringBase.cpp#L960,
 which seems to only be done for f32/f64/f128 at the moment.

https://github.com/llvm/llvm-project/pull/94559
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Add support for Cortex-A725 and Cortex-X925 (PR #95214)

2024-06-12 Thread David Green via cfe-commits

https://github.com/davemgreen approved this pull request.

Thanks!

https://github.com/llvm/llvm-project/pull/95214
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Add support for Cortex-A725 and Cortex-X925 (PR #95214)

2024-06-12 Thread David Green via cfe-commits


@@ -723,6 +746,9 @@ def ProcessorFeatures {
  FeaturePerfMon, FeatureETE, FeatureTRBE,
  FeatureSPE, FeatureMTE, FeatureSVE2BitPerm,
  FeatureFP16FML, FeatureSPE_EEF];
+  list X925 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML,

davemgreen wrote:

Should this include FeatureSVE2BitPerm? It is included in the AEK_ list, and in 
the X4 features.

Same for the A725 features.

https://github.com/llvm/llvm-project/pull/95214
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [ARM] Add support for Cortex-R52+ (PR #94633)

2024-06-07 Thread David Green via cfe-commits


@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-r52 | FileCheck %s 
--check-prefix=CHECK --check-prefix=USEAA
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-r52plus | FileCheck %s 
--check-prefix=CHECK --check-prefix=USEAA

davemgreen wrote:

Adding these run lines for some of these tests may not be necessary for both 
r52 and r52+. I think I would drop it from the scheduling/useaa tests if it 
just uses the same scheduling model.

https://github.com/llvm/llvm-project/pull/94633
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [ARM] Add support for Cortex-R52+ (PR #94633)

2024-06-07 Thread David Green via cfe-commits


@@ -90,6 +90,8 @@ def ProcR7  : SubtargetFeature<"r7", "ARMProcFamily", 
"CortexR7",
"Cortex-R7 ARM processors", []>;
 def ProcR52 : SubtargetFeature<"r52", "ARMProcFamily", "CortexR52",
"Cortex-R52 ARM processors", []>;
+def ProcR52plus  : SubtargetFeature<"r52plus", "ARMProcFamily", 
"CortexR52plus",

davemgreen wrote:

This could maybe just use ProcR52

https://github.com/llvm/llvm-project/pull/94633
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [ARM] Add support for Cortex-R52+ (PR #94633)

2024-06-07 Thread David Green via cfe-commits

https://github.com/davemgreen approved this pull request.

LGTM, thanks

https://github.com/llvm/llvm-project/pull/94633
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [ARM] Add support for Cortex-R52+ (PR #94633)

2024-06-07 Thread David Green via cfe-commits

https://github.com/davemgreen edited 
https://github.com/llvm/llvm-project/pull/94633
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Decouple feature dependency expansion. (PR #94279)

2024-06-06 Thread David Green via cfe-commits

davemgreen wrote:

Yeah I had just seen that error message before you edited your comment. There 
are some examples of neon I found in a quick search, which were presumably 
added for AArch32:
https://github.com/aaru-dps/Aaru.Checksums.Native/blob/bd5051ce181b225a7662bfb764ebcc5cbe7542b2/simd.h#L112
https://github.com/mooch443/commons/blob/30dc797430968831959d77d7f2503cec3518a13a/common/misc/PVBlob.cpp#L385
I'm not sure if that is reason enough to still support it.

But like I said, if I try this patch locally then `target("neon")` seems to be 
accepted fine (no errors). It is the same for other features like 
`target("fullfp16")`, which seem to enable `+fullfp16` in the backend. 
`"noneon"` is no longer accepted, which might be fine as I don't believe 
negative features are commonly used. (For aarch64 from a baseline of armv8 they 
are mostly additive. They are likely to become more common going forward but 
new users can use the "right" attribute names).

https://github.com/llvm/llvm-project/pull/94279
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Decouple feature dependency expansion. (PR #94279)

2024-06-05 Thread David Green via cfe-commits

https://github.com/davemgreen commented:

> LGTM. The main change to point out is that the target attribute will no 
> longer accept internal feature names. I don't think it should ever have done 
> so, but we should get input from others. @davemgreen? There are references to 
> existing code in [D137617](https://reviews.llvm.org/D137617) but no details. 
> If this has been used for e.g. intrinsics definitions, I am surprised there 
> are not more test failures.

Hi - It was intentional to support older versions of clang. The target 
attributes already had users before I fixed them to support the same formats as 
GCC for AArch64, and was aiming at not breaking the existing code. IIRC There 
are quite a few uses of things like `target("crypto")` out there (without the + 
that gcc wants to include).

I'm not sure if that extends to internal feature names a lot. Not supporting 
"neon" as a name would seem like a mistake if it was removed, but I don't 
believe this patch does that. If it only affects negative features those have 
never worked particularly well.

https://github.com/llvm/llvm-project/pull/94279
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Add support for Cortex-A725 and Cortex-X925 (PR #93978)

2024-05-31 Thread David Green via cfe-commits


@@ -863,6 +889,8 @@ def : ProcessorModel<"cortex-a720", NeoverseN2Model, 
ProcessorFeatures.A720,
  [TuneA720]>;
 def : ProcessorModel<"cortex-a720ae", NeoverseN2Model, 
ProcessorFeatures.A720AE,
  [TuneA720AE]>;
+def : ProcessorModel<"cortex-a725", CortexA57Model, ProcessorFeatures.A725,

davemgreen wrote:

NeoverseN2Model

https://github.com/llvm/llvm-project/pull/93978
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Add support for Cortex-A725 and Cortex-X925 (PR #93978)

2024-05-31 Thread David Green via cfe-commits


@@ -877,6 +905,8 @@ def : ProcessorModel<"cortex-x3", NeoverseN2Model, 
ProcessorFeatures.X3,
  [TuneX3]>;
 def : ProcessorModel<"cortex-x4", NeoverseN2Model, ProcessorFeatures.X4,
  [TuneX4]>;
+def : ProcessorModel<"cortex-x925", NeoverseN2Model, ProcessorFeatures.X925,

davemgreen wrote:

Maybe NeoverseV2Model, as the best fit even if it is quite different? (Some of 
the older cortex-x cores look like they could be using a newer model too, but 
that is a different issue).

https://github.com/llvm/llvm-project/pull/93978
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [flang] [llvm] [mlir] [polly] [test]: fix filecheck annotation typos (PR #91854)

2024-05-13 Thread David Green via cfe-commits


@@ -189,15 +189,15 @@ define i32 @shr(i32 %a, i32 %b) {
 
 
 define i1 @outer_and1(i1 %a) {
-; check-label: @outer_and1(
-; check-not: call i1 @and1
+; check-LABEL: @outer_and1(

davemgreen wrote:

I've regenerated the check lines in 220756f1f92b335cbafdff67c570d096a6925d87.

https://github.com/llvm/llvm-project/pull/91854
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [flang] [llvm] [mlir] [polly] [test]: fix filecheck annotation typos (PR #91854)

2024-05-13 Thread David Green via cfe-commits


@@ -121,7 +121,7 @@ define i32 @test_orr_extract_from_mul_1(i32 %x, i32 %y) {
 ; CHECK-THUMB-NEXT:orrs r0, r1
 ; CHECK-THUMB-NEXT:bx lr
 entry:
-; CHECk-THUMB: orrs r0, r1

davemgreen wrote:

I believe we can delete this line, it was left in from the old checks.

https://github.com/llvm/llvm-project/pull/91854
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [flang] [llvm] [mlir] [polly] [test]: fix filecheck annotation typos (PR #91854)

2024-05-13 Thread David Green via cfe-commits


@@ -189,15 +189,15 @@ define i32 @shr(i32 %a, i32 %b) {
 
 
 define i1 @outer_and1(i1 %a) {
-; check-label: @outer_and1(
-; check-not: call i1 @and1
+; check-LABEL: @outer_and1(

davemgreen wrote:

Should all these be "CHECK"?

https://github.com/llvm/llvm-project/pull/91854
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [flang] [llvm] [mlir] [polly] [test]: fix filecheck annotation typos (PR #91854)

2024-05-13 Thread David Green via cfe-commits


@@ -217,42 +217,42 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr 
%src) {
 }
 
 define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src) {
-; check-label: volatile_load_v3i8_to_4xi32:
+; check-LABEL: volatile_load_v3i8_to_4xi32:

davemgreen wrote:

I think we can delete these. The real check lines are below.

https://github.com/llvm/llvm-project/pull/91854
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [flang] [llvm] [mlir] [polly] [test]: fix filecheck annotation typos (PR #91854)

2024-05-13 Thread David Green via cfe-commits


@@ -22,7 +22,7 @@ define signext i8 @test1(i32 %A) {
 ; CHECK-V7:   @ %bb.0:
 ; CHECK-V7-NEXT:sbfx r0, r0, #8, #8
 ; CHECK-V7-NEXT:bx lr
-; CHECk-V7: sbfx r0, r0, #8, #8
+; CHECK-V7: sbfx r0, r0, #8, #8

davemgreen wrote:

I believe we can delete this line, it was left in from the old checks.

https://github.com/llvm/llvm-project/pull/91854
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [flang] [llvm] [mlir] [polly] [test]: fix filecheck annotation typos (PR #91854)

2024-05-13 Thread David Green via cfe-commits

https://github.com/davemgreen commented:

The Arm/AArch64 tests look OK for the most part. I might be able to help with 
some of them if that is easier than trying to sort them all here.

https://github.com/llvm/llvm-project/pull/91854
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [flang] [llvm] [mlir] [polly] [test]: fix filecheck annotation typos (PR #91854)

2024-05-13 Thread David Green via cfe-commits

https://github.com/davemgreen edited 
https://github.com/llvm/llvm-project/pull/91854
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lldb] [llvm] [AArch64][TargetParser] autogen ArchExtKind enum - renaming (PR #90320)

2024-05-03 Thread David Green via cfe-commits

davemgreen wrote:

Rust 
(https://github.com/rust-lang/rust/blob/79734f1db8dbe322192dea32c0f6b80ab14c4c1d/compiler/rustc_codegen_llvm/src/llvm_util.rs#L229)
 and zig 
(https://github.com/ziglang/zig/blob/44db92d1ca90c9cfdfb29fe46f04ff8f11c80901/lib/std/Target/aarch64.zig#L43)
 are two examples of what I meant by external dependencies. They can adapt, 
especially if there are release notes, but there may be many more projects out 
there using the old names and it would be good if they kept working, if we can.

https://github.com/llvm/llvm-project/pull/90320
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lldb] [llvm] [AArch64][TargetParser] autogen ArchExtKind enum - renaming (PR #90320)

2024-05-03 Thread David Green via cfe-commits

davemgreen wrote:

@tmatheson-arm reached out and we had a bit of a conversation internally. I do 
think that there is too much going on in this one pr to be sensible to review, 
but from what I've looked at my main points I think are:
 - Some AEK names get renamed in ways I would not expect them to, like AEK_FP16 
being changed to AEK_FULLFP16. The command-line names or FEAT_ names should 
probably be what we are aiming for if we are changing them one-way or the other.
 -  Some of the backend features have been renamed in ways that could cause 
breakages for external users, like +complxnum becoming +fcma. The new name is 
better, but ideally the old name would continue to work (maybe by adding an 
alias/extra target feature dependent on the new name? That might not work with 
negative features).
 - Some of the changes do look mechanical and a good change (P1->p1, V2->v2 etc), 
and if they were separate they would be easy to get in and out of the way of 
the contentious stuff that remains.
 - The ones that have changed should have release notes.

https://github.com/llvm/llvm-project/pull/90320
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [ARM] Armv8-R does not require fp64 or neon. (PR #88287)

2024-05-02 Thread David Green via cfe-commits

davemgreen wrote:

> which change? Specifying -mcpu=cortex-r52 will behave the same way as before. 
> The original manual for the R52 provided for a no-neon sp-only variant, and 
> they exist in the wild, and this lets "architecture-generic" builds 
> automatically support both.

I just meant the break in functionality for existing users of -march=armv8r, 
and the drop in performance from less R52 tuning features/scheduling and the 
lack of Neon. I believe this is more correct, but users in the past could still 
get the same results by specifying `-march=armv8r -mfpu=fpv5-sp-d16`. I'm 
hoping someone else can come along and have an opinion on it one way or another.

https://github.com/llvm/llvm-project/pull/88287
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lldb] [llvm] [AArch64][TargetParser] autogen ArchExtKind enum - renaming (PR #90320)

2024-05-02 Thread David Green via cfe-commits

davemgreen wrote:

> This is already split into 18 commits, I don't think there's any reason to 
> split it into 18 PRs, since comments on one of them likely apply to the 
> others.

I disagree. This is going to be awkward for a lot of users of llvm and contains 
at least some details I don't agree with. I think it can cause a lot of 
subtle bugs and can end up wasting a lot of people's time. It should at least be 
awkward for us too.

https://github.com/llvm/llvm-project/pull/90320
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [lldb] [llvm] [AArch64][TargetParser] autogen ArchExtKind enum - renaming (PR #90320)

2024-05-02 Thread David Green via cfe-commits

davemgreen wrote:

IMO This patch looks far too large to sensibly review and needs to be split up. 
A lot of the changes don't really look like mechanical renamings, and it is 
hard to see how they would not break existing uses of llvm aarch64 target 
features?

https://github.com/llvm/llvm-project/pull/90320
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] Reapply "[Clang][Sema] Diagnose class member access expressions naming non-existent members of the current instantiation prior to instantiation in the absence of dependent

2024-05-02 Thread David Green via cfe-commits

davemgreen wrote:

Hi - We've run into a couple of places where this causes problems, one of them 
in running Spec as above. Is it possible to turn off this error for older 
codebases with a flag, turning it into a warning? It doesn't seem like a very 
useful error if it applies to code that is never used. 
https://godbolt.org/z/Pf83EW7vE

https://github.com/llvm/llvm-project/pull/90152
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Add support for Cortex-R82AE and improve Cortex-R82 (PR #90440)

2024-04-30 Thread David Green via cfe-commits

https://github.com/davemgreen approved this pull request.

Thanks. LGTM

https://github.com/llvm/llvm-project/pull/90440
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Add support for Cortex-R82AE and improve Cortex-R82 (PR #90440)

2024-04-30 Thread David Green via cfe-commits


@@ -632,7 +632,18 @@ inline constexpr CpuInfo CpuInfos[] = {
AArch64::AEK_PAUTH, AArch64::AEK_SVE2BITPERM,
AArch64::AEK_FLAGM, AArch64::AEK_PERFMON,
AArch64::AEK_PREDRES, AArch64::AEK_PROFILE})},
-{"cortex-r82", ARMV8R, AArch64::ExtensionBitset({AArch64::AEK_LSE})},
+{"cortex-r82", ARMV8R,
+ AArch64::ExtensionBitset({AArch64::AEK_CRC, AArch64::AEK_DOTPROD,

davemgreen wrote:

Can you check if ARMV8R enables AEK_LSE? There is a flip(), which might be 
turning it off?

https://github.com/llvm/llvm-project/pull/90440
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Add support for Cortex-R82AE and improve Cortex-R82 (PR #90440)

2024-04-29 Thread David Green via cfe-commits


@@ -143,6 +143,7 @@ void AArch64Subtarget::initializeProperties(bool 
HasMinSize) {
   case CortexA78AE:
   case CortexA78C:
   case CortexR82:
+  case CortexR82AE:

davemgreen wrote:

Can you move both of these into the block with CortexA55? It has always been in 
the wrong place compared to the other cpus around it and the alignments it sets.

https://github.com/llvm/llvm-project/pull/90440
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Add support for Cortex-R82AE and improve Cortex-R82 (PR #90440)

2024-04-29 Thread David Green via cfe-commits


@@ -632,7 +632,18 @@ inline constexpr CpuInfo CpuInfos[] = {
AArch64::AEK_PAUTH, AArch64::AEK_SVE2BITPERM,
AArch64::AEK_FLAGM, AArch64::AEK_PERFMON,
AArch64::AEK_PREDRES, AArch64::AEK_PROFILE})},
-{"cortex-r82", ARMV8R, AArch64::ExtensionBitset({AArch64::AEK_LSE})},
+{"cortex-r82", ARMV8R,
+ AArch64::ExtensionBitset({AArch64::AEK_CRC, AArch64::AEK_DOTPROD,

davemgreen wrote:

Are these implied by ARMV8R? Is there an advantage to having them specified 
separately too?

https://github.com/llvm/llvm-project/pull/90440
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Add support for Neoverse-N3, Neoverse-V3 and Neoverse-V3AE (PR #90143)

2024-04-26 Thread David Green via cfe-commits

https://github.com/davemgreen approved this pull request.

Thanks. I didn't check the enabled features but the tunings look good to me.

https://github.com/llvm/llvm-project/pull/90143
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Add support for Neoverse-N3, Neoverse-V3 and Neoverse-V3AE (PR #90143)

2024-04-26 Thread David Green via cfe-commits


@@ -447,6 +447,16 @@ def TuneNeoverseN2 : SubtargetFeature<"neoversen2", 
"ARMProcFamily", "NeoverseN2
   FeatureEnableSelectOptimize,
   FeaturePredictableSelectIsExpensive]>;
 
+def TuneNeoverseN3 : SubtargetFeature<"neoversen3", "ARMProcFamily", 
"NeoverseN3",
+  "Neoverse N3 ARM processors", [
+  FeatureFuseAES,
+  FeaturePostRAScheduler,
+  FeatureCmpBccFusion,

davemgreen wrote:

Hi - Should FeatureCmpBccFusion be enabled over what is in N2? The core might 
well be able to fuse them, but I don't think that is new and from what I 
remember the performance was sometimes worse with it enabled. (I think llvm's 
implementation of FeatureCmpBccFusion might be a bit more aggressive than is 
helpful).

https://github.com/llvm/llvm-project/pull/90143
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [ARM] Armv8-R does not require fp64 or neon. (PR #88287)

2024-04-24 Thread David Green via cfe-commits

davemgreen wrote:

I'm not sure I would make this change, mostly due to it potentially causing a 
break for existing users and making performance worse, but can see the 
reasoning. I am willing to defer to others if they have an opinion.

https://github.com/llvm/llvm-project/pull/88287
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [ARM] Armv8-R does not require fp64 or neon. (PR #88287)

2024-04-21 Thread David Green via cfe-commits

davemgreen wrote:

As far as I understand this will remove the tuning we do for cortex-r52 when 
using armv8r, which will mean a little less performance but the tuning features 
in the Arm backend are not handled as well as they could be.

Can you add a release note explaining what will change? Thanks.

https://github.com/llvm/llvm-project/pull/88287
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [ARM] Armv8-R does not require fp64 or neon. (PR #88287)

2024-04-18 Thread David Green via cfe-commits

davemgreen wrote:

Does this disable neon by default for cortex-r52? If so I don't think it should 
be doing that, it would be a break in the existing behaviour, and should at 
least be in the release notes.

The general rule for -mcpu options is that they should (roughly) enable the 
maximum set of features available, and from there can be disabled with -mfpu or 
+nofp options. Essentially the default is -mfpu=auto. Keeping to this keeps 
things consistent between cores, so long as people understand the rule. For 
architectures the default should be without, but as far as I understand we 
still default to -mfpu=auto.

https://github.com/llvm/llvm-project/pull/88287
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Add support for Cortex-A520AE and Cortex-A720AE CPUs (PR #85401)

2024-03-19 Thread David Green via cfe-commits

https://github.com/davemgreen approved this pull request.

Thanks. LGTM

https://github.com/llvm/llvm-project/pull/85401
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Add support for Cortex-A520AE and Cortex-A720AE CPUs (PR #85401)

2024-03-17 Thread David Green via cfe-commits


@@ -58,6 +58,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo 
{
 CortexA55,
 CortexA510,
 CortexA520,
+CortexA520AE,

davemgreen wrote:

These might not be worth adding, considering they should be the same as 
CortexA520, and could reuse the same enum.

https://github.com/llvm/llvm-project/pull/85401
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64] Add support for Cortex-A520AE and Cortex-A720AE CPUs (PR #85401)

2024-03-17 Thread David Green via cfe-commits


@@ -67,6 +67,8 @@ Changes to Interprocedural Optimizations
 Changes to the AArch64 Backend
 --
 
+* Added support for Cortex-A520AE and Cortex-A720AE CPUs.

davemgreen wrote:

Could this have Cortex-A78AE too?

https://github.com/llvm/llvm-project/pull/85401
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang][AArch64] Enable fp128 for aarch64 linux target (PR #85070)

2024-03-14 Thread David Green via cfe-commits

https://github.com/davemgreen commented:

Hi - I think this looks sensible, considering that long double == fp128. Should 
we be doing the same for other OS's in this file too? 

https://github.com/llvm/llvm-project/pull/85070
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang][driver] Allow unaligned access on ARMv7 and higher by default (PR #82400)

2024-02-22 Thread David Green via cfe-commits

davemgreen wrote:

I would also personally add Armv6 too for consistency, but don't have a strong 
opinion.

https://github.com/llvm/llvm-project/pull/82400
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang][driver] Allow unaligned access on ARMv7 and higher by default (PR #82400)

2024-02-22 Thread David Green via cfe-commits


@@ -305,6 +305,17 @@ X86 Support
 Arm and AArch64 Support
 ^^^
 
+- ARMv7+ targets now default to allowing unaligned access, except Armv6-M, and
+  Armv8-M without the Main Extension. Baremetal targets should check that the
+  new default will work with their system configurations, since it requires
+  that SCTLR.A is 0, SCTLR.U is 1, and that the memory in question is
+  configured as "normal" memory. We've made the value judgment that the
+  performance gains here outweigh breakages, since it is difficult to identify
+  performance loss from disabling unaligned access, but incorrect enabling
+  unaligned access will generate an obvious alignment fault on ARMv7+. This is
+  also the default setting for ARM's downstream compilers. We have not changed
+  the default behavior for ARMv6, but may revisit that decision in the future.

davemgreen wrote:

I would up-play the compatibility argument and downplay the judgement call a 
little. And mention the way to disable it. Maybe something like
```
- ARMv7+ targets now default to allowing unaligned access, except Armv6-M, and
  Armv8-M without the Main Extension. Baremetal targets should check that the
  new default will work with their system configurations, since it requires
  that SCTLR.A is 0, SCTLR.U is 1, and that the memory in question is
  configured as "normal" memory. This brings clang in-line with the default 
settings
  for GCC and Arm Compiler. The old behaviour can be restored with
  -mno-unaligned-access.
```
But you might want to re-add the reasoning about the performance/codesize loss.

https://github.com/llvm/llvm-project/pull/82400
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang][driver] Allow unaligned access on ARMv7 and higher by default (PR #82400)

2024-02-22 Thread David Green via cfe-commits


@@ -22,6 +22,12 @@
 // RUN: %clang -target armv7-windows -### %s 2> %t
 // RUN: FileCheck --check-prefix=CHECK-UNALIGNED-ARM < %t %s
 
+// RUN: %clang --target=armv6 -### %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-ALIGNED-ARM < %t %s
+
+// RUN: %clang --target=armv7 -### %s 2> %t

davemgreen wrote:

Can you add some extra tests for things like 8-m.main, 8-m.base and 6-m, to 
make sure we have coverage?

https://github.com/llvm/llvm-project/pull/82400
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang][driver] Allow unaligned access on ARMv7 and higher by default (PR #82400)

2024-02-22 Thread David Green via cfe-commits

https://github.com/davemgreen edited 
https://github.com/llvm/llvm-project/pull/82400
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang][driver] Allow unaligned access on ARMv7 and higher by default (PR #82400)

2024-02-22 Thread David Green via cfe-commits

https://github.com/davemgreen commented:

> Unaligned accesses require that SCTLR.A is 0, SCTLR.U is 1, and that the 
> memory in question is configured as "normal" memory. Almost all operating 
> systems do in fact configure their registers/memory this way, but on 
> baremetal it's not really a safe assumption. Changing the default here is 
> basically guaranteed to break someone's code.
> 
> We could make the value judgement that the performance gain outweighs 
> breaking someone's code. Disabled unaligned access is a performance hit users 
> have a hard time discovering; incorrectly enabled unaligned access will 
> generate an obvious alignment fault on v7 and up.
> 
> On pre-v7 processors, you can get the old "rotate" behavior instead of a 
> fault. This is controlled by SCTLR.U on v6, but I don't think there's any 
> reason to expect the bit is configured the "right" way on baremetal. So 
> changing the default for v6 is a bit more dubious.
> 
> The tradeoffs might be a bit different for M-class processors; I think 
> unaligned access works by default there (except for v6m and v8m.baseline).
> 
> This change needs a release note.

The issue that we have found is that as both GCC and Arm Compiler default to 
-munaligned-access, users expect it to be the default. They only notice the 
much bigger codesize/worse performance and don't understand the reason without 
a lot of digging. You are certainly right that someone who has been using clang 
bare metal in the past might hit problems with the new default, but there is a 
high chance they are just using the wrong option without noticing. And I 
believe aligning with GCC/Arm Compiler is a better default going forward, as 
more people start using Clang in bare metal. Hopefully the release note can at 
least make it clear.

M-Profile needs a bit set in the bootcode, IIRC.

https://github.com/llvm/llvm-project/pull/82400
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang][driver] Allow unaligned access on ARMv7 and higher by default (PR #82400)

2024-02-20 Thread David Green via cfe-commits


@@ -895,19 +895,17 @@ llvm::ARM::FPUKind arm::getARMTargetFeatures(const Driver 
,
 // defaults this bit to 0 and handles it as a system-wide (not
 // per-process) setting. It is therefore safe to assume that ARMv7+
 // Linux targets support unaligned accesses. The same goes for NaCl
-// and Windows.
-//
-// The above behavior is consistent with GCC.
+// and Windows. However, ARM's forks of GCC and Clang both allow
+// unaligned accesses by default for all targets. We follow this
+// behavior and enable unaligned accesses by default for ARMv7+ targets.
+// Users can disable behavior via compiler options (-mno-unaliged-access).
+// See https://github.com/llvm/llvm-project/issues/59560 for more info.

davemgreen wrote:

Also, I would drop the reference to the github issue from the comment. People 
can inspect the git history if they need to go looking for a reason behind it. 

https://github.com/llvm/llvm-project/pull/82400
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang][driver] Allow unaligned access on ARMv7 and higher by default (PR #82400)

2024-02-20 Thread David Green via cfe-commits

davemgreen wrote:

Hi - I like the change. We have this code in the downstream compiler, which 
also enables this for Armv6, but specifically disables it for v6m and 
v8m.baseline.
```
  if (VersionNum < 6 ||
  Triple.getSubArch() == llvm::Triple::SubArchType::ARMSubArch_v6m ||
  Triple.getSubArch() == 
llvm::Triple::SubArchType::ARMSubArch_v8m_baseline) {
Features.push_back("+strict-align");
  }
```

I don't have a strong opinion about what happens with ARMv6, but this deserves 
a release note.

https://github.com/llvm/llvm-project/pull/82400
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [ARM] __ARM_ARCH macro definition fix (PR #81493)

2024-02-13 Thread David Green via cfe-commits

davemgreen wrote:

I'm a little worried people might be relying on the existing behaviour, with 
both clang and GCC having this wrong for a while. If we are going to do it can 
you add a release note to clang explaining the new behaviour?

https://github.com/llvm/llvm-project/pull/81493
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang-tools-extra] [clang] [AArch64] Implement -fno-plt for SelectionDAG/GlobalISel (PR #78890)

2024-02-01 Thread David Green via cfe-commits


@@ -201,17 +201,27 @@ define dso_local void @rv_marker_3() personality ptr 
@__gxx_personality_v0 {
 ; GISEL-NEXT:bl _objc_object
 ; GISEL-NEXT:  Ltmp1:
 ; GISEL-NEXT:  ; %bb.1: ; %invoke.cont
-; GISEL-NEXT:ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; GISEL-NEXT:  Lloh0:
+; GISEL-NEXT:adrp x1, _objc_release@GOTPAGE
 ; GISEL-NEXT:mov x0, x19
+; GISEL-NEXT:  Lloh1:
+; GISEL-NEXT:ldr x1, [x1, _objc_release@GOTPAGEOFF]
+; GISEL-NEXT:ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
 ; GISEL-NEXT:ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
-; GISEL-NEXT:b _objc_release
+; GISEL-NEXT:br x1

davemgreen wrote:

@fhahn, @TNorthover do these sound OK to you?

https://github.com/llvm/llvm-project/pull/78890
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [clang-tools-extra] [compiler-rt] [llvm] [flang] [TTI]Fallback to SingleSrcPermute shuffle kind, if no direct estimation for (PR #79837)

2024-02-01 Thread David Green via cfe-commits

davemgreen wrote:

I think this is probably OK for Arm & AArch64. In the long run we should 
ideally be adding better extract subvector costs, but this patch moves the cost 
in that direction.

https://github.com/llvm/llvm-project/pull/79837
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[flang] [compiler-rt] [clang] [llvm] [mlir] [clang-tools-extra] [lldb] [libc] [libcxx] [AArch64] add intrinsic to generate a bfi instruction (PR #79672)

2024-01-31 Thread David Green via cfe-commits

davemgreen wrote:

I see. The issue is that the opposite is often true as well - if we add a 
target specific intrinsic for this then, whilst we get a single instruction 
being emitted, we don't see all the other optimizations that the compiler can 
and should be performing.

Things like constant folding, combining into other instructions, known-bits 
analysis or any form of vectorization will all be blocked by the intrinsic. It 
can take quite some work to add all those features in (if they are possible), 
and without them can potentially lead to worse results. Plus more things to 
maintain.

BFI isn't a trivial instruction to match as it involves certain masks and 
shifts. There might certainly be advantages to having an intrinsic. I would 
like to try and see what the problems would be with generated code using normal 
operations first though, if we can. If there are optimizations we can make 
based on the existing code then that would help in all cases (c, mlir, rust, 
etc), not just frontends that are producing the intrinsics.

https://github.com/llvm/llvm-project/pull/79672
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AArch64][TargetParser] Add mcpu alias for Microsoft Azure Cobalt 100. (PR #79614)

2024-01-29 Thread David Green via cfe-commits

https://github.com/davemgreen approved this pull request.

Thanks. LGTM too.

https://github.com/llvm/llvm-project/pull/79614
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [compiler-rt] [libc] [flang] [mlir] [libcxx] [lldb] [llvm] [clang] [AArch64] add intrinsic to generate a bfi instruction (PR #79672)

2024-01-29 Thread David Green via cfe-commits

davemgreen wrote:

OK. We would not usually add intrinsics like this without a strong motivating 
case, that could not be optimized in some other way. It is better to use target 
independent options when available, and inline assembly is available as a 
fallback if it is really needed. But I would recommend that they use normal 
and/or/shift operations and let us know about places the compiler isn't 
optimizing them as well as it could be.

https://github.com/llvm/llvm-project/pull/79672
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[libc] [clang-tools-extra] [lldb] [flang] [mlir] [llvm] [clang] [compiler-rt] [libcxx] [AArch64] add intrinsic to generate a bfi instruction (PR #79672)

2024-01-28 Thread David Green via cfe-commits

davemgreen wrote:

Hello. Can you explain why this is needed, as opposed to using the equivalent 
shift/and/ors?

https://github.com/llvm/llvm-project/pull/79672
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AArch64][TargetParser] Add mcpu alias for Microsoft Azure Cobalt 100. (PR #79614)

2024-01-26 Thread David Green via cfe-commits

davemgreen wrote:

It looks like this needs to update testAArch64CPUArchList too. Otherwise it LGTM

https://github.com/llvm/llvm-project/pull/79614
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [TargetParser] Define AEK_FCMA and AEK_JSCVT for tsv110 (PR #75516)

2024-01-14 Thread David Green via cfe-commits

https://github.com/davemgreen approved this pull request.

https://github.com/ARM-software/acle/pull/279 was committed recently, where I 
think this lines up with the final version of it. I think this LGTM in that 
case.

https://github.com/llvm/llvm-project/pull/75516
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [llvm] [clang] [AArch64][SVE2] Lower OR to SLI/SRI (PR #77555)

2024-01-11 Thread David Green via cfe-commits

https://github.com/davemgreen approved this pull request.

LGTM

https://github.com/llvm/llvm-project/pull/77555
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [ARM] arm_acle.h add Coprocessor Instrinsics (PR #75440)

2024-01-08 Thread David Green via cfe-commits

https://github.com/davemgreen approved this pull request.

Thanks. LGTM

https://github.com/llvm/llvm-project/pull/75440
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2024-01-08 Thread David Green via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl );
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage ) const override {
+AU.addRequired();
+AU.addRequired();
+AU.addRequired();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager ) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager ) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = ().getDomTree();
+  auto *LI = ().getLoopInfo();
+  auto  = getAnalysis().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, , >getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymous namespace
+
+char 

[llvm] [clang] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2024-01-08 Thread David Green via cfe-commits

https://github.com/davemgreen approved this pull request.

Thanks for the updates. From what I can tell this LGTM, but it will need a 
rebase.

You might want to commit it with the option disabled, and then flip the switch 
in a followup to avoid the commit-revert cycles in case there are any issues.

https://github.com/llvm/llvm-project/pull/72273
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang-tools-extra] [clang] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2024-01-08 Thread David Green via cfe-commits

https://github.com/davemgreen edited 
https://github.com/llvm/llvm-project/pull/72273
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [ARM] arm_acle.h add Coprocessor Instrinsics (PR #75440)

2024-01-07 Thread David Green via cfe-commits

davemgreen wrote:

If you can make armv9-a work the same as armv8-a and add some tests for it then 
this LGTM

https://github.com/llvm/llvm-project/pull/75440
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [ARM] arm_acle.h add Coprocessor Instrinsics (PR #75440)

2024-01-07 Thread David Green via cfe-commits


@@ -836,6 +837,70 @@ void ARMTargetInfo::getTargetDefines(const LangOptions &Opts,
   if (Opts.RWPI)
 Builder.defineMacro("__ARM_RWPI", "1");
 
+  // Macros for enabling co-proc intrinsics
+  uint64_t FeatureCoprocBF = 0;
+  switch (ArchKind) {
+  default:
+break;
+  case llvm::ARM::ArchKind::ARMV4:
+// Filter __arm_ldcl and __arm_stcl in acle.h
+FeatureCoprocBF = FEATURE_COPROC_B1;
+break;
+  case llvm::ARM::ArchKind::ARM5T:
+FeatureCoprocBF = isThumb() ? 0 : FEATURE_COPROC_B1;
+break;
+  case llvm::ARM::ArchKind::ARMV5TE:
+  case llvm::ARM::ArchKind::ARMV5TEJ:
+if (!isThumb())
+  FeatureCoprocBF =
+  FEATURE_COPROC_B1 | FEATURE_COPROC_B2 | FEATURE_COPROC_B3;
+break;
+  case llvm::ARM::ArchKind::ARMV6:
+  case llvm::ARM::ArchKind::ARMV6K:
+  case llvm::ARM::ArchKind::ARMV6KZ:
+  case llvm::ARM::ArchKind::ARMV6T2:
+if (!isThumb() || ArchKind == llvm::ARM::ArchKind::ARMV6T2)
+  FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B2 |
+FEATURE_COPROC_B3 | FEATURE_COPROC_B4;
+break;
+  case llvm::ARM::ArchKind::ARMV7A:
+  case llvm::ARM::ArchKind::ARMV7R:
+  case llvm::ARM::ArchKind::ARMV7M:
+  case llvm::ARM::ArchKind::ARMV7S:
+  case llvm::ARM::ArchKind::ARMV7EM:
+FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B2 |
+  FEATURE_COPROC_B3 | FEATURE_COPROC_B4;
+break;
+  case llvm::ARM::ArchKind::ARMV8A:
+  case llvm::ARM::ArchKind::ARMV8R:
+  case llvm::ARM::ArchKind::ARMV8_1A:
+  case llvm::ARM::ArchKind::ARMV8_2A:
+  case llvm::ARM::ArchKind::ARMV8_3A:
+  case llvm::ARM::ArchKind::ARMV8_4A:
+  case llvm::ARM::ArchKind::ARMV8_5A:
+  case llvm::ARM::ArchKind::ARMV8_6A:
+  case llvm::ARM::ArchKind::ARMV8_7A:
+  case llvm::ARM::ArchKind::ARMV8_8A:
+  case llvm::ARM::ArchKind::ARMV8_9A:
+// Filter __arm_cdp, __arm_ldcl, __arm_stcl in arm_acle.h
+FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B3;
+break;
+  case llvm::ARM::ArchKind::ARMV8MMainline:
+FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B2 |
+  FEATURE_COPROC_B3 | FEATURE_COPROC_B4;
+break;
+  case llvm::ARM::ArchKind::ARMV9A:
+  case llvm::ARM::ArchKind::ARMV9_1A:
+  case llvm::ARM::ArchKind::ARMV9_2A:
+  case llvm::ARM::ArchKind::ARMV9_3A:
+  case llvm::ARM::ArchKind::ARMV9_4A:

davemgreen wrote:

Oh right, ARMV9_5A is AArch64 only. That's OK then.
I would expect the other ArmV9-A cases to be the same as ArmV8-A for AArch32, 
and wouldn't have expected a change in coprocessor instructions.
The reference manual is at 
https://developer.arm.com/documentation/ddi0487/ja/?lang=en and doesn't seem to 
mention cdp.

https://github.com/llvm/llvm-project/pull/75440
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [ARM] arm_acle.h add Coprocessor Instrinsics (PR #75440)

2024-01-02 Thread David Green via cfe-commits

https://github.com/davemgreen edited 
https://github.com/llvm/llvm-project/pull/75440
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [ARM] arm_acle.h add Coprocessor Instrinsics (PR #75440)

2024-01-02 Thread David Green via cfe-commits


@@ -756,6 +756,58 @@ __arm_st64bv0(void *__addr, data512_t __value) {
   __builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
 #endif
 
+/* Coprocessor Intrinsics */
+#if defined(__ARM_FEATURE_COPROC)
+
+#if (__ARM_FEATURE_COPROC & 0x1)
+
+#if (__ARM_ARCH != 8)

davemgreen wrote:

Could this be < 8?
This doesn't apply to 8-m.main, right? The test looks OK.

https://github.com/llvm/llvm-project/pull/75440
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [ARM] arm_acle.h add Coprocessor Instrinsics (PR #75440)

2024-01-02 Thread David Green via cfe-commits


@@ -836,6 +837,70 @@ void ARMTargetInfo::getTargetDefines(const LangOptions &Opts,
   if (Opts.RWPI)
 Builder.defineMacro("__ARM_RWPI", "1");
 
+  // Macros for enabling co-proc intrinsics
+  uint64_t FeatureCoprocBF = 0;
+  switch (ArchKind) {
+  default:
+break;
+  case llvm::ARM::ArchKind::ARMV4:
+// Filter __arm_ldcl and __arm_stcl in acle.h
+FeatureCoprocBF = FEATURE_COPROC_B1;
+break;
+  case llvm::ARM::ArchKind::ARM5T:
+FeatureCoprocBF = isThumb() ? 0 : FEATURE_COPROC_B1;
+break;
+  case llvm::ARM::ArchKind::ARMV5TE:
+  case llvm::ARM::ArchKind::ARMV5TEJ:
+if (!isThumb())
+  FeatureCoprocBF =
+  FEATURE_COPROC_B1 | FEATURE_COPROC_B2 | FEATURE_COPROC_B3;
+break;
+  case llvm::ARM::ArchKind::ARMV6:
+  case llvm::ARM::ArchKind::ARMV6K:
+  case llvm::ARM::ArchKind::ARMV6KZ:
+  case llvm::ARM::ArchKind::ARMV6T2:
+if (!isThumb() || ArchKind == llvm::ARM::ArchKind::ARMV6T2)
+  FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B2 |
+FEATURE_COPROC_B3 | FEATURE_COPROC_B4;
+break;
+  case llvm::ARM::ArchKind::ARMV7A:
+  case llvm::ARM::ArchKind::ARMV7R:
+  case llvm::ARM::ArchKind::ARMV7M:
+  case llvm::ARM::ArchKind::ARMV7S:
+  case llvm::ARM::ArchKind::ARMV7EM:
+FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B2 |
+  FEATURE_COPROC_B3 | FEATURE_COPROC_B4;
+break;
+  case llvm::ARM::ArchKind::ARMV8A:
+  case llvm::ARM::ArchKind::ARMV8R:
+  case llvm::ARM::ArchKind::ARMV8_1A:
+  case llvm::ARM::ArchKind::ARMV8_2A:
+  case llvm::ARM::ArchKind::ARMV8_3A:
+  case llvm::ARM::ArchKind::ARMV8_4A:
+  case llvm::ARM::ArchKind::ARMV8_5A:
+  case llvm::ARM::ArchKind::ARMV8_6A:
+  case llvm::ARM::ArchKind::ARMV8_7A:
+  case llvm::ARM::ArchKind::ARMV8_8A:
+  case llvm::ARM::ArchKind::ARMV8_9A:
+// Filter __arm_cdp, __arm_ldcl, __arm_stcl in arm_acle.h
+FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B3;
+break;
+  case llvm::ARM::ArchKind::ARMV8MMainline:
+FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B2 |
+  FEATURE_COPROC_B3 | FEATURE_COPROC_B4;
+break;
+  case llvm::ARM::ArchKind::ARMV9A:
+  case llvm::ARM::ArchKind::ARMV9_1A:
+  case llvm::ARM::ArchKind::ARMV9_2A:
+  case llvm::ARM::ArchKind::ARMV9_3A:
+  case llvm::ARM::ArchKind::ARMV9_4A:

davemgreen wrote:

There is an ARMV9_5A now too. I think I would expect these to be the same as 
ARMV8.
Is this switch statement exhaustive? Could the default case be made the same as 
ARMV8 so we don't need to extend it every time an architecture is added?

https://github.com/llvm/llvm-project/pull/75440
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [ARM] arm_acle.h add Coprocessor Instrinsics (PR #75440)

2024-01-02 Thread David Green via cfe-commits


@@ -836,6 +837,70 @@ void ARMTargetInfo::getTargetDefines(const LangOptions &Opts,
   if (Opts.RWPI)
 Builder.defineMacro("__ARM_RWPI", "1");
 
+  // Macros for enabling co-proc intrinsics
+  uint64_t FeatureCoprocBF = 0;
+  switch (ArchKind) {
+  default:
+break;
+  case llvm::ARM::ArchKind::ARMV4:
+// Filter __arm_ldcl and __arm_stcl in acle.h
+FeatureCoprocBF = FEATURE_COPROC_B1;
+break;
+  case llvm::ARM::ArchKind::ARM5T:
+FeatureCoprocBF = isThumb() ? 0 : FEATURE_COPROC_B1;
+break;
+  case llvm::ARM::ArchKind::ARMV5TE:
+  case llvm::ARM::ArchKind::ARMV5TEJ:
+if (!isThumb())
+  FeatureCoprocBF =
+  FEATURE_COPROC_B1 | FEATURE_COPROC_B2 | FEATURE_COPROC_B3;
+break;
+  case llvm::ARM::ArchKind::ARMV6:
+  case llvm::ARM::ArchKind::ARMV6K:
+  case llvm::ARM::ArchKind::ARMV6KZ:
+  case llvm::ARM::ArchKind::ARMV6T2:
+if (!isThumb() || ArchKind == llvm::ARM::ArchKind::ARMV6T2)
+  FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B2 |
+FEATURE_COPROC_B3 | FEATURE_COPROC_B4;
+break;
+  case llvm::ARM::ArchKind::ARMV7A:
+  case llvm::ARM::ArchKind::ARMV7R:
+  case llvm::ARM::ArchKind::ARMV7M:
+  case llvm::ARM::ArchKind::ARMV7S:
+  case llvm::ARM::ArchKind::ARMV7EM:
+FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B2 |
+  FEATURE_COPROC_B3 | FEATURE_COPROC_B4;
+break;
+  case llvm::ARM::ArchKind::ARMV8A:
+  case llvm::ARM::ArchKind::ARMV8R:
+  case llvm::ARM::ArchKind::ARMV8_1A:
+  case llvm::ARM::ArchKind::ARMV8_2A:
+  case llvm::ARM::ArchKind::ARMV8_3A:
+  case llvm::ARM::ArchKind::ARMV8_4A:
+  case llvm::ARM::ArchKind::ARMV8_5A:
+  case llvm::ARM::ArchKind::ARMV8_6A:
+  case llvm::ARM::ArchKind::ARMV8_7A:
+  case llvm::ARM::ArchKind::ARMV8_8A:
+  case llvm::ARM::ArchKind::ARMV8_9A:
+// Filter __arm_cdp, __arm_ldcl, __arm_stcl in arm_acle.h
+FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B3;
+break;
+  case llvm::ARM::ArchKind::ARMV8MMainline:

davemgreen wrote:

Add ARMV8_1MMainline too.

https://github.com/llvm/llvm-project/pull/75440
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [ARM] arm_acle.h add Coprocessor Instrinsics (PR #75440)

2024-01-02 Thread David Green via cfe-commits

https://github.com/davemgreen commented:

Thanks. This is looking good to me. I just have a few comments about different 
architecture revisions.

https://github.com/llvm/llvm-project/pull/75440
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [ARM] arm_acle.h add Coprocessor Instrinsics (PR #75440)

2023-12-22 Thread David Green via cfe-commits

davemgreen wrote:

Thanks for doing this.
I think that __ARM_FEATURE_COPROC should be a bitfield, as defined in 
https://arm-software.github.io/acle/main/acle.html#coprocessor-intrinsics. That 
would remove the need for the other macros.

https://github.com/llvm/llvm-project/pull/75440
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [ARM] arm_acle.h add Coprocessor Instrinsics (PR #75440)

2023-12-21 Thread David Green via cfe-commits

davemgreen wrote:

This is the downstream code we have: 
https://gist.github.com/davemgreen/e7ade833274a60e975e67a66eda7cb44
Note that the __ARM_TARGET_COPROC_XYZ macros are probably wrong. They should be 
__ARM_FEATURE_COPROC bitfield macros according to the ACLE.

Can you make use of some of that? It would be good to add the macro definition 
at the same time as the intrinsics (they can be used to control when the 
intrinsics are available), and the test should be useful for checking they are 
available at the right times.

https://github.com/llvm/llvm-project/pull/75440
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [ARM] arm_acle.h add Coprocessor Instrinsics (PR #75440)

2023-12-21 Thread David Green via cfe-commits

davemgreen wrote:

Let me try and get the downstream version, you might be able to pick up some 
things from it. A test at least should probably be present.

https://github.com/llvm/llvm-project/pull/75440
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang-tools-extra] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-19 Thread David Green via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl );
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage ) const override {
+AU.addRequired();
+AU.addRequired();
+AU.addRequired();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager ) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager ) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = ().getDomTree();
+  auto *LI = ().getLoopInfo();
+  auto  = getAnalysis().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, , >getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymous namespace
+
+char 

[llvm] [clang-tools-extra] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-19 Thread David Green via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl );
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage ) const override {
+AU.addRequired();
+AU.addRequired();
+AU.addRequired();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager ) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager ) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = ().getDomTree();
+  auto *LI = ().getLoopInfo();
+  auto  = getAnalysis().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, , >getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymous namespace
+
+char 

[clang] [clang-tools-extra] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-19 Thread David Green via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl );
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage ) const override {
+AU.addRequired();
+AU.addRequired();
+AU.addRequired();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager ) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager ) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = ().getDomTree();
+  auto *LI = ().getLoopInfo();
+  auto  = getAnalysis().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, , >getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymous namespace
+
+char 

[llvm] [clang-tools-extra] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-19 Thread David Green via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl );
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage ) const override {
+AU.addRequired();
+AU.addRequired();
+AU.addRequired();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager ) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager ) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = ().getDomTree();
+  auto *LI = ().getLoopInfo();
+  auto  = getAnalysis().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, , >getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymous namespace
+
+char 

[clang] [clang-tools-extra] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-19 Thread David Green via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl );
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage ) const override {
+AU.addRequired();
+AU.addRequired();
+AU.addRequired();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager ) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager ) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = ().getDomTree();
+  auto *LI = ().getLoopInfo();
+  auto  = getAnalysis().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, , >getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymous namespace
+
+char 

[clang] [llvm] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-19 Thread David Green via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl );
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage ) const override {
+AU.addRequired();
+AU.addRequired();
+AU.addRequired();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager ) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager ) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = ().getDomTree();
+  auto *LI = ().getLoopInfo();
+  auto  = getAnalysis().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, , >getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymous namespace
+
+char 

[clang-tools-extra] [llvm] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-19 Thread David Green via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl );
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage ) const override {
+AU.addRequired();
+AU.addRequired();
+AU.addRequired();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager ) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager ) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = ().getDomTree();
+  auto *LI = ().getLoopInfo();
+  auto  = getAnalysis().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, , >getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymous namespace
+
+char 

[clang] [clang-tools-extra] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-19 Thread David Green via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl );
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage ) const override {
+AU.addRequired();
+AU.addRequired();
+AU.addRequired();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager ) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager ) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = ().getDomTree();
+  auto *LI = ().getLoopInfo();
+  auto  = getAnalysis().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, , >getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymous namespace
+
+char 

[llvm] [clang-tools-extra] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-19 Thread David Green via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl );
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage ) const override {
+AU.addRequired();
+AU.addRequired();
+AU.addRequired();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager ) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager ) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = ().getDomTree();
+  auto *LI = ().getLoopInfo();
+  auto  = getAnalysis().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, , >getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymous namespace
+
+char 

[clang] [clang-tools-extra] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-19 Thread David Green via cfe-commits

https://github.com/davemgreen commented:

Thanks. I think it is worth trying to get this in. I already see it triggering
in a number of places. It might be worth working on making it a little more
generic in follow-up patches if we can, but there is already quite a bit going
on.

https://github.com/llvm/llvm-project/pull/72273
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang-tools-extra] [clang] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-19 Thread David Green via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl );
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> , GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage ) const override {
+AU.addRequired();
+AU.addRequired();
+AU.addRequired();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager ) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager ) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = ().getDomTree();
+  auto *LI = ().getLoopInfo();
+  auto  = getAnalysis().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, , >getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymous namespace
+
+char 

[clang-tools-extra] [clang] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-19 Thread David Green via cfe-commits


@@ -0,0 +1,816 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt<bool>
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt<bool> DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt<bool> VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Instruction *Index,
+Value *Start, Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+PHINode *IndPhi, Value *MaxLen, Instruction *Index,
+Value *Start, bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+AU.addRequired<DominatorTreeWrapperPass>();
+AU.addRequired<LoopInfoWrapperPass>();
+AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager &LPM) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymous namespace
+
+char 

[clang] [llvm] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-19 Thread David Green via cfe-commits

https://github.com/davemgreen edited 
https://github.com/llvm/llvm-project/pull/72273
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [ARM] arm_acle.h add Coprocessor Instrinsics (PR #75440)

2023-12-18 Thread David Green via cfe-commits

davemgreen wrote:

It looks like there is a downstream implementation of this that was never 
upstreamed. Perhaps someone can fish it out for you to show how it looked? It 
might be using the wrong predefined macro, but does have some tests.

https://github.com/llvm/llvm-project/pull/75440
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [TargetParser] Define AEK_FCMA and AEK_JSCVT for tsv110 (PR #75516)

2023-12-18 Thread David Green via cfe-commits


@@ -81,6 +81,15 @@ static bool DecodeAArch64Features(const Driver &D, StringRef 
text,
 else
   return false;
 
+// +jsconv and +complxnum implies +neon and +fp-armv8

davemgreen wrote:

I believe this ideally would not be in the driver, as it does not apply to 
target attributes, only -march options.

https://github.com/llvm/llvm-project/pull/75516
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang-tools-extra] [LoopVectorize] Enable hoisting of runtime checks by default (PR #71538)

2023-12-15 Thread David Green via cfe-commits

https://github.com/davemgreen approved this pull request.

With that fixed, and from the perf I've seen, this LGTM. Thanks

https://github.com/llvm/llvm-project/pull/71538
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[openmp] [clang-tools-extra] [libcxx] [mlir] [clang] [compiler-rt] [lldb] [llvm] [libcxxabi] [flang] [MachineCopyPropagation] When the source of PreviousCopy is undef, we cannot replace sub register (

2023-12-13 Thread David Green via cfe-commits

davemgreen wrote:

Thanks. It sounds like there are not a lot of code changes, which is a good 
sign. I didn't expect the debug problems though.

I'll try and take a look at the patch. Perhaps you are right that we need a new 
method for the debug info to use.

https://github.com/llvm/llvm-project/pull/74682
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [clang-tools-extra] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-13 Thread David Green via cfe-commits


@@ -0,0 +1,839 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt<bool>
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt<bool> DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt<bool> VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Value *Start,
+Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+Value *MaxLen, Value *Index, Value *Start,
+bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+AU.addRequired<DominatorTreeWrapperPass>();
+AU.addRequired<LoopInfoWrapperPass>();
+AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager &LPM) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymous namespace
+
+char AArch64LoopIdiomTransformLegacyPass::ID = 0;

[clang-tools-extra] [clang] [llvm] [AArch64] Add an AArch64 pass for loop idiom transformations (PR #72273)

2023-12-13 Thread David Green via cfe-commits


@@ -0,0 +1,839 @@
+//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition 
-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+//
+// This pass implements a pass that recognizes certain loop idioms and
+// transforms them into more optimized versions of the same loop. In cases
+// where this happens, it can be a significant performance win.
+//
+// We currently only recognize one loop that finds the first mismatched byte
+// in an array and returns the index, i.e. something like:
+//
+//  while (++i != n) {
+//if (a[i] != b[i])
+//  break;
+//  }
+//
+// In this example we can actually vectorize the loop despite the early exit,
+// although the loop vectorizer does not support it. It requires some extra
+// checks to deal with the possibility of faulting loads when crossing page
+// boundaries. However, even with these checks it is still profitable to do the
+// transformation.
+//
+//===--===//
+//
+// TODO List:
+//
+// * When optimizing for code size we may want to avoid some transformations.
+// * We can also support the inverse case where we scan for a matching element.
+//
+//===--===//
+
+#include "AArch64LoopIdiomTransform.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-loop-idiom-transform"
+
+static cl::opt<bool>
+DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false),
+   cl::desc("Disable AArch64 Loop Idiom Transform Pass."));
+
+static cl::opt<bool> DisableByteCmp(
+"disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false),
+cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do "
+ "not convert byte-compare loop(s)."));
+
+static cl::opt<bool> VerifyLoops(
+"aarch64-lit-verify", cl::Hidden, cl::init(false),
+cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass."));
+
+namespace llvm {
+
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
+Pass *createAArch64LoopIdiomTransformPass();
+
+} // end namespace llvm
+
+namespace {
+
+class AArch64LoopIdiomTransform {
+  Loop *CurLoop = nullptr;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL)
+  : DT(DT), LI(LI), TTI(TTI), DL(DL) {}
+
+  bool run(Loop *L);
+
+private:
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+  SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+  bool recognizeByteCompare();
+  Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA,
+GetElementPtrInst *GEPB, Value *Start,
+Value *MaxLen);
+  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
+Value *MaxLen, Value *Index, Value *Start,
+bool IncIdx, BasicBlock *FoundBB,
+BasicBlock *EndBB);
+  /// @}
+};
+
+class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
+public:
+  static char ID;
+
+  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
+initializeAArch64LoopIdiomTransformLegacyPassPass(
+*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+return "Transform AArch64-specific loop idioms";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+AU.addRequired<DominatorTreeWrapperPass>();
+AU.addRequired<LoopInfoWrapperPass>();
+AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L,
+LPPassManager &LPM) {
+
+  if (skipLoop(L))
+return false;
+
+  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+  *L->getHeader()->getParent());
+  return AArch64LoopIdiomTransform(
+ DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout())
+  .run(L);
+}
+
+} // end anonymous namespace
+
+char AArch64LoopIdiomTransformLegacyPass::ID = 0;

  1   2   >