https://github.com/llvmbot created 
https://github.com/llvm/llvm-project/pull/177974

Backport fab06fae0064a2f1208331f9c355a26a4f9777f0

Requested by: @nikic

>From 385da9b30a487a6ec744896ffaf70b48e6f487f0 Mon Sep 17 00:00:00 2001
From: Croose <[email protected]>
Date: Fri, 16 Jan 2026 12:06:20 +0000
Subject: [PATCH] [ARM] Fix inlining issue in ARM (#169337)

There is an issue on ARM where a function won't be inlined due to
mismatching target features between caller and callee.
The caller has `HasV8Ops` and `FeatureDotProd` and the callee does not,
but AFAIK this should not be a problem.
https://godbolt.org/z/f19h3zT66 is an example showing how the call is
not inlined on armv7.
The expected asm output would be something like:
```asm
.fnstart
        vsdot.s8        q0, q1, d4[0]
        bx      lr
.Lfunc_end0:

```
Thanks to @Amichaxx we managed to narrow it down and now can resolve
this problem by adding `ARM::FeatureDotProd, ARM::HasV8Ops` to
InlineFeaturesAllowed in llvm/lib/Target/ARM/ARMTargetTransformInfo.h,
after which the inlining occurs successfully.

Whilst we're at it we have also added some debug output to make it easier
to tell why (or why not) a function is being inlined for ARM, and a
couple of other features that seem to be missing from the list.

This patch was motivated by an issue experienced with Rust that was
traced back to LLVM, and thus was designed to address that.

(cherry picked from commit fab06fae0064a2f1208331f9c355a26a4f9777f0)
---
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp |  49 ++++++
 llvm/lib/Target/ARM/ARMTargetTransformInfo.h  | 166 ++++++++++++++----
 .../Transforms/Inline/ARM/inline-dotprod.ll   |  35 ++++
 3 files changed, 216 insertions(+), 34 deletions(-)
 create mode 100644 llvm/test/Transforms/Inline/ARM/inline-dotprod.ll

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 88a7fb185bf16..b947c8a10e2d8 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -107,6 +107,55 @@ bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
   // the callers'.
   bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
                      (CalleeBits & InlineFeaturesAllowed);
+
+  LLVM_DEBUG({
+    if (!MatchExact || !MatchSubset) {
+      dbgs() << "=== Inline compatibility debug ===\n";
+      dbgs() << "Caller: " << Caller->getName() << "\n";
+      dbgs() << "Callee: " << Callee->getName() << "\n";
+
+      // Bit diffs
+      FeatureBitset MissingInCaller = CalleeBits & ~CallerBits; // callee-only
+      FeatureBitset ExtraInCaller = CallerBits & ~CalleeBits;   // caller-only
+
+      // Counts
+      dbgs() << "Only-in-caller bit count: " << ExtraInCaller.count() << "\n";
+      dbgs() << "Only-in-callee bit count: " << MissingInCaller.count() << "\n";
+
+      dbgs() << "Only-in-caller feature indices [";
+      {
+        bool First = true;
+        for (size_t I = 0, E = ExtraInCaller.size(); I < E; ++I) {
+          if (ExtraInCaller.test(I)) {
+            if (!First)
+              dbgs() << ", ";
+            dbgs() << I;
+            First = false;
+          }
+        }
+      }
+      dbgs() << "]\n";
+
+      dbgs() << "Only-in-callee feature indices [";
+      {
+        bool First = true;
+        for (size_t I = 0, E = MissingInCaller.size(); I < E; ++I) {
+          if (MissingInCaller.test(I)) {
+            if (!First)
+              dbgs() << ", ";
+            dbgs() << I;
+            First = false;
+          }
+        }
+      }
+      dbgs() << "]\n";
+
+      // Indices map to features as found in
+      // llvm-project/(your_build)/lib/Target/ARM/ARMGenSubtargetInfo.inc
+      dbgs() << "MatchExact=" << (MatchExact ? "true" : "false")
+             << " MatchSubset=" << (MatchSubset ? "true" : "false") << "\n";
+    }
+  });
   return MatchExact && MatchSubset;
 }
 
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index a23256364dd9a..fafd2d44a818c 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -40,13 +40,13 @@ class Type;
 class Value;
 
 namespace TailPredication {
-  enum Mode {
-    Disabled = 0,
-    EnabledNoReductions,
-    Enabled,
-    ForceEnabledNoReductions,
-    ForceEnabled
-  };
+enum Mode {
+  Disabled = 0,
+  EnabledNoReductions,
+  Enabled,
+  ForceEnabledNoReductions,
+  ForceEnabled
+};
 }
 
 // For controlling conversion of memcpy into Tail Predicated loop.
@@ -64,37 +64,135 @@ class ARMTTIImpl final : public BasicTTIImplBase<ARMTTIImpl> {
   const ARMTargetLowering *TLI;
 
   // Currently the following features are excluded from InlineFeaturesAllowed.
-  // ModeThumb, FeatureNoARM, ModeSoftFloat, FeatureFP64, FeatureD32
+  // ModeThumb, FeatureNoARM, ModeSoftFloat.
   // Depending on whether they are set or unset, different
   // instructions/registers are available. For example, inlining a callee with
   // -thumb-mode in a caller with +thumb-mode, may cause the assembler to
   // fail if the callee uses ARM only instructions, e.g. in inline asm.
-  const FeatureBitset InlineFeaturesAllowed = {
-      ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureNEON, ARM::FeatureThumb2,
-      ARM::FeatureFP16, ARM::FeatureVFP4, ARM::FeatureFPARMv8,
-      ARM::FeatureFullFP16, ARM::FeatureFP16FML, ARM::FeatureHWDivThumb,
-      ARM::FeatureHWDivARM, ARM::FeatureDB, ARM::FeatureV7Clrex,
-      ARM::FeatureAcquireRelease, ARM::FeatureSlowFPBrcc,
-      ARM::FeaturePerfMon, ARM::FeatureTrustZone, ARM::Feature8MSecExt,
-      ARM::FeatureCrypto, ARM::FeatureCRC, ARM::FeatureRAS,
-      ARM::FeatureFPAO, ARM::FeatureFuseAES, ARM::FeatureZCZeroing,
-      ARM::FeatureProfUnpredicate, ARM::FeatureSlowVGETLNi32,
-      ARM::FeatureSlowVDUP32, ARM::FeaturePreferVMOVSR,
-      ARM::FeaturePrefISHSTBarrier, ARM::FeatureMuxedUnits,
-      ARM::FeatureSlowOddRegister, ARM::FeatureSlowLoadDSubreg,
-      ARM::FeatureDontWidenVMOVS, ARM::FeatureExpandMLx,
-      ARM::FeatureHasVMLxHazards, ARM::FeatureNEONForFPMovs,
-      ARM::FeatureNEONForFP, ARM::FeatureCheckVLDnAlign,
-      ARM::FeatureHasSlowFPVMLx, ARM::FeatureHasSlowFPVFMx,
-      ARM::FeatureVMLxForwarding, ARM::FeaturePref32BitThumb,
-      ARM::FeatureAvoidPartialCPSR, ARM::FeatureCheapPredicableCPSR,
-      ARM::FeatureAvoidMOVsShOp, ARM::FeatureHasRetAddrStack,
-      ARM::FeatureHasNoBranchPredictor, ARM::FeatureDSP, ARM::FeatureMP,
-      ARM::FeatureVirtualization, ARM::FeatureMClass, ARM::FeatureRClass,
-      ARM::FeatureAClass, ARM::FeatureStrictAlign, ARM::FeatureLongCalls,
-      ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, ARM::FeatureNoMovt,
-      ARM::FeatureNoNegativeImmediates
-  };
+  const FeatureBitset InlineFeaturesAllowed = {ARM::Feature8MSecExt,
+                                               ARM::FeatureAClass,
+                                               ARM::FeatureAES,
+                                               ARM::FeatureAcquireRelease,
+                                               ARM::FeatureAvoidMOVsShOp,
+                                               ARM::FeatureAvoidMULS,
+                                               ARM::FeatureAvoidPartialCPSR,
+                                               ARM::FeatureBF16,
+                                               ARM::FeatureCRC,
+                                               ARM::FeatureCheapPredicableCPSR,
+                                               ARM::FeatureCheckVLDnAlign,
+                                               ARM::FeatureCrypto,
+                                               ARM::FeatureD32,
+                                               ARM::FeatureDB,
+                                               ARM::FeatureDFB,
+                                               ARM::FeatureDSP,
+                                               ARM::FeatureDontWidenVMOVS,
+                                               ARM::FeatureDotProd,
+                                               ARM::FeatureExecuteOnly,
+                                               ARM::FeatureExpandMLx,
+                                               ARM::FeatureFP16,
+                                               ARM::FeatureFP16FML,
+                                               ARM::FeatureFP64,
+                                               ARM::FeatureFPAO,
+                                               ARM::FeatureFPARMv8,
+                                               ARM::FeatureFPARMv8_D16,
+                                               ARM::FeatureFPARMv8_D16_SP,
+                                               ARM::FeatureFPARMv8_SP,
+                                               ARM::FeatureFPRegs,
+                                               ARM::FeatureFPRegs16,
+                                               ARM::FeatureFPRegs64,
+                                               ARM::FeatureFullFP16,
+                                               ARM::FeatureFuseAES,
+                                               ARM::FeatureFuseLiterals,
+                                               ARM::FeatureHWDivARM,
+                                               ARM::FeatureHWDivThumb,
+                                               ARM::FeatureHasNoBranchPredictor,
+                                               ARM::FeatureHasRetAddrStack,
+                                               ARM::FeatureHasSlowFPVFMx,
+                                               ARM::FeatureHasSlowFPVMLx,
+                                               ARM::FeatureHasVMLxHazards,
+                                               ARM::FeatureLOB,
+                                               ARM::FeatureLongCalls,
+                                               ARM::FeatureMClass,
+                                               ARM::FeatureMP,
+                                               ARM::FeatureMVEVectorCostFactor1,
+                                               ARM::FeatureMVEVectorCostFactor2,
+                                               ARM::FeatureMVEVectorCostFactor4,
+                                               ARM::FeatureMatMulInt8,
+                                               ARM::FeatureMuxedUnits,
+                                               ARM::FeatureNEON,
+                                               ARM::FeatureNEONForFP,
+                                               ARM::FeatureNEONForFPMovs,
+                                               ARM::FeatureNoMovt,
+                                               ARM::FeatureNoNegativeImmediates,
+                                               ARM::FeatureNoPostRASched,
+                                               ARM::FeaturePerfMon,
+                                               ARM::FeaturePref32BitThumb,
+                                               ARM::FeaturePrefISHSTBarrier,
+                                               ARM::FeaturePreferBranchAlign32,
+                                               ARM::FeaturePreferBranchAlign64,
+                                               ARM::FeaturePreferVMOVSR,
+                                               ARM::FeatureProfUnpredicate,
+                                               ARM::FeatureRAS,
+                                               ARM::FeatureRClass,
+                                               ARM::FeatureReserveR9,
+                                               ARM::FeatureSB,
+                                               ARM::FeatureSHA2,
+                                               ARM::FeatureSlowFPBrcc,
+                                               ARM::FeatureSlowLoadDSubreg,
+                                               ARM::FeatureSlowOddRegister,
+                                               ARM::FeatureSlowVDUP32,
+                                               ARM::FeatureSlowVGETLNi32,
+                                               ARM::FeatureSplatVFPToNeon,
+                                               ARM::FeatureStrictAlign,
+                                               ARM::FeatureThumb2,
+                                               ARM::FeatureTrustZone,
+                                               ARM::FeatureUseMIPipeliner,
+                                               ARM::FeatureUseMISched,
+                                               ARM::FeatureUseWideStrideVFP,
+                                               ARM::FeatureV7Clrex,
+                                               ARM::FeatureVFP2,
+                                               ARM::FeatureVFP2_SP,
+                                               ARM::FeatureVFP3,
+                                               ARM::FeatureVFP3_D16,
+                                               ARM::FeatureVFP3_D16_SP,
+                                               ARM::FeatureVFP3_SP,
+                                               ARM::FeatureVFP4,
+                                               ARM::FeatureVFP4_D16,
+                                               ARM::FeatureVFP4_D16_SP,
+                                               ARM::FeatureVFP4_SP,
+                                               ARM::FeatureVMLxForwarding,
+                                               ARM::FeatureVirtualization,
+                                               ARM::FeatureZCZeroing,
+                                               ARM::HasMVEFloatOps,
+                                               ARM::HasMVEIntegerOps,
+                                               ARM::HasV5TEOps,
+                                               ARM::HasV5TOps,
+                                               ARM::HasV6KOps,
+                                               ARM::HasV6MOps,
+                                               ARM::HasV6Ops,
+                                               ARM::HasV6T2Ops,
+                                               ARM::HasV7Ops,
+                                               ARM::HasV8MBaselineOps,
+                                               ARM::HasV8MMainlineOps,
+                                               ARM::HasV8Ops,
+                                               ARM::HasV8_1MMainlineOps,
+                                               ARM::HasV8_1aOps,
+                                               ARM::HasV8_2aOps,
+                                               ARM::HasV8_3aOps,
+                                               ARM::HasV8_4aOps,
+                                               ARM::HasV8_5aOps,
+                                               ARM::HasV8_6aOps,
+                                               ARM::HasV8_7aOps,
+                                               ARM::HasV8_8aOps,
+                                               ARM::HasV8_9aOps,
+                                               ARM::HasV9_0aOps,
+                                               ARM::HasV9_1aOps,
+                                               ARM::HasV9_2aOps,
+                                               ARM::HasV9_3aOps,
+                                               ARM::HasV9_4aOps,
+                                               ARM::HasV9_5aOps,
+                                               ARM::HasV9_6aOps,
+                                               ARM::HasV9_7aOps};
 
   const ARMSubtarget *getST() const { return ST; }
   const ARMTargetLowering *getTLI() const { return TLI; }
diff --git a/llvm/test/Transforms/Inline/ARM/inline-dotprod.ll b/llvm/test/Transforms/Inline/ARM/inline-dotprod.ll
new file mode 100644
index 0000000000000..2f8dbb7f01822
--- /dev/null
+++ b/llvm/test/Transforms/Inline/ARM/inline-dotprod.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s -mtriple=arm-unknown-linux-gnu -S -passes=inline | FileCheck %s
+; RUN: opt < %s -mtriple=arm-unknown-linux-gnu -S -passes='cgscc(inline)' | FileCheck %s
+
+declare i32 @foo(...) #0
+
+define i32 @callee() #0 {
+entry:
+  %call = call i32 (...) @foo()
+  ret i32 %call
+}
+
+define i32 @dotcallee() #1 {
+entry:
+  %call = call i32 (...) @foo()
+  ret i32 %call
+}
+
+define i32 @dotcaller() #1 {
+entry:
+  %call = call i32 @callee()
+  ret i32 %call
+; CHECK-LABEL: dotcaller
+; CHECK: call i32 (...) @foo()
+}
+
+define i32 @caller() #0 {
+entry:
+  %call = call i32 @dotcallee()
+  ret i32 %call
+; CHECK-LABEL: caller
+; CHECK: call i32 @dotcallee()
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+dsp,+neon" }
+attributes #1 = { "target-cpu"="generic" "target-features"="+dsp,+neon,+dotprod" }

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to