https://github.com/jthackray created https://github.com/llvm/llvm-project/pull/191301

`__builtin_arm_atomic_store_with_stshh` must satisfy two constraints:
  - preserve release/seq_cst ordering in LLVM IR
  - keep `stshh` immediately adjacent to the final store in codegen

The original target-intrinsic lowering preserved the final `stshh` + store
sequence, but it did not model ordering strongly enough in LLVM IR, so the
optimizer could sink earlier stores across the builtin.
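
As a minimal illustration, consider the pattern from the new
pcdphint-atomic-store-order.c test added below (the function and variable
names here are illustrative, not from the patch): the plain store must
become visible before the release store published by the builtin, yet
nothing in the old IR enforced that.

```c
#include <arm_acle.h>

// Sketch of the hazard: before this fix, at -O2 nothing in the IR ordered
// the plain store against the builtin, so the optimizer could sink the
// store to *data below the release store to *flag.
void release_publish(int *__restrict flag, int *__restrict data, int v) {
  *data = v; // must become visible before the flag store
  __arm_atomic_store_with_stshh(flag, v, __ATOMIC_RELEASE, 0);
}
```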

Fix this by inserting a `release` or `seq_cst` fence before the intrinsic
call. This preserves ordering in optimized IR while still letting the
backend emit the required final instruction sequence; as a result, a
`dmb ish` instruction is now emitted before the `stshh` instruction.
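
Concretely, for a release store the sequence now looks like this (the
comments restate the CHECK lines from the new tests; the wrapper function
itself is illustrative):

```c
#include <arm_acle.h>

void store_release(int *p, int v) {
  // Optimized IR after this patch:
  //   fence release
  //   call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %ext, i32 3, i32 0, i32 32)
  //
  // AArch64 assembly:
  //   dmb ish
  //   stshh keep
  //   stlr w8, [x0]
  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELEASE, 0);
}
```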

Also relax Sema for the builtin to accept storing 8/16/32/64-bit
floating-point and pointer values in addition to integers, and update
the diagnostic text accordingly.
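
For example (a sketch mirroring the updated Sema tests; function names are
illustrative, and the second function is deliberately ill-formed):

```c
#include <arm_acle.h>

void now_accepted(float *fp, float fv, void **pp, void *pv) {
  __arm_atomic_store_with_stshh(fp, fv, __ATOMIC_RELAXED, 0); // 32-bit float: OK
  __arm_atomic_store_with_stshh(pp, pv, __ATOMIC_RELAXED, 0); // 64-bit pointer: OK
}

void still_rejected(__int128 *p, __int128 v) {
  // error: address argument ... must be a pointer to an 8,16,32, or 64-bit
  // integer, floating-point, or pointer type
  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
}
```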

Add coverage for:
  - integer, floating-point, and pointer codegen cases
  - Sema acceptance/rejection cases
  - optimized-IR ordering regression coverage
  - AArch64 assembly checks for the final release-store sequence

From 47107a7d2befa33d97d70c8379def3555d3af1b9 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <[email protected]>
Date: Thu, 9 Apr 2026 20:29:29 +0100
Subject: [PATCH] [AArch64][clang] Fix `__arm_atomic_store_with_stshh` ordering
 and lowering

`__builtin_arm_atomic_store_with_stshh` must satisfy two constraints:
  - preserve release/seq_cst ordering in LLVM IR
  - keep `stshh` immediately adjacent to the final store in codegen

The original target-intrinsic lowering preserved the final `stshh` + store
sequence, but it did not model ordering strongly enough in LLVM IR, so the
optimizer could sink earlier stores across the builtin.

Fix this by inserting a `release` or `seq_cst` fence before the intrinsic
call. This preserves ordering in optimized IR while still letting the
backend emit the required final instruction sequence; as a result, a
`dmb ish` instruction is now emitted before the `stshh` instruction.

Also relax Sema for the builtin to accept storing 8/16/32/64-bit
floating-point and pointer values in addition to integers, and update
the diagnostic text accordingly.

Add coverage for:
  - integer, floating-point, and pointer codegen cases
  - Sema acceptance/rejection cases
  - optimized-IR ordering regression coverage
  - AArch64 assembly checks for the final release-store sequence
---
 .../clang/Basic/DiagnosticSemaKinds.td        |  2 +-
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      | 33 +++++++++++++---
 clang/lib/Sema/SemaARM.cpp                    |  5 ++-
 .../AArch64/pcdphint-atomic-store-order.c     | 37 ++++++++++++++++++
 .../CodeGen/AArch64/pcdphint-atomic-store.c   | 38 +++++++++++++++++++
 .../test/Sema/AArch64/pcdphint-atomic-store.c | 18 ++++++---
 .../CodeGen/AArch64/pcdphint-atomic-store.ll  | 13 +++++++
 7 files changed, 134 insertions(+), 12 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/pcdphint-atomic-store-order.c

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 6d2fae551566f..b3f54df177fd9 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9600,7 +9600,7 @@ def err_atomic_builtin_pointer_size : Error<
   "type (%0 invalid)">;
 def err_arm_atomic_store_with_stshh_bad_type : Error<
   "address argument to '__arm_atomic_store_with_stshh' must be a pointer to an 
"
-  "8,16,32, or 64-bit integer type (%0 invalid)">;
+  "8,16,32, or 64-bit integer, floating-point, or pointer type (%0 invalid)">;
 def err_arm_atomic_store_with_stshh_bad_value_type : Error<
   "value argument to '__arm_atomic_store_with_stshh' must be %0; got %1">;
 def err_arm_atomic_store_with_stshh_bad_order : Error<
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 3e0ec2c143428..60510b2b5d2a2 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -4666,14 +4666,36 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     auto *OrderC = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
     auto *PolicyC = cast<ConstantInt>(EmitScalarExpr(E->getArg(3)));
 
-    // Compute pointee bit-width from arg0 and create as i32 constant
     QualType ValQT =
         E->getArg(0)->getType()->castAs<PointerType>()->getPointeeType();
     unsigned SizeBits = getContext().getTypeSize(ValQT);
-    auto *SizeC = llvm::ConstantInt::get(Int32Ty, SizeBits);
+    auto *StoreTy = Builder.getIntNTy(SizeBits);
+    Value *StoreBits;
+    if (ValQT->isIntegerType()) {
+      StoreBits = Builder.CreateIntCast(StoreValue, StoreTy,
+                                        ValQT->isSignedIntegerType());
+    } else if (ValQT->isPointerType()) {
+      StoreBits = Builder.CreatePtrToInt(StoreValue, StoreTy);
+    } else {
+      assert(ValQT->isRealFloatingType() &&
+             "unexpected type for stshh atomic store");
+      StoreBits = Builder.CreateBitCast(StoreValue, StoreTy);
+    }
 
-    Value *StoreValue64 = Builder.CreateIntCast(StoreValue, Int64Ty,
-                                                ValQT->isSignedIntegerType());
+    Value *StoreValue64 = Builder.CreateZExt(StoreBits, Int64Ty);
+
+    switch (OrderC->getZExtValue()) {
+    case 0:
+      break;
+    case 3:
+      Builder.CreateFence(llvm::AtomicOrdering::Release);
+      break;
+    case 5:
+      Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent);
+      break;
+    default:
+      llvm_unreachable("unexpected memory order for stshh atomic store");
+    }
 
     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_stshh_atomic_store,
                                    {StoreAddr->getType()});
@@ -4683,7 +4705,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     return Builder.CreateCall(
         F, {StoreAddr, StoreValue64,
             ConstantInt::get(Int32Ty, OrderC->getZExtValue()),
-            ConstantInt::get(Int32Ty, PolicyC->getZExtValue()), SizeC});
+            ConstantInt::get(Int32Ty, PolicyC->getZExtValue()),
+            ConstantInt::get(Int32Ty, SizeBits)});
   }
 
   if (BuiltinID == clang::AArch64::BI__builtin_arm_rndr ||
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index e54c8228e5ff8..4ca2b0603317d 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -1154,7 +1154,10 @@ static bool CheckAArch64AtomicStoreWithStshhCall(SemaARM &S,
   }
 
   ValType = ValType.getUnqualifiedType();
-  unsigned Bits = ValType->isIntegerType() ? Context.getTypeSize(ValType) : 0;
+  bool IsSupportedType = ValType->isIntegerType() ||
+                         ValType->isRealFloatingType() ||
+                         ValType->isPointerType();
+  unsigned Bits = IsSupportedType ? Context.getTypeSize(ValType) : 0;
   if (Bits != 8 && Bits != 16 && Bits != 32 && Bits != 64) {
     SemaRef.Diag(PointerArg->getBeginLoc(),
                  diag::err_arm_atomic_store_with_stshh_bad_type)
diff --git a/clang/test/CodeGen/AArch64/pcdphint-atomic-store-order.c b/clang/test/CodeGen/AArch64/pcdphint-atomic-store-order.c
new file mode 100644
index 0000000000000..fc716d34c1787
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/pcdphint-atomic-store-order.c
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +v9.6a -O2 -emit-llvm -o - %s | FileCheck %s --check-prefix=IR
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +v9.6a -O2 -S -o - %s | FileCheck %s --check-prefix=ASM
+
+#include <arm_acle.h>
+
+// IR-LABEL: define dso_local void @test_release_store_ordering(
+// IR:       if.then:
+// IR-NEXT:    store i32 %v, ptr %b, align 4
+// IR-NEXT:    %0 = zext i32 %v to i64
+// IR-NEXT:    fence release
+// IR-NEXT:    tail call void @llvm.aarch64.stshh.atomic.store.p0(ptr %a, i64 %0, i32 3, i32 0, i32 32)
+// IR-NEXT:    br label %if.end
+//
+// ASM-LABEL: test_release_store_ordering:
+// ASM:       str w2, [x1]
+// ASM-NEXT:  mov w8, w2
+// ASM-NEXT:  dmb ish
+// ASM-NEXT:  stshh keep
+// ASM-NEXT:  stlr w8, [x0]
+void test_release_store_ordering(int *__restrict a, int *__restrict b, int v,
+                                 int c) {
+  if (c) {
+    *b = v;
+    __arm_atomic_store_with_stshh(a, v, __ATOMIC_RELEASE, 0);
+  } else {
+    *b = v + 1;
+  }
+}
+
+// ASM-LABEL: demo_f32:
+// ASM:       fmov w8, s0
+// ASM-NEXT:  dmb ish
+// ASM-NEXT:  stshh keep
+// ASM-NEXT:  stlr w8, [x0]
+void demo_f32(float *p, float v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELEASE, 0);
+}
diff --git a/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c b/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c
index f48f1d6344bc5..0ee0f67a51aa8 100644
--- a/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c
+++ b/clang/test/CodeGen/AArch64/pcdphint-atomic-store.c
@@ -30,6 +30,7 @@ void test_u8(unsigned char *p, unsigned char v) {
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[V_ADDR]], align 2
 // CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[TMP1]] to i64
+// CHECK-NEXT:    fence release
 // CHECK-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0(ptr [[TMP0]], i64 [[TMP2]], i32 3, i32 1, i32 16)
 // CHECK-NEXT:    ret void
 //
@@ -47,6 +48,7 @@ void test_u16(unsigned short *p, unsigned short v) {
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[V_ADDR]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-NEXT:    fence seq_cst
 // CHECK-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0(ptr [[TMP0]], i64 [[TMP2]], i32 5, i32 0, i32 32)
 // CHECK-NEXT:    ret void
 //
@@ -69,3 +71,39 @@ void test_u32(unsigned int *p, unsigned int v) {
 void test_u64(unsigned long *p, unsigned long v) {
   __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 1);
 }
+
+// CHECK-LABEL: define dso_local void @test_f32(
+// CHECK-SAME: ptr noundef [[P:%.*]], float noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca float, align 4
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store float [[V]], ptr [[V_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[V_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[TMP1]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+// CHECK-NEXT:    fence release
+// CHECK-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0(ptr [[TMP0]], i64 [[TMP3]], i32 3, i32 0, i32 32)
+// CHECK-NEXT:    ret void
+//
+void test_f32(float *p, float v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELEASE, 0);
+}
+
+// CHECK-LABEL: define dso_local void @test_ptr(
+// CHECK-SAME: ptr noundef [[P:%.*]], ptr noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
+// CHECK-NEXT:    call void @llvm.aarch64.stshh.atomic.store.p0(ptr [[TMP0]], i64 [[TMP2]], i32 0, i32 1, i32 64)
+// CHECK-NEXT:    ret void
+//
+void test_ptr(void **p, void *v) {
+  __arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 1);
+}
diff --git a/clang/test/Sema/AArch64/pcdphint-atomic-store.c b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
index 5b4bf27003a5a..c6ac5b804e9b1 100644
--- a/clang/test/Sema/AArch64/pcdphint-atomic-store.c
+++ b/clang/test/Sema/AArch64/pcdphint-atomic-store.c
@@ -17,20 +17,28 @@ void test_const_pointer(const unsigned int *p, unsigned int v) {
   // expected-error@-1 {{address argument to atomic builtin cannot be const-qualified}}
 }
 
-void test_non_integer_pointer(float *p, float v) {
+void test_float_ok(float *p, float v) {
   __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
-  // expected-error@-1 {{address argument to '__arm_atomic_store_with_stshh' must be a pointer to an 8,16,32, or 64-bit integer type}}
 }
 
-void test_invalid_bit_width(__int128 *p, __int128 v) {
+void test_pointer_ok(void **p, void *v) {
   __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
-  // expected-error@-1 {{address argument to '__arm_atomic_store_with_stshh' must be a pointer to an 8,16,32, or 64-bit integer type}}
 }
 
 struct IncompleteType;
+void test_invalid_struct_pointer(struct IncompleteType *p, int v) {
+  __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+  // expected-error@-1 {{address argument to '__arm_atomic_store_with_stshh' must be a pointer to an 8,16,32, or 64-bit integer, floating-point, or pointer type}}
+}
+
+void test_invalid_bit_width(__int128 *p, __int128 v) {
+  __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
+  // expected-error@-1 {{address argument to '__arm_atomic_store_with_stshh' must be a pointer to an 8,16,32, or 64-bit integer, floating-point, or pointer type}}
+}
+
 void test_incomplete_pointee(struct IncompleteType *p, int v) {
   __builtin_arm_atomic_store_with_stshh(p, v, __ATOMIC_RELAXED, 0);
-  // expected-error@-1 {{address argument to '__arm_atomic_store_with_stshh' must be a pointer to an 8,16,32, or 64-bit integer type}}
+  // expected-error@-1 {{address argument to '__arm_atomic_store_with_stshh' must be a pointer to an 8,16,32, or 64-bit integer, floating-point, or pointer type}}
 }
 
 void test_invalid_memory_order(unsigned int *p, unsigned int v) {
diff --git a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
index 6e48cb348ca05..e7ec4325f8cac 100644
--- a/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
+++ b/llvm/test/CodeGen/AArch64/pcdphint-atomic-store.ll
@@ -72,6 +72,19 @@ define void @test_keep_release_i32(ptr %p, i64 %v) {
   ret void
 }
 
+define void @test_keep_release_f32(ptr %p, float %v) {
+; CHECK-LABEL: test_keep_release_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    stshh keep
+; CHECK-NEXT:    stlr w8, [x0]
+; CHECK-NEXT:    ret
+  %bits = bitcast float %v to i32
+  %ext = zext i32 %bits to i64
+  call void @llvm.aarch64.stshh.atomic.store.p0(ptr %p, i64 %ext, i32 3, i32 0, i32 32)
+  ret void
+}
+
 define void @test_keep_release_i64(ptr %p, i64 %v) {
 ; CHECK-LABEL: test_keep_release_i64:
 ; CHECK:       // %bb.0:
