https://github.com/zatrazz updated 
https://github.com/llvm/llvm-project/pull/194632

>From 231370b90ebc31baa7c3f2226e2a239ea013c82b Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <[email protected]>
Date: Thu, 9 Apr 2026 16:24:34 -0300
Subject: [PATCH 1/8] [AArch64] Add lowering for the
 _CountTrailingZeros/_CountTrailingZeros64 MS intrinsics

Lower to llvm.cttz with is_zero_undef=false, following the same pattern as
_CountLeadingZeros.  _CountTrailingZeros64 truncates the i64 cttz result to
i32, which is lossless since the count is at most 64 (returned for a zero input).

Documented at:
https://learn.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics?view=msvc-180
---
 clang/include/clang/Basic/BuiltinsAArch64.td  |  2 ++
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  |  4 +++-
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      | 13 +++++++++++
 clang/lib/Headers/intrin.h                    |  2 ++
 .../test/CodeGen/arm64-microsoft-intrinsics.c | 23 +++++++++++++++++++
 5 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/Basic/BuiltinsAArch64.td 
b/clang/include/clang/Basic/BuiltinsAArch64.td
index ba30e344911aa..6607fb1926065 100644
--- a/clang/include/clang/Basic/BuiltinsAArch64.td
+++ b/clang/include/clang/Basic/BuiltinsAArch64.td
@@ -394,6 +394,8 @@ let Attributes = [NoThrow, RequireDeclaration], Languages = 
"ALL_MS_LANGUAGES",
        def _CountLeadingZeros64 : AArch64NoPrefixTargetLibBuiltin<"unsigned 
int (unsigned long long int)">;
        def _CountOneBits        : AArch64NoPrefixTargetLibBuiltin<"unsigned 
int (msuint32_t)">;
        def _CountOneBits64      : AArch64NoPrefixTargetLibBuiltin<"unsigned 
int (unsigned long long int)">;
+       def _CountTrailingZeros  : AArch64NoPrefixTargetLibBuiltin<"unsigned 
int (msuint32_t)">;
+       def _CountTrailingZeros64: AArch64NoPrefixTargetLibBuiltin<"unsigned 
int (unsigned long long int)">;
 }
 
 let Attributes = [NoThrow, RequireDeclaration], Languages = 
"ALL_MS_LANGUAGES", Header = "intrin.h" in {
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 562f66d4cca16..05ae340448033 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2140,7 +2140,9 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
   }
 
   if (builtinID == AArch64::BI_CountOneBits ||
-      builtinID == AArch64::BI_CountOneBits64) {
+      builtinID == AArch64::BI_CountOneBits64 ||
+      builtinID == AArch64::BI_CountTrailingZeros ||
+      builtinID == AArch64::BI_CountTrailingZeros64) {
     cgm.errorNYI(expr->getSourceRange(),
                  std::string("unimplemented AArch64 builtin call: ") +
                      getContext().BuiltinInfo.getName(builtinID));
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp 
b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index f8990ced2a577..eba4f4539a0f1 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -5235,6 +5235,19 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     return Result;
   }
 
+  if (BuiltinID == AArch64::BI_CountTrailingZeros ||
+      BuiltinID == AArch64::BI_CountTrailingZeros64) {
+    Value *ArgValue = EmitScalarExpr(E->getArg(0));
+    llvm::Type *ArgType = ArgValue->getType();
+    Function *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
+
+    // MSVC leaves 0 undefined; use false for predictable codegen
+    Value *Result = Builder.CreateCall(F, {ArgValue, Builder.getInt1(false)});
+    if (BuiltinID == AArch64::BI_CountTrailingZeros64)
+      Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
+    return Result;
+  }
+
   if (BuiltinID == AArch64::BI__prefetch) {
     Value *Address = EmitScalarExpr(E->getArg(0));
     Value *RW = llvm::ConstantInt::get(Int32Ty, 0);
diff --git a/clang/lib/Headers/intrin.h b/clang/lib/Headers/intrin.h
index e5d08a217e05e..b90e340d2d766 100644
--- a/clang/lib/Headers/intrin.h
+++ b/clang/lib/Headers/intrin.h
@@ -441,6 +441,8 @@ unsigned int _CountLeadingSigns(long);
 unsigned int _CountLeadingSigns64(__int64);
 unsigned int _CountOneBits(unsigned long);
 unsigned int _CountOneBits64(unsigned __int64);
+unsigned int _CountTrailingZeros(unsigned long);
+unsigned int _CountTrailingZeros64(unsigned __int64);
 
 unsigned int __hlt(unsigned int, ...);
 
diff --git a/clang/test/CodeGen/arm64-microsoft-intrinsics.c 
b/clang/test/CodeGen/arm64-microsoft-intrinsics.c
index 2f5ab50d6c848..4e0cabc5e11cd 100644
--- a/clang/test/CodeGen/arm64-microsoft-intrinsics.c
+++ b/clang/test/CodeGen/arm64-microsoft-intrinsics.c
@@ -584,6 +584,29 @@ unsigned int check_CountOneBits64(unsigned __int64 arg1) {
 // CHECK-MSCOMPAT: ret i32 %[[VAR2]]
 // CHECK-LINUX: error: call to undeclared function '_CountOneBits64'
 
+unsigned int check_CountTrailingZeros(unsigned LONG arg1) {
+  return _CountTrailingZeros(arg1);
+}
+
+// CHECK-MSCOMPAT: %[[ARG1:.*]].addr = alloca i32, align 4
+// CHECK-MSCOMPAT: store i32 %[[ARG1]], ptr %[[ARG1]].addr, align 4
+// CHECK-MSCOMPAT: %[[VAR0:.*]] = load i32, ptr %[[ARG1]].addr, align 4
+// CHECK-MSCOMPAT: %[[VAR1:.*]] = call i32 @llvm.cttz.i32(i32 %[[VAR0]], i1 
false)
+// CHECK-MSCOMPAT: ret i32 %[[VAR1]]
+// CHECK-LINUX: error: call to undeclared function '_CountTrailingZeros'
+
+unsigned int check_CountTrailingZeros64(unsigned __int64 arg1) {
+  return _CountTrailingZeros64(arg1);
+}
+
+// CHECK-MSCOMPAT: %[[ARG1:.*]].addr = alloca i64, align 8
+// CHECK-MSCOMPAT: store i64 %[[ARG1]], ptr %[[ARG1]].addr, align 8
+// CHECK-MSCOMPAT: %[[VAR0:.*]] = load i64, ptr %[[ARG1]].addr, align 8
+// CHECK-MSCOMPAT: %[[VAR1:.*]] = call i64 @llvm.cttz.i64(i64 %[[VAR0]], i1 
false)
+// CHECK-MSCOMPAT: %[[VAR2:.*]] = trunc i64 %[[VAR1]] to i32
+// CHECK-MSCOMPAT: ret i32 %[[VAR2]]
+// CHECK-LINUX: error: call to undeclared function '_CountTrailingZeros64'
+
 void check__prefetch(void *arg1) {
   return __prefetch(arg1);
 }

>From 8c106cb45ac3094d82f4784df58fcbc1236ed4f9 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <[email protected]>
Date: Sat, 18 Apr 2026 08:37:49 -0300
Subject: [PATCH 2/8] [AArch64] Add ARM64_SYSREG, ARM64_FPCR, and ARM64_FPSR to
 arm64intr.h

These macros let Windows AArch64 programs construct system-register
encodings for use with the existing _ReadStatusReg/_WriteStatusReg
intrinsics.  ARM64_FPCR and ARM64_FPSR are the most commonly used
predefined constants.  The ARM64_SYSREG macro only supports op0=2 or 3
(the op0 MSB is implicit in the encoding).
---
 clang/lib/Headers/arm64intr.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/clang/lib/Headers/arm64intr.h b/clang/lib/Headers/arm64intr.h
index 4943b2db69d02..53a3d57a6e9d1 100644
--- a/clang/lib/Headers/arm64intr.h
+++ b/clang/lib/Headers/arm64intr.h
@@ -15,6 +15,16 @@
 #ifndef __ARM64INTR_H
 #define __ARM64INTR_H
 
+/* Encode an AArch64 system register for use with
+   _ReadStatusReg/_WriteStatusReg. op0 must be 2 or 3; only the low bit is
+   stored. */
+#define ARM64_SYSREG(op0, op1, CRn, CRm, op2)                                  
\
+  ((((op0) & 0x1) << 14) | (((op1) & 0x7) << 11) | (((CRn) & 0xF) << 7) |      
\
+   (((CRm) & 0xF) << 3) | ((op2) & 0x7))
+
+#define ARM64_FPCR ARM64_SYSREG(3, 3, 4, 4, 0)
+#define ARM64_FPSR ARM64_SYSREG(3, 3, 4, 4, 1)
+
 typedef enum
 {
   _ARM64_BARRIER_SY    = 0xF,

>From 0fe063dd125e2d460ddc935b901ed9c3ce80b051 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <[email protected]>
Date: Thu, 30 Apr 2026 16:52:08 -0300
Subject: [PATCH 3/8] [AArch64] Use llvm.read_volatile_register for the
 __getReg MS intrinsic

llvm.read_register carries IntrReadMem, which allows the compiler to
CSE repeated reads or eliminate the result if it goes unused.
llvm.read_volatile_register carries IntrHasSideEffects, which prevents
both.  The __getReg intrinsic targets hardware registers (e.g. x18, the platform
thread-pointer on Windows AArch64) whose values can change between calls
or that must be observed even if the result is discarded.
---
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp        | 2 +-
 clang/test/CodeGen/arm64-microsoft-intrinsics.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp 
b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index eba4f4539a0f1..c88313f7b53e7 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -4824,7 +4824,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
 
     llvm::Function *F =
-        CGM.getIntrinsic(Intrinsic::read_register, {Int64Ty});
+        CGM.getIntrinsic(Intrinsic::read_volatile_register, {Int64Ty});
     return Builder.CreateCall(F, Metadata);
   }
 
diff --git a/clang/test/CodeGen/arm64-microsoft-intrinsics.c 
b/clang/test/CodeGen/arm64-microsoft-intrinsics.c
index 4e0cabc5e11cd..79c5a8c823224 100644
--- a/clang/test/CodeGen/arm64-microsoft-intrinsics.c
+++ b/clang/test/CodeGen/arm64-microsoft-intrinsics.c
@@ -154,8 +154,8 @@ unsigned __int64 check__getReg(void) {
   return reg;
 }
 
-// CHECK-MSCOMPAT: call i64 @llvm.read_register.i64(metadata ![[MD2:.*]])
-// CHECK-MSCOMPAT: call i64 @llvm.read_register.i64(metadata ![[MD3:.*]])
+// CHECK-MSCOMPAT: call i64 @llvm.read_volatile_register.i64(metadata 
![[MD2:.*]])
+// CHECK-MSCOMPAT: call i64 @llvm.read_volatile_register.i64(metadata 
![[MD3:.*]])
 
 #ifdef __LP64__
 #define LONG __int32

>From b4a84ecd9e3e06edf7ad7969e0b748dd0146708a Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <[email protected]>
Date: Thu, 30 Apr 2026 20:22:59 -0300
Subject: [PATCH 4/8] [IR] Add llvm.write_volatile_register intrinsic

Add llvm.write_volatile_register as the write-side counterpart to
llvm.read_volatile_register.  The default MemoryEffects::unknown()
prevents the optimizer from reordering or eliminating the call.

Unlike llvm.write_register, the SelectionDAG lowering additionally emits
a FAKE_USE of the target physical register directly (via RegisterSDNode)
after the WRITE_REGISTER node.  This marks the register live and prevents
the backend from dead-eliminating the write.  It is preferred over a
READ_REGISTER node, which would emit extra register copies (e.g. fmov xN,
dN for FP/SIMD registers).

The primary use case is lowering MS-compat __setReg and __setRegFp
intrinsics on AArch64, where the target register may be in a different
register class (integer vs. FP/SIMD) from the source value.
---
 llvm/docs/LangRef.rst                         | 47 +++++++----
 llvm/include/llvm/IR/Intrinsics.td            |  2 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 29 +++++++
 llvm/lib/IR/Verifier.cpp                      |  8 ++
 .../AArch64/write-volatile-register.ll        | 77 +++++++++++++++++++
 5 files changed, 147 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/write-volatile-register.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index ee6dd32e5e852..199c7c6cd6347 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15248,9 +15248,10 @@ called).
 .. _int_read_register:
 .. _int_read_volatile_register:
 .. _int_write_register:
+.. _int_write_volatile_register:
 
-'``llvm.read_register``', '``llvm.read_volatile_register``', and 
'``llvm.write_register``' Intrinsics
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.read_register``', '``llvm.read_volatile_register``', 
'``llvm.write_register``', and '``llvm.write_volatile_register``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
@@ -15263,26 +15264,30 @@ Syntax:
       declare i64 @llvm.read_volatile_register.i64(metadata)
       declare void @llvm.write_register.i32(metadata, i32 @value)
       declare void @llvm.write_register.i64(metadata, i64 @value)
+      declare void @llvm.write_volatile_register.i32(metadata, i32 @value)
+      declare void @llvm.write_volatile_register.i64(metadata, i64 @value)
       !0 = !{!"sp\00"}
 
 Overview:
 """""""""
 
-The '``llvm.read_register``', '``llvm.read_volatile_register``', and
-'``llvm.write_register``' intrinsics provide access to the named register.
-The register must be valid on the architecture being compiled to. The type
-needs to be compatible with the register being read.
+The '``llvm.read_register``', '``llvm.read_volatile_register``',
+'``llvm.write_register``', and '``llvm.write_volatile_register``' intrinsics
+provide access to the named register. The register must be valid on the
+architecture being compiled to. The type needs to be compatible with the
+register being accessed.
 
 Semantics:
 """"""""""
 
 The '``llvm.read_register``' and '``llvm.read_volatile_register``' intrinsics
 return the current value of the register, where possible. The
-'``llvm.write_register``' intrinsic sets the current value of the register,
-where possible.
+'``llvm.write_register``' and '``llvm.write_volatile_register``' intrinsics
+set the current value of the register, where possible.
 
-A call to '``llvm.read_volatile_register``' is assumed to have side-effects
-and possibly return a different value each time (e.g., for a timer register).
+A call to '``llvm.read_volatile_register``' or
+'``llvm.write_volatile_register``' is assumed to have side-effects and will
+not be reordered or eliminated by the optimizer.
 
 This is useful to implement named register global variables that need
 to always be mapped to a specific register, as is common practice on
@@ -15290,12 +15295,22 @@ bare-metal programs including OS kernels.
 
 The compiler doesn't check for register availability or use of the used
 register in surrounding code, including inline assembly. Because of that,
-allocatable registers are not supported.
-
-Warning: So far it only works with the stack pointer on selected
-architectures (ARM, AArch64, PowerPC and x86_64). Significant amount of
-work is needed to support other registers and even more so, allocatable
-registers.
+allocatable registers are not supported by '``llvm.read_register``',
+'``llvm.read_volatile_register``', or '``llvm.write_register``'.
+
+'``llvm.write_volatile_register``' supports allocatable registers. Writing
+to an allocatable register means the value is copied into that physical
+register at the point of the call; the register may subsequently be
+reused by the register allocator for other purposes. The backend emits a
+``FAKE_USE`` of the physical register after the write to prevent the register
+write from being dead-eliminated before register allocation.
+
+Warning: Register support is target-specific. The IR-level verifier does
+not validate register names; an unsupported name results in a fatal error
+during code generation. Supported registers vary by target and can be
+found in each target's ``getRegisterByName`` implementation.
+'``llvm.write_volatile_register``' support for allocatable registers is
+currently only implemented on AArch64.
 
 .. _int_stacksave:
 
diff --git a/llvm/include/llvm/IR/Intrinsics.td 
b/llvm/include/llvm/IR/Intrinsics.td
index 993ddd7e33701..3e26fe406cebf 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -913,6 +913,8 @@ def int_read_register  : 
DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_metadata_
                                    [IntrReadMem], "llvm.read_register">;
 def int_write_register : Intrinsic<[], [llvm_metadata_ty, llvm_anyint_ty],
                                    [IntrNoCallback], "llvm.write_register">;
+def int_write_volatile_register : Intrinsic<[], [llvm_metadata_ty, 
llvm_anyint_ty],
+                                            [], 
"llvm.write_volatile_register">;
 def int_read_volatile_register  : Intrinsic<[llvm_anyint_ty], 
[llvm_metadata_ty],
                                             [IntrHasSideEffects],
                                              "llvm.read_volatile_register">;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp 
b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 579bff7d3ab60..f1e8d80175da5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6692,6 +6692,35 @@ void SelectionDAGBuilder::visitIntrinsicCall(const 
CallInst &I,
                             RegName, getValue(RegValue)));
     return;
   }
+  case Intrinsic::write_volatile_register: {
+    Value *Reg = I.getArgOperand(0);
+    Value *RegValue = I.getArgOperand(1);
+    SDValue Chain = getRoot();
+    const MDNode *MD = cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata());
+    SDValue RegName = DAG.getMDNode(MD);
+    EVT VT = TLI.getValueType(DAG.getDataLayout(), RegValue->getType());
+    SDValue WriteChain = DAG.getNode(ISD::WRITE_REGISTER, sdl, MVT::Other,
+                                     Chain, RegName, getValue(RegValue));
+    // FAKE_USE of the physical register marks it live after the 
WRITE_REGISTER,
+    // preventing the backend from dead-eliminating the write.  This is
+    // preferred over READ_REGISTER, which would emit extra register copies
+    // (e.g. fmov xN, dN for FP/SIMD registers).
+    const MDString *RegStr = cast<MDString>(MD->getOperand(0));
+    LLT Ty = VT.isSimple() ? getLLTForMVT(VT.getSimpleVT()) : LLT();
+    const MachineFunction &MF = DAG.getMachineFunction();
+    Register PhysReg =
+        TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
+    if (PhysReg.isValid()) {
+      const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
+      MVT RegVT = *TRI->legalclasstypes_begin(*RC);
+      DAG.setRoot(DAG.getNode(ISD::FAKE_USE, sdl, MVT::Other,
+                              {WriteChain, DAG.getRegister(PhysReg, RegVT)}));
+    } else {
+      DAG.setRoot(WriteChain);
+    }
+    return;
+  }
   case Intrinsic::memcpy:
   case Intrinsic::memcpy_inline: {
     const auto &MCI = cast<MemCpyInst>(I);
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 9c6c5f245ff0b..79bd3dc984b9d 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -7360,6 +7360,14 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, 
CallBase &Call) {
           "llvm.sponentry must return a pointer to the stack", &Call);
     break;
   }
+  case Intrinsic::write_volatile_register: {
+    auto *MD = cast<MDNode>(
+        cast<MetadataAsValue>(Call.getArgOperand(0))->getMetadata());
+    Check(MD->getNumOperands() == 1 && isa<MDString>(MD->getOperand(0)),
+          "llvm.write_volatile_register metadata must be a single MDString",
+          &Call);
+    break;
+  }
   };
 
   // Verify that there aren't any unmediated control transfers between 
funclets.
diff --git a/llvm/test/CodeGen/AArch64/write-volatile-register.ll 
b/llvm/test/CodeGen/AArch64/write-volatile-register.ll
new file mode 100644
index 0000000000000..ad8292905583a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/write-volatile-register.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64 -fast-isel=0 -global-isel=false < %s | FileCheck %s
+
+; Tests for llvm.write_volatile_register on AArch64.
+;
+; Like llvm.write_register, the volatile variant has unknown memory effects;
+; unlike it, it also emits a FAKE_USE of the target physical register after the 
WRITE_REGISTER.
+; The FAKE_USE prevents the CopyToReg from being dead-eliminated while avoiding
+; an extra cross-domain read (fmov xN, dM) that would arise if we kept the
+; register live via READ_REGISTER + CopyFromReg instead.
+;
+; For Windows-specific tests with reserved GP register x18 see
+; clang/test/CodeGen/arm64-microsoft-intrinsics.c.
+
+; -- Stack pointer -----------------------------------------------------------
+; sp is the canonical GP test: it is always accessible regardless of ABI.
+
+define void @write_volatile_sp(i64 %val) {
+; CHECK-LABEL: write_volatile_sp:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov sp, x0
+; CHECK-NEXT:    // fake_use: $sp
+; CHECK-NEXT:    ret
+  call void @llvm.write_volatile_register.i64(metadata !0, i64 %val)
+  ret void
+}
+
+; -- FP/SIMD d-registers: integer bit-pattern -> FP register -----------------
+;
+; The caller passes an i64 bit-pattern (e.g. obtained from 
read_volatile_register).
+; Writing it into a SIMD d-register requires a cross-domain integer->FP move
+; (fmov dN, xM).  The FAKE_USE ensures the CopyToReg is not dead-eliminated
+; even though dN has no further GP-domain uses in this function.
+
+define void @write_volatile_d5(i64 %bits) {
+; CHECK-LABEL: write_volatile_d5:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d5, x0
+; CHECK-NEXT:    // fake_use: $d5
+; CHECK-NEXT:    ret
+  call void @llvm.write_volatile_register.i64(metadata !1, i64 %bits)
+  ret void
+}
+
+define void @write_volatile_d31(i64 %bits) {
+; CHECK-LABEL: write_volatile_d31:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d31, x0
+; CHECK-NEXT:    // fake_use: $d31
+; CHECK-NEXT:    ret
+  call void @llvm.write_volatile_register.i64(metadata !2, i64 %bits)
+  ret void
+}
+
+; -- Back-to-back writes must both survive ------------------------------------
+;
+; Each call has unknown memory effects, so neither may be dropped in favor of
+; the other.  Verify that both fmov + fake_use pairs appear in order.
+
+define void @write_volatile_d5_twice(i64 %a, i64 %b) {
+; CHECK-LABEL: write_volatile_d5_twice:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d5, x0
+; CHECK-NEXT:    // fake_use: $d5
+; CHECK-NEXT:    fmov d5, x1
+; CHECK-NEXT:    // fake_use: $d5
+; CHECK-NEXT:    ret
+  call void @llvm.write_volatile_register.i64(metadata !1, i64 %a)
+  call void @llvm.write_volatile_register.i64(metadata !1, i64 %b)
+  ret void
+}
+
+declare void @llvm.write_volatile_register.i64(metadata, i64)
+
+!0 = !{!"sp"}
+!1 = !{!"d5"}
+!2 = !{!"d31"}

>From 03ea3dcff7b5598ac708989b1815029eaac39094 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <[email protected]>
Date: Sat, 18 Apr 2026 09:13:27 -0300
Subject: [PATCH 5/8] [AArch64] Add the __setReg MS intrinsic for writing GP
 registers by index
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

__setReg(n, v) writes a 64-bit value into the GP register xN (x0–x28 or
sp for n=31).  The lowering reuses the __getReg register-name computation
and emits llvm.write_volatile_register to prevent the store from being
dead-eliminated by the backend.

Documented at:
https://learn.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics?view=msvc-180
---
 clang/include/clang/Basic/BuiltinsAArch64.td    |  1 +
 clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  |  3 ++-
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp        | 17 +++++++++++++----
 clang/lib/Headers/intrin.h                      |  1 +
 clang/lib/Sema/SemaARM.cpp                      |  2 +-
 clang/test/CodeGen/arm64-microsoft-intrinsics.c | 13 +++++++++++++
 6 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAArch64.td 
b/clang/include/clang/Basic/BuiltinsAArch64.td
index 6607fb1926065..5f6eda3e25418 100644
--- a/clang/include/clang/Basic/BuiltinsAArch64.td
+++ b/clang/include/clang/Basic/BuiltinsAArch64.td
@@ -335,6 +335,7 @@ let Attributes = [NoThrow, RequireDeclaration], Languages = 
"ALL_MS_LANGUAGES",
 let Attributes = [NoThrow, RequireDeclaration], Languages = 
"ALL_MS_LANGUAGES", Header = "intrin.h" in {
        def _ReadWriteBarrier       : AArch64NoPrefixTargetLibBuiltin<"void 
()">;
        def __getReg                : AArch64NoPrefixTargetLibBuiltin<"unsigned 
long long int (int)">;
+       def __setReg                : AArch64NoPrefixTargetLibBuiltin<"void 
(int, unsigned long long int)">;
        def _ReadStatusReg          : AArch64NoPrefixTargetLibBuiltin<"long 
long int (int)">;
        def _WriteStatusReg         : AArch64NoPrefixTargetLibBuiltin<"void 
(int, long long int)">;
        def __sys                   : AArch64NoPrefixTargetLibBuiltin<"unsigned 
int (int, long long int)">;
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 05ae340448033..bac78d5eefc86 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -1930,7 +1930,8 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     return mlir::Value{};
   }
 
-  if (builtinID == clang::AArch64::BI__getReg) {
+  if (builtinID == clang::AArch64::BI__getReg ||
+      builtinID == clang::AArch64::BI__setReg) {
     cgm.errorNYI(expr->getSourceRange(),
                  std::string("unimplemented AArch64 builtin call: ") +
                      getContext().BuiltinInfo.getName(builtinID));
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp 
b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index c88313f7b53e7..d66e5b554b483 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -4810,7 +4810,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     return CI;
   }
 
-  if (BuiltinID == clang::AArch64::BI__getReg) {
+  if (BuiltinID == clang::AArch64::BI__getReg ||
+      BuiltinID == clang::AArch64::BI__setReg) {
     Expr::EvalResult Result;
     if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
       llvm_unreachable("Sema will ensure that the parameter is constant");
@@ -4823,9 +4824,17 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
     llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
 
-    llvm::Function *F =
-        CGM.getIntrinsic(Intrinsic::read_volatile_register, {Int64Ty});
-    return Builder.CreateCall(F, Metadata);
+    CallInst *CI;
+    if (BuiltinID == clang::AArch64::BI__getReg) {
+      llvm::Function *F =
+          CGM.getIntrinsic(Intrinsic::read_volatile_register, {Int64Ty});
+      CI = Builder.CreateCall(F, Metadata);
+    } else {
+      llvm::Function *F =
+          CGM.getIntrinsic(Intrinsic::write_volatile_register, {Int64Ty});
+      CI = Builder.CreateCall(F, {Metadata, EmitScalarExpr(E->getArg(1))});
+    }
+    return CI;
   }
 
   if (BuiltinID == clang::AArch64::BI__break) {
diff --git a/clang/lib/Headers/intrin.h b/clang/lib/Headers/intrin.h
index b90e340d2d766..4815d536dbe46 100644
--- a/clang/lib/Headers/intrin.h
+++ b/clang/lib/Headers/intrin.h
@@ -377,6 +377,7 @@ static __inline__ void __DEFAULT_FN_ATTRS __nop(void) {
 
\*----------------------------------------------------------------------------*/
 #if defined(__aarch64__) || defined(__arm64ec__)
 unsigned __int64 __getReg(int);
+void __setReg(int, unsigned __int64);
 unsigned char _interlockedbittestandreset_acq(long volatile *, long);
 unsigned char _interlockedbittestandreset_nf(long volatile *, long);
 unsigned char _interlockedbittestandreset_rel(long volatile *, long);
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index f57c9c8b87cd5..16a93ad41ced5 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -1174,7 +1174,7 @@ bool SemaARM::CheckAArch64BuiltinFunctionCall(const 
TargetInfo &TI,
   if (BuiltinID == AArch64::BI__sys)
     return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 0x3fff);
 
-  if (BuiltinID == AArch64::BI__getReg)
+  if (BuiltinID == AArch64::BI__getReg || BuiltinID == AArch64::BI__setReg)
     return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 31);
 
   if (BuiltinID == AArch64::BI__break)
diff --git a/clang/test/CodeGen/arm64-microsoft-intrinsics.c 
b/clang/test/CodeGen/arm64-microsoft-intrinsics.c
index 79c5a8c823224..3aa0aa312d269 100644
--- a/clang/test/CodeGen/arm64-microsoft-intrinsics.c
+++ b/clang/test/CodeGen/arm64-microsoft-intrinsics.c
@@ -157,6 +157,19 @@ unsigned __int64 check__getReg(void) {
 // CHECK-MSCOMPAT: call i64 @llvm.read_volatile_register.i64(metadata 
![[MD2:.*]])
 // CHECK-MSCOMPAT: call i64 @llvm.read_volatile_register.i64(metadata 
![[MD3:.*]])
 
+void test__setReg(unsigned __int64 v)
+{
+  __setReg(18, v);
+  __setReg(31, v);
+}
+
+// CHECK-MSCOMPAT-LABEL: define{{.*}}void @test__setReg(i64{{.*}}%v){{.*}}{
+// CHECK-MSCOMPAT: %[[DATA_ADDR1:.*]] = load i64, ptr %v.addr, align 8
+// CHECK-MSCOMPAT: call void @llvm.write_volatile_register.i64(metadata 
![[MD2]], i64 %[[DATA_ADDR1]])
+// CHECK-MSCOMPAT: %[[DATA_ADDR2:.*]] = load i64, ptr %v.addr, align 8
+// CHECK-MSCOMPAT: call void @llvm.write_volatile_register.i64(metadata 
![[MD3]], i64 %[[DATA_ADDR2]])
+// CHECK-LINUX: error: call to undeclared function '__setReg'
+
 #ifdef __LP64__
 #define LONG __int32
 #else

>From fbbea5e9fd0afc599ce0e36715f7b5277de1b728 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <[email protected]>
Date: Sun, 19 Apr 2026 09:41:12 -0300
Subject: [PATCH 6/8] [AArch64] Add the __getRegFp MS intrinsic for reading
 FP/SIMD d-registers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

__getRegFp(n) reads d-register dN (0–31) by index and returns it as a double.
The register is read as an i64 through llvm.read_volatile_register, so the
read is neither CSE'd nor dropped, and the bits are then bitcast to double.
The codegen lives in its own if-block next to the __getReg/__setReg one.

Documented at:
https://learn.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics?view=msvc-180
---
 clang/include/clang/Basic/BuiltinsAArch64.td  |  1 +
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  |  3 +-
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      | 19 ++++++++++++
 clang/lib/Headers/intrin.h                    |  1 +
 clang/lib/Sema/SemaARM.cpp                    |  3 +-
 .../test/CodeGen/arm64-microsoft-intrinsics.c | 17 +++++++++++
 llvm/test/CodeGen/AArch64/read-fp-reg.ll      | 29 +++++++++++++++++++
 7 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/read-fp-reg.ll

diff --git a/clang/include/clang/Basic/BuiltinsAArch64.td 
b/clang/include/clang/Basic/BuiltinsAArch64.td
index 5f6eda3e25418..c9d8fc1d94ed2 100644
--- a/clang/include/clang/Basic/BuiltinsAArch64.td
+++ b/clang/include/clang/Basic/BuiltinsAArch64.td
@@ -335,6 +335,7 @@ let Attributes = [NoThrow, RequireDeclaration], Languages = 
"ALL_MS_LANGUAGES",
 let Attributes = [NoThrow, RequireDeclaration], Languages = 
"ALL_MS_LANGUAGES", Header = "intrin.h" in {
        def _ReadWriteBarrier       : AArch64NoPrefixTargetLibBuiltin<"void 
()">;
        def __getReg                : AArch64NoPrefixTargetLibBuiltin<"unsigned 
long long int (int)">;
+       def __getRegFp              : AArch64NoPrefixTargetLibBuiltin<"double 
(int)">;
        def __setReg                : AArch64NoPrefixTargetLibBuiltin<"void 
(int, unsigned long long int)">;
        def _ReadStatusReg          : AArch64NoPrefixTargetLibBuiltin<"long 
long int (int)">;
        def _WriteStatusReg         : AArch64NoPrefixTargetLibBuiltin<"void 
(int, long long int)">;
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index bac78d5eefc86..303fa4efae1cd 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -1931,7 +1931,8 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
   }
 
   if (builtinID == clang::AArch64::BI__getReg ||
-      builtinID == clang::AArch64::BI__setReg) {
+      builtinID == clang::AArch64::BI__setReg ||
+      builtinID == clang::AArch64::BI__getRegFp) {
     cgm.errorNYI(expr->getSourceRange(),
                  std::string("unimplemented AArch64 builtin call: ") +
                      getContext().BuiltinInfo.getName(builtinID));
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp 
b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index d66e5b554b483..ebaf317a94e56 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -4837,6 +4837,25 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     return CI;
   }
 
+  if (BuiltinID == clang::AArch64::BI__getRegFp) {
+    Expr::EvalResult Result;
+    if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
+      llvm_unreachable("Sema will ensure that the parameter is constant");
+
+    llvm::APSInt Value = Result.Val.getInt();
+    LLVMContext &Context = CGM.getLLVMContext();
+    std::string Reg = "d" + toString(Value, 10);
+
+    llvm::Metadata *Ops[] = {llvm::MDString::get(Context, Reg)};
+    llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
+    llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
+
+    llvm::Function *F =
+        CGM.getIntrinsic(Intrinsic::read_volatile_register, {Int64Ty});
+    llvm::Value *Bits = Builder.CreateCall(F, Metadata);
+    return Builder.CreateBitCast(Bits, llvm::Type::getDoubleTy(Context));
+  }
+
   if (BuiltinID == clang::AArch64::BI__break) {
     Expr::EvalResult Result;
     if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
diff --git a/clang/lib/Headers/intrin.h b/clang/lib/Headers/intrin.h
index 4815d536dbe46..2a0b7d5922ac2 100644
--- a/clang/lib/Headers/intrin.h
+++ b/clang/lib/Headers/intrin.h
@@ -377,6 +377,7 @@ static __inline__ void __DEFAULT_FN_ATTRS __nop(void) {
 
\*----------------------------------------------------------------------------*/
 #if defined(__aarch64__) || defined(__arm64ec__)
 unsigned __int64 __getReg(int);
+double __getRegFp(int _Reg);
 void __setReg(int, unsigned __int64);
 unsigned char _interlockedbittestandreset_acq(long volatile *, long);
 unsigned char _interlockedbittestandreset_nf(long volatile *, long);
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 16a93ad41ced5..215d0daa3fb02 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -1174,7 +1174,8 @@ bool SemaARM::CheckAArch64BuiltinFunctionCall(const 
TargetInfo &TI,
   if (BuiltinID == AArch64::BI__sys)
     return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 0x3fff);
 
-  if (BuiltinID == AArch64::BI__getReg || BuiltinID == AArch64::BI__setReg)
+  if (BuiltinID == AArch64::BI__getReg || BuiltinID == AArch64::BI__setReg ||
+      BuiltinID == AArch64::BI__getRegFp)
     return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 31);
 
   if (BuiltinID == AArch64::BI__break)
diff --git a/clang/test/CodeGen/arm64-microsoft-intrinsics.c 
b/clang/test/CodeGen/arm64-microsoft-intrinsics.c
index 3aa0aa312d269..1a1e81a26a714 100644
--- a/clang/test/CodeGen/arm64-microsoft-intrinsics.c
+++ b/clang/test/CodeGen/arm64-microsoft-intrinsics.c
@@ -170,6 +170,21 @@ void test__setReg(unsigned __int64 v)
 // CHECK-MSCOMPAT: call void @llvm.write_volatile_register.i64(metadata 
![[MD3]], i64 %[[DATA_ADDR2]])
 // CHECK-LINUX: error: call to undeclared function '__setReg'
 
+double test__getRegFp(void)
+{
+  double volatile reg;
+  reg = __getRegFp(5);
+  reg = __getRegFp(31);
+  return reg;
+}
+
+// CHECK-MSCOMPAT-LABEL: define{{.*}}double @test__getRegFp(){{.*}}{
+// CHECK-MSCOMPAT:       [[BITS:%.*]] = call i64 
@llvm.read_volatile_register.i64(metadata ![[MD4:.*]])
+// CHECK-MSCOMPAT:       bitcast i64 [[BITS]] to double
+// CHECK-MSCOMPAT:       [[BITS:%.*]] = call i64 
@llvm.read_volatile_register.i64(metadata ![[MD5:.*]])
+// CHECK-MSCOMPAT:       bitcast i64 [[BITS]] to double
+// CHECK-LINUX: error: call to undeclared function '__getRegFp'
+
 #ifdef __LP64__
 #define LONG __int32
 #else
@@ -633,3 +648,5 @@ void check__prefetch(void *arg1) {
 
 // CHECK-MSCOMPAT: ![[MD2]] = !{!"x18"}
 // CHECK-MSCOMPAT: ![[MD3]] = !{!"sp"}
+// CHECK-MSCOMPAT: ![[MD4]] = !{!"d5"}
+// CHECK-MSCOMPAT: ![[MD5]] = !{!"d31"}
diff --git a/llvm/test/CodeGen/AArch64/read-fp-reg.ll 
b/llvm/test/CodeGen/AArch64/read-fp-reg.ll
new file mode 100644
index 0000000000000..853f1fc051716
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/read-fp-reg.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64 -fast-isel=0 -global-isel=false < %s | FileCheck %s
+
+define double @test_getRegFp_d5() {
+; CHECK-LABEL: test_getRegFp_d5:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov d0, d5
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call i64 @llvm.read_volatile_register.i64(metadata !0)
+  %1 = bitcast i64 %0 to double
+  ret double %1
+}
+
+define double @test_getRegFp_d31() {
+; CHECK-LABEL: test_getRegFp_d31:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov d0, d31
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call i64 @llvm.read_volatile_register.i64(metadata !1)
+  %1 = bitcast i64 %0 to double
+  ret double %1
+}
+
+declare i64 @llvm.read_volatile_register.i64(metadata)
+
+!0 = !{!"d5"}
+!1 = !{!"d31"}

>From 422d8dc494489b94ae03ab91175f362b8f27bb84 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <[email protected]>
Date: Sun, 19 Apr 2026 10:03:37 -0300
Subject: [PATCH 7/8] [AArch64] Add support for the __setRegFp MS intrinsic

The builtin writes to a hardware floating-point register using an integer index.

It is ARM64-specific and is documented at:
<https://learn.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics?view=msvc-180>
---
 clang/include/clang/Basic/BuiltinsAArch64.td  |  1 +
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  |  3 ++-
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      | 21 ++++++++++++++-----
 clang/lib/Headers/intrin.h                    |  1 +
 clang/lib/Sema/SemaARM.cpp                    |  2 +-
 .../test/CodeGen/arm64-microsoft-intrinsics.c | 12 +++++++++++
 6 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAArch64.td 
b/clang/include/clang/Basic/BuiltinsAArch64.td
index c9d8fc1d94ed2..873051720606c 100644
--- a/clang/include/clang/Basic/BuiltinsAArch64.td
+++ b/clang/include/clang/Basic/BuiltinsAArch64.td
@@ -336,6 +336,7 @@ let Attributes = [NoThrow, RequireDeclaration], Languages = 
"ALL_MS_LANGUAGES",
        def _ReadWriteBarrier       : AArch64NoPrefixTargetLibBuiltin<"void 
()">;
        def __getReg                : AArch64NoPrefixTargetLibBuiltin<"unsigned 
long long int (int)">;
        def __getRegFp              : AArch64NoPrefixTargetLibBuiltin<"double 
(int)">;
+       def __setRegFp              : AArch64NoPrefixTargetLibBuiltin<"void 
(int, double)">;
        def __setReg                : AArch64NoPrefixTargetLibBuiltin<"void 
(int, unsigned long long int)">;
        def _ReadStatusReg          : AArch64NoPrefixTargetLibBuiltin<"long 
long int (int)">;
        def _WriteStatusReg         : AArch64NoPrefixTargetLibBuiltin<"void 
(int, long long int)">;
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 303fa4efae1cd..e36f46f700d63 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -1932,7 +1932,8 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
 
   if (builtinID == clang::AArch64::BI__getReg ||
       builtinID == clang::AArch64::BI__setReg ||
-      builtinID == clang::AArch64::BI__getRegFp) {
+      builtinID == clang::AArch64::BI__getRegFp ||
+      builtinID == clang::AArch64::BI__setRegFp) {
     cgm.errorNYI(expr->getSourceRange(),
                  std::string("unimplemented AArch64 builtin call: ") +
                      getContext().BuiltinInfo.getName(builtinID));
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp 
b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index ebaf317a94e56..9d8d9d7d97b49 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -4837,7 +4837,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     return CI;
   }
 
-  if (BuiltinID == clang::AArch64::BI__getRegFp) {
+  if (BuiltinID == clang::AArch64::BI__getRegFp ||
+      BuiltinID == clang::AArch64::BI__setRegFp) {
     Expr::EvalResult Result;
     if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
       llvm_unreachable("Sema will ensure that the parameter is constant");
@@ -4850,10 +4851,20 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
     llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
 
-    llvm::Function *F =
-        CGM.getIntrinsic(Intrinsic::read_volatile_register, {Int64Ty});
-    llvm::Value *Bits = Builder.CreateCall(F, Metadata);
-    return Builder.CreateBitCast(Bits, llvm::Type::getDoubleTy(Context));
+    llvm::Value *Ret;
+    if (BuiltinID == clang::AArch64::BI__getRegFp) {
+      llvm::Function *F =
+          CGM.getIntrinsic(Intrinsic::read_volatile_register, {Int64Ty});
+      llvm::Value *Bits = Builder.CreateCall(F, Metadata);
+      Ret = Builder.CreateBitCast(Bits, llvm::Type::getDoubleTy(Context));
+    } else {
+      llvm::Value *Val = EmitScalarExpr(E->getArg(1));
+      llvm::Value *Bits = Builder.CreateBitCast(Val, Int64Ty);
+      llvm::Function *F =
+          CGM.getIntrinsic(Intrinsic::write_volatile_register, {Int64Ty});
+      Ret = Builder.CreateCall(F, {Metadata, Bits});
+    }
+    return Ret;
   }
 
   if (BuiltinID == clang::AArch64::BI__break) {
diff --git a/clang/lib/Headers/intrin.h b/clang/lib/Headers/intrin.h
index 2a0b7d5922ac2..7692d289fe90d 100644
--- a/clang/lib/Headers/intrin.h
+++ b/clang/lib/Headers/intrin.h
@@ -379,6 +379,7 @@ static __inline__ void __DEFAULT_FN_ATTRS __nop(void) {
 unsigned __int64 __getReg(int);
 double __getRegFp(int _Reg);
 void __setReg(int, unsigned __int64);
+void __setRegFp(int, double);
 unsigned char _interlockedbittestandreset_acq(long volatile *, long);
 unsigned char _interlockedbittestandreset_nf(long volatile *, long);
 unsigned char _interlockedbittestandreset_rel(long volatile *, long);
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 215d0daa3fb02..62f6d7296192d 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -1175,7 +1175,7 @@ bool SemaARM::CheckAArch64BuiltinFunctionCall(const 
TargetInfo &TI,
     return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 0x3fff);
 
   if (BuiltinID == AArch64::BI__getReg || BuiltinID == AArch64::BI__setReg ||
-      BuiltinID == AArch64::BI__getRegFp)
+      BuiltinID == AArch64::BI__getRegFp || BuiltinID == AArch64::BI__setRegFp)
     return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 31);
 
   if (BuiltinID == AArch64::BI__break)
diff --git a/clang/test/CodeGen/arm64-microsoft-intrinsics.c 
b/clang/test/CodeGen/arm64-microsoft-intrinsics.c
index 1a1e81a26a714..e20751806a60b 100644
--- a/clang/test/CodeGen/arm64-microsoft-intrinsics.c
+++ b/clang/test/CodeGen/arm64-microsoft-intrinsics.c
@@ -185,6 +185,18 @@ double test__getRegFp(void)
 // CHECK-MSCOMPAT:       bitcast i64 [[BITS]] to double
 // CHECK-LINUX: error: call to undeclared function '__getRegFp'
 
+void test__setRegFp(double v)
+{
+  __setRegFp(5, v);
+  __setRegFp(31, v);
+}
+// CHECK-MSCOMPAT-LABEL: define{{.*}}void 
@test__setRegFp(double{{.*}}%v){{.*}}{
+// CHECK-MSCOMPAT:       [[BITS:%.*]] = bitcast double {{.*}} to i64
+// CHECK-MSCOMPAT:       call void @llvm.write_volatile_register.i64(metadata 
![[MD4:.*]], i64 [[BITS]])
+// CHECK-MSCOMPAT:       [[BITS:%.*]] = bitcast double {{.*}} to i64
+// CHECK-MSCOMPAT:       call void @llvm.write_volatile_register.i64(metadata 
![[MD5:.*]], i64 [[BITS]])
+// CHECK-LINUX: error: call to undeclared function '__setRegFp'
+
 #ifdef __LP64__
 #define LONG __int32
 #else

>From 4d3377d8acd9a0676ad0a049f088de934a522b91 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <[email protected]>
Date: Mon, 20 Apr 2026 13:11:40 -0300
Subject: [PATCH 8/8] [AArch64] Add the __prefetch2 MS intrinsic

__prefetch2(ptr, hint) issues a PRFM instruction with an explicit 5-bit
opcode.  The hint encodes the PRFM opcode directly: bits[4:3]=type
(PLD/PLI/PST), bits[2:1]=target (L1/L2/L3), bit[0]=policy (KEEP/STRM).
The argument must be a compile-time constant in [0, 31].

Neither the MSVC headers nor the documentation define named constants for
building the hint; the caller is expected to construct the 5-bit field
directly from the AArch64 PRFM encoding.

Documented at:
https://learn.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics?view=msvc-180
---
 clang/include/clang/Basic/BuiltinsAArch64.td  |  1 +
 .../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp  |  3 ++-
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      | 20 +++++++++++++++++++
 clang/lib/Headers/intrin.h                    |  1 +
 clang/lib/Sema/SemaARM.cpp                    |  3 +++
 .../test/CodeGen/arm64-microsoft-intrinsics.c | 10 ++++++++++
 6 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/Basic/BuiltinsAArch64.td 
b/clang/include/clang/Basic/BuiltinsAArch64.td
index 873051720606c..15257f3db5b41 100644
--- a/clang/include/clang/Basic/BuiltinsAArch64.td
+++ b/clang/include/clang/Basic/BuiltinsAArch64.td
@@ -403,6 +403,7 @@ let Attributes = [NoThrow, RequireDeclaration], Languages = 
"ALL_MS_LANGUAGES",
 
 let Attributes = [NoThrow, RequireDeclaration], Languages = 
"ALL_MS_LANGUAGES", Header = "intrin.h" in {
        def __prefetch : AArch64NoPrefixTargetLibBuiltin<"void (void const *)">;
+       def __prefetch2 : AArch64NoPrefixTargetLibBuiltin<"void (void const *, 
unsigned char)">;
 }
 
 let Attributes = [NoThrow, RequireDeclaration], Languages = 
"ALL_MS_LANGUAGES", Header = "intrin.h" in {
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index e36f46f700d63..50f7e4ca82713 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2152,7 +2152,8 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned 
builtinID, const CallExpr *expr,
     return mlir::Value{};
   }
 
-  if (builtinID == AArch64::BI__prefetch) {
+  if (builtinID == AArch64::BI__prefetch ||
+      builtinID == AArch64::BI__prefetch2) {
     cgm.errorNYI(expr->getSourceRange(),
                  std::string("unimplemented AArch64 builtin call: ") +
                      getContext().BuiltinInfo.getName(builtinID));
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp 
b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 9d8d9d7d97b49..d9ed09277fa0d 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -5296,6 +5296,26 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     return Builder.CreateCall(F, {Address, RW, Locality, Data});
   }
 
+  if (BuiltinID == AArch64::BI__prefetch2) {
+    Value *Address = EmitScalarExpr(E->getArg(0));
+    llvm::APSInt PrfOp = E->getArg(1)->EvaluateKnownConstInt(CGM.getContext());
+    // Decode 5-bit PRFM encoding: bits[4:3]=type, bits[2:1]=target,
+    // bit[0]=policy
+    //   type: PLD=0(load), PLI=1(instr), PST=2(store)
+    //   target: L1=0, L2=1, L3=2
+    //   policy: KEEP=0, STRM=1
+    uint64_t Op = PrfOp.getZExtValue();
+    uint64_t Type = (Op >> 3) & 0x3;
+    uint64_t Target = (Op >> 1) & 0x3;
+    uint64_t Policy = Op & 0x1;
+    Value *RW = Builder.getInt32(Type == 2 ? 1 : 0);
+    Value *Local = Builder.getInt32(Target);
+    Value *IsStream = Builder.getInt32(Policy);
+    Value *IsData = Builder.getInt32(Type == 1 ? 0 : 1);
+    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_prefetch);
+    return Builder.CreateCall(F, {Address, RW, Local, IsStream, IsData});
+  }
+
   if (BuiltinID == AArch64::BI__hlt) {
     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_hlt);
     Builder.CreateCall(F, {EmitScalarExpr(E->getArg(0))});
diff --git a/clang/lib/Headers/intrin.h b/clang/lib/Headers/intrin.h
index 7692d289fe90d..4cb8cac960bcf 100644
--- a/clang/lib/Headers/intrin.h
+++ b/clang/lib/Headers/intrin.h
@@ -450,6 +450,7 @@ unsigned int _CountTrailingZeros64(unsigned __int64);
 unsigned int __hlt(unsigned int, ...);
 
 void __cdecl __prefetch(const void *);
+void __cdecl __prefetch2(const void *, unsigned char);
 
 #endif
 
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 62f6d7296192d..d78baab661701 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -1178,6 +1178,9 @@ bool SemaARM::CheckAArch64BuiltinFunctionCall(const 
TargetInfo &TI,
       BuiltinID == AArch64::BI__getRegFp || BuiltinID == AArch64::BI__setRegFp)
     return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 31);
 
+  if (BuiltinID == AArch64::BI__prefetch2)
+    return SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 31);
+
   if (BuiltinID == AArch64::BI__break)
     return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 0xffff);
 
diff --git a/clang/test/CodeGen/arm64-microsoft-intrinsics.c 
b/clang/test/CodeGen/arm64-microsoft-intrinsics.c
index e20751806a60b..e6a415a0d8805 100644
--- a/clang/test/CodeGen/arm64-microsoft-intrinsics.c
+++ b/clang/test/CodeGen/arm64-microsoft-intrinsics.c
@@ -657,6 +657,16 @@ void check__prefetch(void *arg1) {
 // CHECK-MSCOMPAT: call void @llvm.prefetch.p0(ptr %[[VAR0]], i32 0, i32 3, 
i32 1)
 // CHECK-MSCOMPAT: ret void
 
+void check__prefetch2(void *arg1) {
+  __prefetch2(arg1, 0x00);
+  __prefetch2(arg1, 0x13);
+}
+
+// CHECK-MSCOMPAT-LABEL: define{{.*}}void 
@check__prefetch2(ptr{{.*}}%arg1){{.*}}{
+// CHECK-MSCOMPAT: call void @llvm.aarch64.prefetch(ptr %{{.*}}, i32 0, i32 0, 
i32 0, i32 1)
+// CHECK-MSCOMPAT: call void @llvm.aarch64.prefetch(ptr %{{.*}}, i32 1, i32 1, 
i32 1, i32 1)
+// CHECK-LINUX: error: call to undeclared function '__prefetch2'
+
 
 // CHECK-MSCOMPAT: ![[MD2]] = !{!"x18"}
 // CHECK-MSCOMPAT: ![[MD3]] = !{!"sp"}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to