Two latent bugs in the aarch64 FPU state plumbing, both surfaced
the first time an aarch64 thread_get_state / thread_set_state
caller exercised AARCH64_FLOAT_STATE.

1. Field-order bug in _fpu_save_state / _fpu_load_state
============================================================

struct aarch64_float_state in <mach/aarch64/thread_status.h> lays
out fpsr at the first offset after v[32], with fpcr second:

    struct aarch64_float_state {
        __int128 v[32];
        uint64_t fpsr;          /* offset +512 */
        uint64_t fpcr;          /* offset +520 */
        uint64_t fpmr;
        uint64_t fp_reserved;
    };

The save/load asm in aarch64/aarch64/locore.S read and wrote them in
the opposite order — store FPCR into the fpsr slot and FPSR into the
fpcr slot — which silently corrupted both fields across every
thread_get_state / thread_set_state cycle.

A caller that loads a known FPCR via `msr fpcr, ...` and reads it
back through thread_get_state(AARCH64_FLOAT_STATE) sees 0 instead,
because the value was stored at the fpsr offset.

2. Alignment-check bug in thread_setstatus
============================================================

thread_setstatus() in aarch64/aarch64/pcb.c rejected
AARCH64_FLOAT_STATE writes whose tstate pointer wasn't aligned to
alignof(struct aarch64_float_state).  The struct's first member is
__int128 v[32], giving the type a 16-byte alignment requirement.

The MIG-generated _Xthread_set_state stub places `new_state[]` at
offset 40 from the request message header, which is only 8-byte
aligned.  Even when the user-side buffer is perfectly aligned, the
in-kernel buffer the stub hands to thread_setstatus is at the
fixed offset 40 — so the alignment check rejected every legitimate
AARCH64_FLOAT_STATE write with KERN_INVALID_ARGUMENT.

Copy the state into a stack-local that has the natural type
alignment before validating and storing.  This is also what would
have to happen if we wanted to read __int128 from the buffer
directly on a strict-alignment-trapping config; the byte-by-byte
copy handles the under-aligned source without faulting.
---
 aarch64/aarch64/locore.S | 16 ++++++++++++----
 aarch64/aarch64/pcb.c    | 24 +++++++++++++++++++-----
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/aarch64/aarch64/locore.S b/aarch64/aarch64/locore.S
index 008e39cc..8581a3b6 100644
--- a/aarch64/aarch64/locore.S
+++ b/aarch64/aarch64/locore.S
@@ -406,8 +406,12 @@ ENTRY(_fpu_save_state)
        stp     q26, q27, [x0], #32
        stp     q28, q29, [x0], #32
        stp     q30, q31, [x0], #32
-       mrs     x1, fpcr
-       mrs     x2, fpsr
+       /*
+        * struct aarch64_float_state lays out fpsr at offset 0 from
+        * here and fpcr at offset 8, so save in that order.
+        */
+       mrs     x1, fpsr
+       mrs     x2, fpcr
        stp     x1, x2, [x0], #32
        stp     xzr, xzr, [x0]          /* FPMR ignored for now */
        ret
@@ -434,9 +438,13 @@ ENTRY(_fpu_load_state)
        ldp     q28, q29, [x0], #32
        ldp     q30, q31, [x0], #32
 
+       /*
+        * struct aarch64_float_state lays out fpsr first at offset 0,
+        * then fpcr at offset 8 — mirror the save order above.
+        */
        ldp     x1, x2, [x0]
-       msr     fpcr, x1
-       msr     fpsr, x2
+       msr     fpsr, x1
+       msr     fpcr, x2
        /* FPMR ignored for now */
        ret
 END(_fpu_load_state)
diff --git a/aarch64/aarch64/pcb.c b/aarch64/aarch64/pcb.c
index 0443ce4e..33074c8f 100644
--- a/aarch64/aarch64/pcb.c
+++ b/aarch64/aarch64/pcb.c
@@ -324,12 +324,25 @@ kern_return_t thread_setstatus(
                        memcpy(USER_REGS(thread), ats, sizeof(struct 
aarch64_thread_state));
                        return KERN_SUCCESS;
 
-               case AARCH64_FLOAT_STATE:
+               case AARCH64_FLOAT_STATE: {
+                       /*
+                        * MIG places `new_state[]` at offset 40 from the
+                        * Request message header, which is 8-byte aligned
+                        * but not 16-byte aligned.  alignof(struct
+                        * aarch64_float_state) is 16 (from the __int128
+                        * v[32] member), so the state pointer the MIG stub
+                        * hands us is intrinsically under-aligned even
+                        * though the buffer is structurally valid.  Copy
+                        * into an aligned local before validating/storing
+                        * so misaligned __int128 access doesn't matter.
+                        */
+                       struct aarch64_float_state      aligned;
+
                        if (count < AARCH64_FLOAT_STATE_COUNT)
                                return KERN_INVALID_ARGUMENT;
-                       if (((vm_offset_t) tstate) % alignof(struct 
aarch64_float_state))
-                               return KERN_INVALID_ARGUMENT;
-                       afs = (struct aarch64_float_state *) tstate;
+
+                       memcpy(&aligned, tstate, sizeof(aligned));
+                       afs = &aligned;
 
                        if (!validate_fpcr(afs->fpcr, old_fpr(thread, fpcr)))
                                return KERN_INVALID_ARGUMENT;
@@ -339,8 +352,9 @@ kern_return_t thread_setstatus(
                                return KERN_INVALID_ARGUMENT;
 
                        fpu_flush_state_write(thread);
-                       memcpy(thread->pcb->afs, tstate, sizeof(struct 
aarch64_float_state));
+                       memcpy(thread->pcb->afs, afs, sizeof(struct 
aarch64_float_state));
                        return KERN_SUCCESS;
+               }
 
                default:
                        return KERN_INVALID_ARGUMENT;
-- 
2.54.0


Reply via email to