Two latent bugs in the aarch64 FPU state plumbing, both surfaced
the first time an aarch64 thread_get_state / thread_set_state
caller exercised AARCH64_FLOAT_STATE.
1. Field-order bug in _fpu_save_state / _fpu_load_state
============================================================
struct aarch64_float_state in <mach/aarch64/thread_status.h> lays
out fpsr at the first offset after v[32], with fpcr second:
struct aarch64_float_state {
__int128 v[32];
uint64_t fpsr; /* offset +512 */
uint64_t fpcr; /* offset +520 */
uint64_t fpmr;
uint64_t fp_reserved;
};
The save/load asm in aarch64/aarch64/locore.S read and wrote them in
the opposite order — store FPCR into the fpsr slot and FPSR into the
fpcr slot — which silently corrupted both fields across every
thread_get_state / thread_set_state cycle.
A caller that loads a known FPCR via `msr fpcr, ...` and reads it
back through thread_get_state(AARCH64_FLOAT_STATE) sees 0 instead,
because the value was stored at the fpsr offset.
2. Alignment-check bug in thread_setstatus
============================================================
thread_setstatus() in aarch64/aarch64/pcb.c rejected
AARCH64_FLOAT_STATE writes whose tstate pointer wasn't aligned to
alignof(struct aarch64_float_state). The struct's first member is
__int128 v[32], giving the type a 16-byte alignment requirement.
The MIG-generated _Xthread_set_state stub places `new_state[]` at
offset 40 from the request message header, which is only 8-byte
aligned. Even when the user-side buffer is perfectly aligned, the
in-kernel buffer the stub hands to thread_setstatus is at the
fixed offset 40 — so the alignment check rejected every legitimate
AARCH64_FLOAT_STATE write with KERN_INVALID_ARGUMENT.
Copy the state into a stack-local that has the natural type
alignment before validating and storing. This is also what would
have to happen if we wanted to read __int128 from the buffer
directly on a strict-alignment-trapping config; the byte-by-byte
copy handles the under-aligned source without faulting.
---
aarch64/aarch64/locore.S | 16 ++++++++++++----
aarch64/aarch64/pcb.c | 24 +++++++++++++++++++-----
2 files changed, 31 insertions(+), 9 deletions(-)
diff --git a/aarch64/aarch64/locore.S b/aarch64/aarch64/locore.S
index 008e39cc..8581a3b6 100644
--- a/aarch64/aarch64/locore.S
+++ b/aarch64/aarch64/locore.S
@@ -406,8 +406,12 @@ ENTRY(_fpu_save_state)
stp q26, q27, [x0], #32
stp q28, q29, [x0], #32
stp q30, q31, [x0], #32
- mrs x1, fpcr
- mrs x2, fpsr
+ /*
+ * struct aarch64_float_state lays out fpsr at offset 0 from
+ * here and fpcr at offset 8, so save in that order.
+ */
+ mrs x1, fpsr
+ mrs x2, fpcr
stp x1, x2, [x0], #32
stp xzr, xzr, [x0] /* FPMR ignored for now */
ret
@@ -434,9 +438,13 @@ ENTRY(_fpu_load_state)
ldp q28, q29, [x0], #32
ldp q30, q31, [x0], #32
+ /*
+ * struct aarch64_float_state lays out fpsr first at offset 0,
+ * then fpcr at offset 8 — mirror the save order above.
+ */
ldp x1, x2, [x0]
- msr fpcr, x1
- msr fpsr, x2
+ msr fpsr, x1
+ msr fpcr, x2
/* FPMR ignored for now */
ret
END(_fpu_load_state)
diff --git a/aarch64/aarch64/pcb.c b/aarch64/aarch64/pcb.c
index 0443ce4e..33074c8f 100644
--- a/aarch64/aarch64/pcb.c
+++ b/aarch64/aarch64/pcb.c
@@ -324,12 +324,25 @@ kern_return_t thread_setstatus(
memcpy(USER_REGS(thread), ats, sizeof(struct
aarch64_thread_state));
return KERN_SUCCESS;
- case AARCH64_FLOAT_STATE:
+ case AARCH64_FLOAT_STATE: {
+ /*
+ * MIG places `new_state[]` at offset 40 from the
+ * Request message header, which is 8-byte aligned
+ * but not 16-byte aligned. alignof(struct
+ * aarch64_float_state) is 16 (from the __int128
+ * v[32] member), so the state pointer the MIG stub
+ * hands us is intrinsically under-aligned even
+ * though the buffer is structurally valid. Copy
+ * into an aligned local before validating/storing
+ * so misaligned __int128 access doesn't matter.
+ */
+ struct aarch64_float_state aligned;
+
if (count < AARCH64_FLOAT_STATE_COUNT)
return KERN_INVALID_ARGUMENT;
- if (((vm_offset_t) tstate) % alignof(struct
aarch64_float_state))
- return KERN_INVALID_ARGUMENT;
- afs = (struct aarch64_float_state *) tstate;
+
+ memcpy(&aligned, tstate, sizeof(aligned));
+ afs = &aligned;
if (!validate_fpcr(afs->fpcr, old_fpr(thread, fpcr)))
return KERN_INVALID_ARGUMENT;
@@ -339,8 +352,9 @@ kern_return_t thread_setstatus(
return KERN_INVALID_ARGUMENT;
fpu_flush_state_write(thread);
- memcpy(thread->pcb->afs, tstate, sizeof(struct
aarch64_float_state));
+ memcpy(thread->pcb->afs, afs, sizeof(struct
aarch64_float_state));
return KERN_SUCCESS;
+ }
default:
return KERN_INVALID_ARGUMENT;
--
2.54.0