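As per PR 48126, Michael Edwards spotted that when the compare in the cmpxchg fails, the barrier at the end was skipped, theoretically allowing a following load to float up above the load of the value being compared. This patch emits the barrier after the exit label instead of before it, so that it is executed on the compare-failure path as well.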
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 057f9ba..39057d2 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -23531,8 +23626,8 @@ arm_output_sync_loop (emit_f emit, } } - arm_process_output_memory_barrier (emit, NULL); arm_output_asm_insn (emit, 1, operands, "%sLSYB%%=:", LOCAL_LABEL_PREFIX); + arm_process_output_memory_barrier (emit, NULL); } static rtx