On Wed, Oct 22, 2025 at 3:18 AM Richard Henderson <[email protected]> wrote:
>
> For Arm, we need to distinguish 3 cases: (1) the alignment required
> when accessing Normal memory, (2) the alignment required when
> accessing Device memory, and (3) the atomicity of the access.
>
> When we added TLB_CHECK_ALIGNED, we assumed that cases 2 and 3 were
> identical, and thus used memop_atomicity_bits for TLB_CHECK_ALIGNED.
>
> This is incorrect for multiple reasons, including that the atomicity
> of the access is adjusted depending on whether or not we are executing
> within a serial context.
>
> For Arm, what is true is that there is an underlying alignment
> requirement for the access, and Normal memory will support
> unaligned accesses regardless.
>
> Introduce MO_ALIGN_TLB_ONLY to indicate that the alignment
> specified in MO_AMASK only applies when the TLB entry has
> TLB_CHECK_ALIGNED set; otherwise no alignment is required.
>
> Introduce memop_tlb_alignment_bits with an additional bool
> argument that specifies whether TLB_CHECK_ALIGNED is set.
> All other usage of memop_alignment_bits assumes it is not.
>
> Remove memop_atomicity_bits as unused; it didn't properly
> support MO_ATOM_SUBWORD anyway.
>
> Update target/arm's finalize_memop_atom to set MO_ALIGN_TLB_ONLY
> when strict alignment isn't otherwise required.
>
> Suggested-by: Peter Maydell <[email protected]>
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/3171
> Signed-off-by: Richard Henderson <[email protected]>
> ---
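
The new bit is easy to misread as "no alignment", so here is a minimal
standalone sketch of the resolution logic for anyone following along.
The constants are copied from memop.h as I understand the current
layout (MO_ASHIFT == 5); this is just an illustration, not the QEMU
code itself:

#include <assert.h>
#include <stdbool.h>

enum {
    MO_8 = 0, MO_16 = 1, MO_32 = 2, MO_64 = 3,
    MO_SIZE   = 0x07,
    MO_ASHIFT = 5,
    MO_AMASK  = 7 << MO_ASHIFT,
    MO_UNALN  = 0 << MO_ASHIFT,
    MO_ALIGN  = MO_AMASK,
    MO_ALIGN_TLB_ONLY = 1 << 8,   /* the new bit from this patch */
};

/* Mirrors memop_tlb_alignment_bits() as introduced by the patch. */
static unsigned tlb_alignment_bits(unsigned memop, bool tlb_check)
{
    unsigned a = memop & MO_AMASK;

    if (a == MO_UNALN || (!tlb_check && (memop & MO_ALIGN_TLB_ONLY))) {
        a = 0;                    /* no alignment required */
    } else if (a == MO_ALIGN) {
        a = memop & MO_SIZE;      /* natural alignment */
    } else {
        a >>= MO_ASHIFT;          /* explicit MO_ALIGN_N */
    }
    return a;
}

int main(void)
{
    unsigned op = MO_32 | MO_ALIGN | MO_ALIGN_TLB_ONLY;

    /* Normal memory: TLB_CHECK_ALIGNED clear, unaligned is permitted. */
    assert(tlb_alignment_bits(op, false) == 0);
    /* Device memory: TLB_CHECK_ALIGNED set, natural alignment enforced. */
    assert(tlb_alignment_bits(op, true) == MO_32);
    return 0;
}

So the alignment encoded in MO_AMASK is never weakened; MO_ALIGN_TLB_ONLY
only controls whether the requirement applies everywhere or only to pages
whose TLB entry has TLB_CHECK_ALIGNED set.
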
Reviewed-by: Manos Pitsidianakis <[email protected]>

> PS: There are a number of uses of align_mem for AdvSIMD, SVE and SME.
> I have not re-familiarized myself with the effects of SCTLR.A and
> Normal/Device memory for those cases.  I may well have missed something.
>
> ---
>  include/exec/memop.h            | 43 +++++++++++++++------------------
>  target/arm/tcg/translate.h      |  4 +--
>  accel/tcg/cputlb.c              | 13 +---------
>  target/arm/ptw.c                |  2 +-
>  target/arm/tcg/translate-a64.c  | 10 +++-----
>  target/arm/tcg/translate-neon.c |  2 +-
>  tcg/tcg.c                       | 10 +++++---
>  7 files changed, 35 insertions(+), 49 deletions(-)
>
> diff --git a/include/exec/memop.h b/include/exec/memop.h
> index cf7da3362e..799b5b4221 100644
> --- a/include/exec/memop.h
> +++ b/include/exec/memop.h
> @@ -72,6 +72,16 @@ typedef enum MemOp {
>      MO_ALIGN_64 = 6 << MO_ASHIFT,
>      MO_ALIGN = MO_AMASK,
>
> +    /*
> +     * MO_ALIGN_TLB_ONLY:
> +     * Apply MO_AMASK only along the TCG slow path if TLB_CHECK_ALIGNED
> +     * is set; otherwise unaligned access is permitted.
> +     * This is used by target/arm, where unaligned accesses are
> +     * permitted for pages marked Normal but aligned accesses are
> +     * required for pages marked Device.
> +     */
> +    MO_ALIGN_TLB_ONLY = 1 << 8,
> +
>      /*
>       * MO_ATOM_* describes the atomicity requirements of the operation:
>       * MO_ATOM_IFALIGN: the operation must be single-copy atomic if it
> @@ -104,7 +114,7 @@ typedef enum MemOp {
>       * size of the operation, if aligned.  This retains the behaviour
>       * from before this field was introduced.
>       */
> -    MO_ATOM_SHIFT = 8,
> +    MO_ATOM_SHIFT = 9,
>      MO_ATOM_IFALIGN = 0 << MO_ATOM_SHIFT,
>      MO_ATOM_IFALIGN_PAIR = 1 << MO_ATOM_SHIFT,
>      MO_ATOM_WITHIN16 = 2 << MO_ATOM_SHIFT,
> @@ -169,16 +179,16 @@ static inline MemOp size_memop(unsigned size)
>  }
>
>  /**
> - * memop_alignment_bits:
> + * memop_tlb_alignment_bits:
>   * @memop: MemOp value
>   *
> - * Extract the alignment size from the memop.
> + * Extract the alignment size for use with TLB_CHECK_ALIGNED.
>   */
> -static inline unsigned memop_alignment_bits(MemOp memop)
> +static inline unsigned memop_tlb_alignment_bits(MemOp memop, bool tlb_check)
>  {
>      unsigned a = memop & MO_AMASK;
>
> -    if (a == MO_UNALN) {
> +    if (a == MO_UNALN || (!tlb_check && (memop & MO_ALIGN_TLB_ONLY))) {
>          /* No alignment required. */
>          a = 0;
>      } else if (a == MO_ALIGN) {
> @@ -191,28 +201,15 @@ static inline unsigned memop_alignment_bits(MemOp memop)
>      return a;
>  }
>
> -/*
> - * memop_atomicity_bits:
> +/**
> + * memop_alignment_bits:
>   * @memop: MemOp value
>   *
> - * Extract the atomicity size from the memop.
> + * Extract the alignment size from the memop.
>   */
> -static inline unsigned memop_atomicity_bits(MemOp memop)
> +static inline unsigned memop_alignment_bits(MemOp memop)
>  {
> -    unsigned size = memop & MO_SIZE;
> -
> -    switch (memop & MO_ATOM_MASK) {
> -    case MO_ATOM_NONE:
> -        size = MO_8;
> -        break;
> -    case MO_ATOM_IFALIGN_PAIR:
> -    case MO_ATOM_WITHIN16_PAIR:
> -        size = size ? size - 1 : 0;
> -        break;
> -    default:
> -        break;
> -    }
> -    return size;
> +    return memop_tlb_alignment_bits(memop, false);
>  }
>
>  #endif
> diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h
> index 9a85ea74db..b62104b4ae 100644
> --- a/target/arm/tcg/translate.h
> +++ b/target/arm/tcg/translate.h
> @@ -744,8 +744,8 @@ static inline TCGv_ptr fpstatus_ptr(ARMFPStatusFlavour flavour)
>   */
>  static inline MemOp finalize_memop_atom(DisasContext *s, MemOp opc, MemOp atom)
>  {
> -    if (s->align_mem && !(opc & MO_AMASK)) {
> -        opc |= MO_ALIGN;
> +    if (!(opc & MO_AMASK)) {
> +        opc |= MO_ALIGN | (s->align_mem ? 0 : MO_ALIGN_TLB_ONLY);
>      }
>      return opc | atom | s->be_data;
>  }
> diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
> index 631f1fe135..fd1606c856 100644
> --- a/accel/tcg/cputlb.c
> +++ b/accel/tcg/cputlb.c
> @@ -1668,18 +1668,7 @@ static bool mmu_lookup1(CPUState *cpu, MMULookupPageData *data, MemOp memop,
>
>      if (likely(!maybe_resized)) {
>          /* Alignment has not been checked by tlb_fill_align. */
> -        int a_bits = memop_alignment_bits(memop);
> -
> -        /*
> -         * This alignment check differs from the one above, in that this is
> -         * based on the atomicity of the operation.  The intended use case is
> -         * the ARM memory type field of each PTE, where access to pages with
> -         * Device memory type require alignment.
> -         */
> -        if (unlikely(flags & TLB_CHECK_ALIGNED)) {
> -            int at_bits = memop_atomicity_bits(memop);
> -            a_bits = MAX(a_bits, at_bits);
> -        }
> +        int a_bits = memop_tlb_alignment_bits(memop, flags & TLB_CHECK_ALIGNED);
>          if (unlikely(addr & ((1 << a_bits) - 1))) {
>              cpu_unaligned_access(cpu, addr, access_type, mmu_idx, ra);
>          }
> diff --git a/target/arm/ptw.c b/target/arm/ptw.c
> index d4386ede73..939215d096 100644
> --- a/target/arm/ptw.c
> +++ b/target/arm/ptw.c
> @@ -2351,7 +2351,7 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw,
>       * CPUs with ARM_FEATURE_LPAE but not ARM_FEATURE_V7VE anyway.)
>       */
>      if (device) {
> -        unsigned a_bits = memop_atomicity_bits(memop);
> +        unsigned a_bits = memop_tlb_alignment_bits(memop, true);
>          if (address & ((1 << a_bits) - 1)) {
>              fi->type = ARMFault_Alignment;
>              goto do_fault;
> diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
> index 3292d7cbfd..08b21d7dbf 100644
> --- a/target/arm/tcg/translate-a64.c
> +++ b/target/arm/tcg/translate-a64.c
> @@ -3691,9 +3691,8 @@ static bool trans_STP(DisasContext *s, arg_ldstpair *a)
>       * In all cases, issue one operation with the correct atomicity.
>       */
>      mop = a->sz + 1;
> -    if (s->align_mem) {
> -        mop |= (a->sz == 2 ? MO_ALIGN_4 : MO_ALIGN_8);
> -    }
> +    mop |= (a->sz == 2 ? MO_ALIGN_4 : MO_ALIGN_8);
> +    mop |= (s->align_mem ? 0 : MO_ALIGN_TLB_ONLY);
>      mop = finalize_memop_pair(s, mop);
>      if (a->sz == 2) {
>          TCGv_i64 tmp = tcg_temp_new_i64();
> @@ -3742,9 +3741,8 @@ static bool trans_LDP(DisasContext *s, arg_ldstpair *a)
>       * since that reuses the most code below.
>       */
>      mop = a->sz + 1;
> -    if (s->align_mem) {
> -        mop |= (a->sz == 2 ? MO_ALIGN_4 : MO_ALIGN_8);
> -    }
> +    mop |= (a->sz == 2 ? MO_ALIGN_4 : MO_ALIGN_8);
> +    mop |= (s->align_mem ? 0 : MO_ALIGN_TLB_ONLY);
>      mop = finalize_memop_pair(s, mop);
>      if (a->sz == 2) {
>          int o2 = s->be_data == MO_LE ? 32 : 0;
> diff --git a/target/arm/tcg/translate-neon.c b/target/arm/tcg/translate-neon.c
> index 844d2e29e4..e3c7d9217b 100644
> --- a/target/arm/tcg/translate-neon.c
> +++ b/target/arm/tcg/translate-neon.c
> @@ -520,7 +520,7 @@ static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
>      if (a->align) {
>          align = pow2_align(a->align + 2); /* 4 ** a->align */
>      } else {
> -        align = s->align_mem ? MO_ALIGN : 0;
> +        align = MO_ALIGN | (s->align_mem ? 0 : MO_ALIGN_TLB_ONLY);
>      }
>
>      /*
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index 294762c283..fbf09f5c82 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -3039,20 +3039,22 @@ void tcg_dump_ops(TCGContext *s, FILE *f, bool have_prefs)
>          case INDEX_op_qemu_ld2:
>          case INDEX_op_qemu_st2:
>              {
> -                const char *s_al, *s_op, *s_at;
> +                const char *s_al, *s_tlb, *s_op, *s_at;
>                  MemOpIdx oi = op->args[k++];
>                  MemOp mop = get_memop(oi);
>                  unsigned ix = get_mmuidx(oi);
>
> +                s_tlb = mop & MO_ALIGN_TLB_ONLY ? "tlb+" : "";
>                  s_al = alignment_name[(mop & MO_AMASK) >> MO_ASHIFT];
>                  s_op = ldst_name[mop & (MO_BSWAP | MO_SSIZE)];
>                  s_at = atom_name[(mop & MO_ATOM_MASK) >> MO_ATOM_SHIFT];
> -                mop &= ~(MO_AMASK | MO_BSWAP | MO_SSIZE | MO_ATOM_MASK);
> +                mop &= ~(MO_AMASK | MO_BSWAP | MO_SSIZE |
> +                         MO_ATOM_MASK | MO_ALIGN_TLB_ONLY);
>
>                  /* If all fields are accounted for, print symbolically. */
>                  if (!mop && s_al && s_op && s_at) {
> -                    col += ne_fprintf(f, ",%s%s%s,%u",
> -                                      s_at, s_al, s_op, ix);
> +                    col += ne_fprintf(f, ",%s%s%s%s,%u",
> +                                      s_at, s_al, s_tlb, s_op, ix);
>                  } else {
>                      mop = get_memop(oi);
>                      col += ne_fprintf(f, ",$0x%x,%u", mop, ix);
> --
> 2.43.0
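
To see how the translator half composes with the TLB half, here is a
sketch modeled on the patched finalize_memop_atom(), reusing the
constants and tlb_alignment_bits() helper from the sketch earlier in
this mail ('align_mem' stands in for DisasContext.align_mem, i.e.
SCTLR.A strict alignment; again an illustration, not the patch itself):

/* Alignment portion only; atomicity and endianness are omitted. */
static unsigned finalize_alignment(bool align_mem, unsigned opc)
{
    if (!(opc & MO_AMASK)) {
        /*
         * Always encode the natural alignment; demote it to a
         * TLB-only check unless SCTLR.A demands it everywhere.
         */
        opc |= MO_ALIGN | (align_mem ? 0 : MO_ALIGN_TLB_ONLY);
    }
    return opc;
}

So with SCTLR.A clear, a plain word load still faults on an unaligned
Device access (tlb_alignment_bits() returns MO_32 for the resulting op
when TLB_CHECK_ALIGNED is set), while the same access to Normal memory
proceeds unaligned exactly as before the patch.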
