On Tue, 6 Jun 2023 10:47:58 +0100 Peter Maydell <peter.mayd...@linaro.org> wrote:
> From: Richard Henderson <richard.hender...@linaro.org> > > Round len_align to 16 instead of 8, handling an odd 8-byte as part > of the tail. Use MO_ATOM_NONE to indicate that all of these memory > ops have only byte atomicity. > > Reviewed-by: Peter Maydell <peter.mayd...@linaro.org> > Signed-off-by: Richard Henderson <richard.hender...@linaro.org> > Message-id: 20230530191438.411344-8-richard.hender...@linaro.org > Signed-off-by: Peter Maydell <peter.mayd...@linaro.org> I am still early in debugging, but a git bisect pointed at this patch as the cause of: ERROR:../../tcg/tcg.c:4317:temp_load: code should not be reached Bail out! ERROR:../../tcg/tcg.c:4317:temp_load: code should not be reached Aborted This happens just after "Run /sbin/init", on an arm64 'virt' machine running on an x86 host with '-cpu max'. Reverting only this patch results in length-check failures, so I reverted the series as follows: revert: Revert "target/arm: Use tcg_gen_qemu_{ld, st}_i128 in gen_sve_{ld, st}r" revert: Revert "target/arm: Sink gen_mte_check1 into load/store_exclusive" revert: Revert "target/arm: Load/store integer pair with one tcg operation" revert: Revert "target/arm: Hoist finalize_memop out of do_gpr_{ld, st}" revert: Revert "target/arm: Hoist finalize_memop out of do_fp_{ld, st}" revert: Revert "target/arm: Pass memop to gen_mte_check1*" revert: Revert "target/arm: Pass single_memop to gen_mte_checkN" revert: Revert "target/arm: Check alignment in helper_mte_check" revert: Revert "target/arm: Add SCTLR.nAA to TBFLAG_A64" revert: Revert "target/arm: Relax ordered/atomic alignment checks for LSE2" revert: Revert "target/arm: Move mte check for store-exclusive" and all is good — though that is probably massive overkill. 
Jonathan > --- > target/arm/tcg/translate-sve.c | 95 +++++++++++++++++++++++++--------- > 1 file changed, 70 insertions(+), 25 deletions(-) > > diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c > index d9d5810ddea..57051a4729a 100644 > --- a/target/arm/tcg/translate-sve.c > +++ b/target/arm/tcg/translate-sve.c > @@ -4167,11 +4167,12 @@ TRANS_FEAT(UCVTF_dd, aa64_sve, gen_gvec_fpst_arg_zpz, > void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs, > int len, int rn, int imm) > { > - int len_align = QEMU_ALIGN_DOWN(len, 8); > - int len_remain = len % 8; > - int nparts = len / 8 + ctpop8(len_remain); > + int len_align = QEMU_ALIGN_DOWN(len, 16); > + int len_remain = len % 16; > + int nparts = len / 16 + ctpop8(len_remain); > int midx = get_mem_index(s); > TCGv_i64 dirty_addr, clean_addr, t0, t1; > + TCGv_i128 t16; > > dirty_addr = tcg_temp_new_i64(); > tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm); > @@ -4188,10 +4189,16 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int > vofs, > int i; > > t0 = tcg_temp_new_i64(); > - for (i = 0; i < len_align; i += 8) { > - tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ); > + t1 = tcg_temp_new_i64(); > + t16 = tcg_temp_new_i128(); > + > + for (i = 0; i < len_align; i += 16) { > + tcg_gen_qemu_ld_i128(t16, clean_addr, midx, > + MO_LE | MO_128 | MO_ATOM_NONE); > + tcg_gen_extr_i128_i64(t0, t1, t16); > tcg_gen_st_i64(t0, base, vofs + i); > - tcg_gen_addi_i64(clean_addr, clean_addr, 8); > + tcg_gen_st_i64(t1, base, vofs + i + 8); > + tcg_gen_addi_i64(clean_addr, clean_addr, 16); > } > } else { > TCGLabel *loop = gen_new_label(); > @@ -4200,14 +4207,21 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int > vofs, > tcg_gen_movi_ptr(i, 0); > gen_set_label(loop); > > - t0 = tcg_temp_new_i64(); > - tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ); > - tcg_gen_addi_i64(clean_addr, clean_addr, 8); > + t16 = tcg_temp_new_i128(); > + tcg_gen_qemu_ld_i128(t16, clean_addr, midx, > + MO_LE | MO_128 
| MO_ATOM_NONE); > + tcg_gen_addi_i64(clean_addr, clean_addr, 16); > > tp = tcg_temp_new_ptr(); > tcg_gen_add_ptr(tp, base, i); > - tcg_gen_addi_ptr(i, i, 8); > + tcg_gen_addi_ptr(i, i, 16); > + > + t0 = tcg_temp_new_i64(); > + t1 = tcg_temp_new_i64(); > + tcg_gen_extr_i128_i64(t0, t1, t16); > + > tcg_gen_st_i64(t0, tp, vofs); > + tcg_gen_st_i64(t1, tp, vofs + 8); > > tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); > } > @@ -4216,6 +4230,16 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int > vofs, > * Predicate register loads can be any multiple of 2. > * Note that we still store the entire 64-bit unit into cpu_env. > */ > + if (len_remain >= 8) { > + t0 = tcg_temp_new_i64(); > + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ | MO_ATOM_NONE); > + tcg_gen_st_i64(t0, base, vofs + len_align); > + len_remain -= 8; > + len_align += 8; > + if (len_remain) { > + tcg_gen_addi_i64(clean_addr, clean_addr, 8); > + } > + } > if (len_remain) { > t0 = tcg_temp_new_i64(); > switch (len_remain) { > @@ -4223,14 +4247,14 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int > vofs, > case 4: > case 8: > tcg_gen_qemu_ld_i64(t0, clean_addr, midx, > - MO_LE | ctz32(len_remain)); > + MO_LE | ctz32(len_remain) | MO_ATOM_NONE); > break; > > case 6: > t1 = tcg_temp_new_i64(); > - tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL); > + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL | > MO_ATOM_NONE); > tcg_gen_addi_i64(clean_addr, clean_addr, 4); > - tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW); > + tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW | > MO_ATOM_NONE); > tcg_gen_deposit_i64(t0, t0, t1, 32, 32); > break; > > @@ -4245,11 +4269,12 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int > vofs, > void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs, > int len, int rn, int imm) > { > - int len_align = QEMU_ALIGN_DOWN(len, 8); > - int len_remain = len % 8; > - int nparts = len / 8 + ctpop8(len_remain); > + int len_align = QEMU_ALIGN_DOWN(len, 16); > 
+ int len_remain = len % 16; > + int nparts = len / 16 + ctpop8(len_remain); > int midx = get_mem_index(s); > - TCGv_i64 dirty_addr, clean_addr, t0; > + TCGv_i64 dirty_addr, clean_addr, t0, t1; > + TCGv_i128 t16; > > dirty_addr = tcg_temp_new_i64(); > tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm); > @@ -4267,10 +4292,15 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int > vofs, > int i; > > t0 = tcg_temp_new_i64(); > + t1 = tcg_temp_new_i64(); > + t16 = tcg_temp_new_i128(); > for (i = 0; i < len_align; i += 8) { > tcg_gen_ld_i64(t0, base, vofs + i); > - tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ); > - tcg_gen_addi_i64(clean_addr, clean_addr, 8); > + tcg_gen_ld_i64(t1, base, vofs + i + 8); > + tcg_gen_concat_i64_i128(t16, t0, t1); > + tcg_gen_qemu_st_i128(t16, clean_addr, midx, > + MO_LE | MO_128 | MO_ATOM_NONE); > + tcg_gen_addi_i64(clean_addr, clean_addr, 16); > } > } else { > TCGLabel *loop = gen_new_label(); > @@ -4280,18 +4310,33 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int > vofs, > gen_set_label(loop); > > t0 = tcg_temp_new_i64(); > + t1 = tcg_temp_new_i64(); > tp = tcg_temp_new_ptr(); > tcg_gen_add_ptr(tp, base, i); > tcg_gen_ld_i64(t0, tp, vofs); > - tcg_gen_addi_ptr(i, i, 8); > + tcg_gen_ld_i64(t1, tp, vofs + 8); > + tcg_gen_addi_ptr(i, i, 16); > > - tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ); > - tcg_gen_addi_i64(clean_addr, clean_addr, 8); > + t16 = tcg_temp_new_i128(); > + tcg_gen_concat_i64_i128(t16, t0, t1); > + > + tcg_gen_qemu_st_i128(t16, clean_addr, midx, MO_LEUQ); > + tcg_gen_addi_i64(clean_addr, clean_addr, 16); > > tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); > } > > /* Predicate register stores can be any multiple of 2. 
*/ > + if (len_remain >= 8) { > + t0 = tcg_temp_new_i64(); > + tcg_gen_st_i64(t0, base, vofs + len_align); > + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ | MO_ATOM_NONE); > + len_remain -= 8; > + len_align += 8; > + if (len_remain) { > + tcg_gen_addi_i64(clean_addr, clean_addr, 8); > + } > + } > if (len_remain) { > t0 = tcg_temp_new_i64(); > tcg_gen_ld_i64(t0, base, vofs + len_align); > @@ -4301,14 +4346,14 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int > vofs, > case 4: > case 8: > tcg_gen_qemu_st_i64(t0, clean_addr, midx, > - MO_LE | ctz32(len_remain)); > + MO_LE | ctz32(len_remain) | MO_ATOM_NONE); > break; > > case 6: > - tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL); > + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL | > MO_ATOM_NONE); > tcg_gen_addi_i64(clean_addr, clean_addr, 4); > tcg_gen_shri_i64(t0, t0, 32); > - tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW); > + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW | > MO_ATOM_NONE); > break; > > default: