Signed-off-by: Richard Henderson <[email protected]>
---
target/arm/tcg/helper-fp8-defs.h | 2 ++
target/arm/tcg/fp8_helper.c | 59 ++++++++++++++++++++++++++++++++
target/arm/tcg/translate-sme.c | 3 ++
target/arm/tcg/sme.decode | 3 ++
4 files changed, 67 insertions(+)
diff --git a/target/arm/tcg/helper-fp8-defs.h b/target/arm/tcg/helper-fp8-defs.h
index 5863a6dbb8..36ae977431 100644
--- a/target/arm/tcg/helper-fp8-defs.h
+++ b/target/arm/tcg/helper-fp8-defs.h
@@ -21,3 +21,5 @@ DEF_HELPER_FLAGS_4(sve2_fcvtn_bh, TCG_CALL_NO_RWG, void, ptr,
ptr, env, i32)
DEF_HELPER_FLAGS_5(advsimd_fcvt_bs, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env,
i32)
DEF_HELPER_FLAGS_4(sve2_fcvtnb_bs, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
DEF_HELPER_FLAGS_4(sve2_fcvtnt_bs, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_fcvt_bs, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_fcvtn_bs, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
diff --git a/target/arm/tcg/fp8_helper.c b/target/arm/tcg/fp8_helper.c
index e1d255f864..931ea54cb6 100644
--- a/target/arm/tcg/fp8_helper.c
+++ b/target/arm/tcg/fp8_helper.c
@@ -512,3 +512,62 @@ void HELPER(sve2_fcvtnt_bs)(void *vd, void *vn,
CPUARMState *env, uint32_t desc)
fp8_finish(env, &ctx);
}
+
+void HELPER(sme2_fcvt_bs)(void *vd, void *vn, CPUARMState *env, uint32_t desc)
+{ /* Convert 32-bit elements from four source registers at vn into one register of FP8 bytes at vd. */
+ ARMVectorReg scratch[4]; /* private copy of the four sources; only used when vd overlaps them */
+ FP8Context ctx = fp8_dst_start(env, desc, false);
+ fcvt_fp8_output_fn *output_fmt = fcvt_fp8_output_fmt[ctx.f8fmt]; /* output routine selected by ctx.f8fmt */
+ uint32_t *n = vn;
+ uint8_t *d = vd;
+ bool osc = FIELD_EX64(env->vfp.fpmr, FPMR, OSC); /* OSC field of FPMR, forwarded to every conversion */
+ size_t oprsz = simd_oprsz(desc);
+ size_t nelem = oprsz / 4; /* number of 32-bit elements read from each source register */
+ size_t stride = sizeof(ARMVectorReg) / 4; /* distance between consecutive source registers, in uint32_t units */
+ /* Outputs for j == 0 are stored before inputs for j > 0 are read, so take a copy when vectors_overlap() reports vd (1 reg) overlapping vn (4 regs). */
+ if (vectors_overlap(vd, 1, vn, 4)) {
+ n = memcpy(scratch, vn, sizeof(scratch));
+ }
+ /* Non-interleaved result: before the H1() index adjustment, element i of source j lands in output byte i + nelem * j. */
+ for (size_t i = 0; i < nelem; i++) {
+ for (size_t j = 0; j < 4; j++) {
+ d[H1(i + nelem * j)] = fcvt_f32_to_fp8(n[H4(i) + stride * j],
+ output_fmt, ctx.scale,
+ osc, &ctx.stat);
+ }
+ }
+ /* NOTE(review): fp8_finish() presumably folds the status accumulated in ctx.stat back into env -- confirm against its definition. */
+ fp8_finish(env, &ctx);
+}
+
+void HELPER(sme2_fcvtn_bs)(void *vd, void *vn, CPUARMState *env, uint32_t desc)
+{ /* Convert 32-bit elements from four source registers at vn into FP8 bytes at vd, interleaving the four sources. */
+ FP8Context ctx = fp8_dst_start(env, desc, false);
+ fcvt_fp8_output_fn *output_fmt = fcvt_fp8_output_fmt[ctx.f8fmt]; /* output routine selected by ctx.f8fmt */
+ uint32_t *n0 = vn; /* n0..n3: the four source registers, sizeof(ARMVectorReg) bytes apart */
+ uint32_t *n1 = vn + sizeof(ARMVectorReg); /* byte-granular arithmetic on void * (GNU C extension) */
+ uint32_t *n2 = vn + sizeof(ARMVectorReg) * 2;
+ uint32_t *n3 = vn + sizeof(ARMVectorReg) * 3;
+ uint8_t *d = vd;
+ bool osc = FIELD_EX64(env->vfp.fpmr, FPMR, OSC); /* OSC field of FPMR, forwarded to every conversion */
+ size_t oprsz = simd_oprsz(desc);
+ size_t nelem = oprsz / 4; /* number of 32-bit elements read from each source register */
+ /* No scratch copy is taken: each iteration loads element i of all four sources before storing output bytes 4i..4i+3. */
+ for (size_t i = 0; i < nelem; ++i) {
+ float32 e0 = n0[H4(i)];
+ float32 e1 = n1[H4(i)];
+ float32 e2 = n2[H4(i)];
+ float32 e3 = n3[H4(i)];
+ /* NOTE(review): in-place use relies on H1(4i..4i+3) staying inside word H4(i) of the destination -- confirm for big-endian hosts. */
+ d[H1(4 * i + 0)] = fcvt_f32_to_fp8(e0, output_fmt,
+ ctx.scale, osc, &ctx.stat);
+ d[H1(4 * i + 1)] = fcvt_f32_to_fp8(e1, output_fmt,
+ ctx.scale, osc, &ctx.stat);
+ d[H1(4 * i + 2)] = fcvt_f32_to_fp8(e2, output_fmt,
+ ctx.scale, osc, &ctx.stat);
+ d[H1(4 * i + 3)] = fcvt_f32_to_fp8(e3, output_fmt,
+ ctx.scale, osc, &ctx.stat);
+ }
+ /* NOTE(review): fp8_finish() presumably folds the status accumulated in ctx.stat back into env -- confirm against its definition. */
+ fp8_finish(env, &ctx);
+}
diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c
index 050c3cfefe..2f79c458e1 100644
--- a/target/arm/tcg/translate-sme.c
+++ b/target/arm/tcg/translate-sme.c
@@ -1572,6 +1572,9 @@ static bool trans_FCVT_bh(DisasContext *s, arg_zz_n *a)
return true;
}
+TRANS_FEAT(FCVT_bs, aa64_sme2_f8cvt, do_f8cvt, a, gen_helper_sme2_fcvt_bs, 0)
+TRANS_FEAT(FCVTN_bs, aa64_sme2_f8cvt, do_f8cvt, a, gen_helper_sme2_fcvtn_bs, 0)
+
static bool do_zipuzp_4(DisasContext *s, arg_zz_e *a,
gen_helper_gvec_2 * const fn[5])
{
diff --git a/target/arm/tcg/sme.decode b/target/arm/tcg/sme.decode
index a02bcc0e22..2b9e41a75a 100644
--- a/target/arm/tcg/sme.decode
+++ b/target/arm/tcg/sme.decode
@@ -865,6 +865,9 @@ BF2CVTL 11000001 111 00110 111000 ..... ....1
@zz_2x1
FCVT_bh 11000001 001 00100 111000 ....0 ..... @zz_1x2
+FCVT_bs 11000001 001 10100 111000 ...00 ..... @zz_1x4
+FCVTN_bs 11000001 001 10100 111000 ...01 ..... @zz_1x4
+
ZIP_4 11000001 esz:2 1 10110 111000 ...00 ... 00 \
&zz_e zd=%zd_ax4 zn=%zn_ax4
ZIP_4 11000001 001 10111 111000 ...00 ... 00 \
--
2.43.0