Signed-off-by: Richard Henderson <[email protected]>
---
target/arm/cpu-features.h | 5 ++
target/arm/tcg/helper-fp8-defs.h | 3 +
target/arm/tcg/fp8_helper.c | 113 +++++++++++++++++++++++++++++++
target/arm/tcg/translate-a64.c | 16 +++++
target/arm/tcg/a64.decode | 8 +++
5 files changed, 145 insertions(+)
diff --git a/target/arm/cpu-features.h b/target/arm/cpu-features.h
index 007e656ed4..ee20d74164 100644
--- a/target/arm/cpu-features.h
+++ b/target/arm/cpu-features.h
@@ -1590,6 +1590,11 @@ static inline bool isar_feature_aa64_f8cvt(const ARMISARegisters *id)
return FIELD_EX64_IDREG(id, ID_AA64FPFR0, F8CVT);
}
+static inline bool isar_feature_aa64_f8fma(const ARMISARegisters *id)
+{
+ return FIELD_EX64_IDREG(id, ID_AA64FPFR0, F8FMA); /* F8FMA field of ID_AA64FPFR0 */
+}
+
/*
* Combinations of feature tests, for ease of use with TRANS_FEAT.
*/
diff --git a/target/arm/tcg/helper-fp8-defs.h b/target/arm/tcg/helper-fp8-defs.h
index 36ae977431..7aa8366d94 100644
--- a/target/arm/tcg/helper-fp8-defs.h
+++ b/target/arm/tcg/helper-fp8-defs.h
@@ -23,3 +23,6 @@ DEF_HELPER_FLAGS_4(sve2_fcvtnb_bs, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
DEF_HELPER_FLAGS_4(sve2_fcvtnt_bs, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
DEF_HELPER_FLAGS_4(sme2_fcvt_bs, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
DEF_HELPER_FLAGS_4(sme2_fcvtn_bs, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fmla_hb, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_5(gvec_fmla_idx_hb, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, env, i32)
diff --git a/target/arm/tcg/fp8_helper.c b/target/arm/tcg/fp8_helper.c
index 931ea54cb6..c873de65cd 100644
--- a/target/arm/tcg/fp8_helper.c
+++ b/target/arm/tcg/fp8_helper.c
@@ -571,3 +571,116 @@ void HELPER(sme2_fcvtn_bs)(void *vd, void *vn, CPUARMState *env, uint32_t desc)
fp8_finish(env, &ctx);
}
+
+typedef struct FP8MulContext {
+ float_status stat; /* working FP status, set up in fp8_mul_start() */
+ fp8_input_fn *fmt1; /* input decoder chosen by FPMR.F8S1 */
+ fp8_input_fn *fmt2; /* input decoder chosen by FPMR.F8S2 */
+ int scale; /* negated (FPMR.LSCALE & scale_mask), passed to scalbn */
+} FP8MulContext;
+
+static FP8MulContext fp8_mul_start(CPUARMState *env, int scale_mask)
+{ /* Build a multiply context from FPMR and a copy of the A64 FP status. */
+ uint64_t fpmr = env->vfp.fpmr;
+
+ FP8MulContext ret = {
+ .stat = env->vfp.fp_status[FPST_A64], /* copied by value */
+ .fmt1 = fp8_input_fmt[FIELD_EX64(fpmr, FPMR, F8S1)],
+ .fmt2 = fp8_input_fmt[FIELD_EX64(fpmr, FPMR, F8S2)],
+ .scale = -(FIELD_EX64(fpmr, FPMR, LSCALE) & scale_mask), /* stored negated */
+ };
+
+ set_flush_to_zero(0, &ret.stat); /* output flushing off in the copy */
+ set_flush_inputs_to_zero(0, &ret.stat); /* input flushing off in the copy */
+ set_default_nan_mode(true, &ret.stat); /* relied on by f8dot() */
+ set_float_rounding_mode(FIELD_EX64(fpmr, FPMR, OSM) /* OSM picks the mode */
+ ? float_round_nearest_even_max
+ : float_round_nearest_even, &ret.stat);
+
+ return ret;
+}
+
+static void fp8_mul_finish(CPUARMState *env, FP8MulContext *c)
+{
+ /*
+ * FP8 multiplies don't update FPSR.{IDC,IOC,IXC,UFC}.
+ * Since this is multiply-add, DZC does not apply and only OFC remains.
+ */
+ fp8_finish_fpst(&env->vfp.fp_status[FPST_A64], &c->stat, /* env status, local status */
+ float_flag_overflow); /* flag mask: overflow only */
+}
+
+static FloatParts64 f8dot(uint64_t a, uint64_t b, int n, FP8MulContext *ctx)
+{ /* Sum of products of the low n bytes of a and b, then scaled by ctx->scale. */
+ /*
+ * Because of default_nan_mode, NaNs need no special handling.
+ * We'll simply get the default NaN out at the end of the sequence.
+ */
+ FloatParts64 p0 = ctx->fmt1(a & 0xff, &ctx->stat); /* byte 0 of a */
+ FloatParts64 p1 = ctx->fmt2(b & 0xff, &ctx->stat); /* byte 0 of b */
+ FloatParts64 pr = parts64_mul(&p0, &p1, &ctx->stat); /* first product seeds pr */
+
+ for (int i = 1; i < n; ++i) { /* remaining bytes 1..n-1 */
+ p0 = ctx->fmt1(extract64(a, i * 8, 8), &ctx->stat);
+ p1 = ctx->fmt2(extract64(b, i * 8, 8), &ctx->stat);
+ pr = parts64_muladd(&p0, &p1, &pr, 0, &ctx->stat); /* p0 * p1 accumulated with pr */
+ }
+ return parts64_scalbn(&pr, ctx->scale, &ctx->stat); /* apply the stored scale */
+}
+
+static float16 f8dotadd_h(uint64_t a, uint64_t b, int n, float16 c,
+ FP8MulContext *ctx)
+{ /* Combine f8dot(a, b, n) with the float16 accumulator c; result packed as float16. */
+ FloatParts64 p0 = f8dot(a, b, n, ctx); /* scaled dot product */
+ FloatParts64 p1 = float16_unpack_canonical(c, &ctx->stat); /* accumulator input */
+
+ p0 = parts64_addsub(&p0, &p1, &ctx->stat, false); /* false: presumably selects add - confirm */
+ return float16_round_pack_canonical(&p0, &ctx->stat);
+}
+
+void HELPER(gvec_fmla_hb)(void *vd, void *vn, void *vm,
+ CPUARMState *env, uint32_t desc)
+{ /* For each float16 lane i: d[i] = f8dotadd_h(byte of n, byte of m, 1, d[i]). */
+ FP8MulContext ctx = fp8_mul_start(env, 0xf); /* keep low 4 bits of LSCALE */
+ bool high = extract32(desc, SIMD_DATA_SHIFT, 1); /* byte offset 0 or 1 within each 2-byte pair */
+ size_t oprsz = simd_oprsz(desc);
+ size_t nelem = oprsz / 2; /* count of 2-byte destination elements */
+ uint8_t *n = vn;
+ uint8_t *m = vm;
+ float16 *d = vd;
+
+ for (size_t i = 0; i < nelem; i++) {
+ uint8_t e0 = n[H1(2 * i + high)];
+ uint8_t e1 = m[H1(2 * i + high)];
+
+ d[H2(i)] = f8dotadd_h(e0, e1, 1, d[H2(i)], &ctx); /* n = 1: one product, accumulated in place */
+ }
+
+ fp8_mul_finish(env, &ctx);
+ clear_tail(vd, oprsz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fmla_idx_hb)(void *vd, void *vn, void *vm,
+ CPUARMState *env, uint32_t desc)
+{ /* Indexed form: one byte of m is reused for each group of 8 destination lanes. */
+ FP8MulContext ctx = fp8_mul_start(env, 0xf); /* keep low 4 bits of LSCALE */
+ bool idx_n = extract32(desc, SIMD_DATA_SHIFT, 1); /* data bit 0: byte offset within each pair of n */
+ size_t idx_m = extract32(desc, SIMD_DATA_SHIFT + 2, 4); /* data bits 2..5: byte index into m */
+ size_t oprsz = simd_oprsz(desc);
+ size_t nelem = oprsz / 2; /* count of 2-byte destination elements */
+ uint8_t *n = vn;
+ uint8_t *m = vm;
+ float16 *d = vd;
+ size_t i = 0;
+
+ do {
+ uint8_t e1 = m[2 * i + H1(idx_m)]; /* loaded once per group of 8 lanes */
+ do {
+ uint8_t e0 = n[H1(2 * i + idx_n)];
+ d[H2(i)] = f8dotadd_h(e0, e1, 1, d[H2(i)], &ctx); /* accumulated in place */
+ } while (++i % 8 != 0); /* group ends when i reaches a multiple of 8 */
+ } while (i < nelem);
+
+ fp8_mul_finish(env, &ctx);
+ clear_tail(vd, oprsz, simd_maxsz(desc));
+}
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index ee71c63116..1c1d4ad2f7 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -7384,6 +7384,22 @@ TRANS_FEAT(FMLSL_vi, aa64_fhm, do_fmlal_idx, a, true, false)
TRANS_FEAT(FMLAL2_vi, aa64_fhm, do_fmlal_idx, a, false, true)
TRANS_FEAT(FMLSL2_vi, aa64_fhm, do_fmlal_idx, a, true, true)
+static bool do_fmla_fp8(DisasContext *s, arg_rxx *a,
+ gen_helper_gvec_3_ptr *fn)
+{ /* Emit a 3-operand gvec call to fn, gated on the FPMR and FP access checks. */
+ if (fpmr_access_check(s) && fp_access_check(s)) {
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vec_full_reg_offset(s, a->rm),
+ tcg_env, 16, vec_full_reg_size(s), /* literal 16, then the full register size */
+ a->idxn | (a->idxm << 2), fn); /* data: idxn in bit 0, idxm from bit 2 */
+ }
+ return true; /* returned whether or not code was emitted */
+}
+
+TRANS_FEAT(FMLAL_hb_v, aa64_f8fma, do_fmla_fp8, a, gen_helper_gvec_fmla_hb)
+TRANS_FEAT(FMLAL_hb_vi, aa64_f8fma, do_fmla_fp8, a, gen_helper_gvec_fmla_idx_hb)
+
static bool do_int3_vector_idx(DisasContext *s, arg_qrrx_e *a,
gen_helper_gvec_3 * const fns[2])
{
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index 6aea3ce89f..b89e83ce76 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -25,6 +25,7 @@
%esz_hsd 22:2 !function=xor_2
%hl 11:1 21:1
%hlm 11:1 20:2
+%hlm4 11:1 19:3
&r rn
&rrr rd rn rm
@@ -38,6 +39,7 @@
&rri_e rd rn imm esz
&rrr_e rd rn rm esz
&rrx_e rd rn rm idx esz
+&rxx rd rn rm idxn idxm
&rrrr_e rd rn rm ra esz
&qrr_e q rd rn esz
&qrri_e q rd rn imm esz
@@ -1204,6 +1206,9 @@ FSCALE 0.10 1110 1.1 ..... 11111 1 ..... ..... @qrrr_sd
FCVTN_bh 0.00 1110 010 ..... 11110 1 ..... ..... @qrrr_h
FCVTN_bs 0.00 1110 000 ..... 11110 1 ..... ..... @qrrr_h
+FMLAL_hb_v 0 idxn:1 00 1110 110 rm:5 11111 1 rn:5 rd:5 \
+ &rxx idxm=0
+
### Advanced SIMD scalar x indexed element
FMUL_si 0101 1111 00 .. .... 1001 . 0 ..... ..... @rrx_h
@@ -1322,6 +1327,9 @@ SQDMLAL_vi 0.00 1111 10 . ..... 0011 . 0 ..... ..... @qrrx_s
SQDMLSL_vi 0.00 1111 01 .. .... 0111 . 0 ..... ..... @qrrx_h
SQDMLSL_vi 0.00 1111 10 . ..... 0111 . 0 ..... ..... @qrrx_s
+FMLAL_hb_vi 0 idxn:1 00 1111 11 ... rm:3 0000 . 0 rn:5 rd:5 \
+ &rxx idxm=%hlm4
+
# Floating-point conditional select
FCSEL 0001 1110 .. 1 rm:5 cond:4 11 rn:5 rd:5 esz=%esz_hsd
--
2.43.0