Signed-off-by: Richard Henderson <[email protected]>
---
target/arm/tcg/helper-defs.h | 5 ++++
target/arm/tcg/translate-a64.c | 38 +++++++++++++++++++++++++
target/arm/tcg/vec_helper.c | 52 ++++++++++++++++++++++++++++++++++
target/arm/tcg/a64.decode | 6 ++++
4 files changed, 101 insertions(+)
diff --git a/target/arm/tcg/helper-defs.h b/target/arm/tcg/helper-defs.h
index a05f2258f2..05ccf795e8 100644
--- a/target/arm/tcg/helper-defs.h
+++ b/target/arm/tcg/helper-defs.h
@@ -1122,3 +1122,8 @@ DEF_HELPER_FLAGS_4(sme2_luti4_2s, TCG_CALL_NO_RWG, void,
ptr, ptr, env, i32)
DEF_HELPER_FLAGS_4(sme2_luti4_4h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
DEF_HELPER_FLAGS_4(sme2_luti4_4s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+/* LUTI2/LUTI4 vector helpers; the three ptr arguments are vd, vn and vm. */
+DEF_HELPER_FLAGS_4(gvec_luti2_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_luti2_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_luti4_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_luti4_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 3c784afc99..508d8e377b 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -5405,6 +5405,44 @@ static bool trans_TBL_TBX(DisasContext *s, arg_TBL_TBX
*a)
return true;
}
+static bool do_lut_1(DisasContext *s, arg_rrx_e *a, gen_helper_gvec_3 *fn)
+{ /* Common path for the forms below: pass rd, rn, rm, a->idx and @fn on. */
+ if (fp_access_check(s)) { /* nothing is emitted when the check fails */
+ gen_gvec_op3_ool(s, true, a->rd, a->rn, a->rm, a->idx, fn);
+ }
+ return true; /* returned whether or not code was emitted */
+}
+/* Forms sharing do_lut_1; only the helper differs.  Feature arg: aa64_lut. */
+TRANS_FEAT(LUTI2_1b, aa64_lut, do_lut_1, a, gen_helper_gvec_luti2_b)
+TRANS_FEAT(LUTI2_1h, aa64_lut, do_lut_1, a, gen_helper_gvec_luti2_h)
+TRANS_FEAT(LUTI4_1b, aa64_lut, do_lut_1, a, gen_helper_gvec_luti4_b)
+
+static bool trans_LUTI4_2h(DisasContext *s, arg_rrx_e *a)
+{ /* Translate LUTI4_2h, whose table spans two vector registers. */
+ if (!dc_isar_feature(aa64_lut, s)) {
+ return false; /* aa64_lut feature absent */
+ }
+ if (fp_access_check(s)) { /* nothing is emitted when the check fails */
+ /*
+ * The table is Vn followed by V[(n + 1) % 32].  (Ab)use preg_tmp to merge
+ * these two disjoint 128-bit quantities into a sequential 256-bit table.
+ */
+ QEMU_BUILD_BUG_ON(sizeof_field(CPUARMState, vfp.preg_tmp) < 32);
+ unsigned tmp_ofs = offsetof(CPUARMState, vfp.preg_tmp);
+ unsigned rn0_ofs = vec_full_reg_offset(s, a->rn);
+ unsigned rn1_ofs = vec_full_reg_offset(s, (a->rn + 1) % 32);
+
+ tcg_gen_gvec_mov(MO_64, tmp_ofs, rn0_ofs, 16, 16); /* bytes 0-15 */
+ tcg_gen_gvec_mov(MO_64, tmp_ofs + 16, rn1_ofs, 16, 16); /* bytes 16-31 */
+
+ tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd), tmp_ofs,
+ vec_full_reg_offset(s, a->rm),
+ 16, vec_full_reg_size(s),
+ a->idx, gen_helper_gvec_luti4_h);
+ }
+ return true; /* returned whether or not code was emitted */
+}
+
typedef int simd_permute_idx_fn(int i, int part, int elements);
static bool do_simd_permute(DisasContext *s, arg_qrrr_e *a,
diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index 91e98d28ae..cb633817d7 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -3348,3 +3348,55 @@ DO_SME2_LUT(4,4,h, 2)
DO_SME2_LUT(4,4,s, 4)
#undef DO_SME2_LUT
+
+/* LUTI2, byte elements: look up into a temporary, then copy it out to vd. */
+void HELPER(gvec_luti2_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+    unsigned opr_sz = simd_oprsz(desc);    /* bytes, and also elements */
+    unsigned first = opr_sz * simd_data(desc);
+    ARMVectorReg tmp;
+
+    /* The data field of desc is the part number; it scales the index base. */
+    do_lut_b(&tmp, vm, vn, opr_sz, first, 0, 2, 8, 1);
+    memcpy(vd, &tmp, opr_sz);
+    clear_tail(vd, opr_sz, simd_maxsz(desc));
+}
+
+/* LUTI2, 2-byte elements: look up into a temporary, then copy it out to vd. */
+void HELPER(gvec_luti2_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+    unsigned opr_sz = simd_oprsz(desc);
+    unsigned nelem = opr_sz / 2;
+    unsigned first = nelem * simd_data(desc);  /* data field = part number */
+    ARMVectorReg tmp;
+
+    do_lut_h(&tmp, vm, vn, nelem, first, 0, 2, 16, 1);
+    memcpy(vd, &tmp, opr_sz);
+    clear_tail(vd, opr_sz, simd_maxsz(desc));
+}
+
+/* LUTI4, byte elements: look up into a temporary, then copy it out to vd. */
+void HELPER(gvec_luti4_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+    unsigned opr_sz = simd_oprsz(desc);    /* bytes, and also elements */
+    unsigned first = opr_sz * simd_data(desc);
+    ARMVectorReg tmp;
+
+    /* The data field of desc is the part number; it scales the index base. */
+    do_lut_b(&tmp, vm, vn, opr_sz, first, 0, 4, 8, 1);
+    memcpy(vd, &tmp, opr_sz);
+    clear_tail(vd, opr_sz, simd_maxsz(desc));
+}
+
+/* LUTI4, 2-byte elements: look up into a temporary, then copy it out to vd. */
+void HELPER(gvec_luti4_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+    unsigned opr_sz = simd_oprsz(desc);
+    unsigned nelem = opr_sz / 2;
+    unsigned first = nelem * simd_data(desc);  /* data field = part number */
+    ARMVectorReg tmp;
+
+    do_lut_h(&tmp, vm, vn, nelem, first, 0, 4, 16, 1);
+    memcpy(vd, &tmp, opr_sz);
+    clear_tail(vd, opr_sz, simd_maxsz(desc));
+}
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index a9cf259b9b..6aea3ce89f 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -1344,6 +1344,12 @@ EXT_q 0110 1110 00 0 rm:5 0 imm:4 0 rn:5 rd:5
TBL_TBX 0 q:1 00 1110 000 rm:5 0 len:2 tbx:1 00 rn:5 rd:5
+LUTI2_1b 0100 1110 100 rm:5 0 idx:2 100 rn:5 rd:5 &rrx_e esz=0 # -> gvec_luti2_b
+LUTI2_1h 0100 1110 110 rm:5 0 idx:3 00 rn:5 rd:5 &rrx_e esz=1 # -> gvec_luti2_h
+
+LUTI4_1b 0100 1110 010 rm:5 0 idx:1 1000 rn:5 rd:5 &rrx_e esz=0 # -> gvec_luti4_b
+LUTI4_2h 0100 1110 010 rm:5 0 idx:2 100 rn:5 rd:5 &rrx_e esz=1 # two-register table
+
# Advanced SIMD Permute
UZP1 0.00 1110 .. 0 ..... 0 001 10 ..... ..... @qrrr_e
--
2.43.0