Re: [PATCH 31/57] target/arm: Convert SUQADD and USQADD to gvec

2024-05-22 Thread Richard Henderson

On 5/22/24 03:01, Peter Maydell wrote:

> (Did you do much risu testing on this series?)


Yes.  Complete run of my test cases on every patch, with -cpu neoverse-n1, traces 
re-generated on aarch64.ci.  That said, qemu master does not pass 100%, failures not 
investigated, merely no regressions.



r~




Re: [PATCH 31/57] target/arm: Convert SUQADD and USQADD to gvec

2024-05-22 Thread Peter Maydell
On Mon, 6 May 2024 at 02:08, Richard Henderson wrote:
>
> Signed-off-by: Richard Henderson 
> ---
>  target/arm/helper.h|  16 +
>  target/arm/tcg/translate-a64.h |   6 ++
>  target/arm/tcg/gengvec64.c | 106 +++
>  target/arm/tcg/translate-a64.c | 113 ++---
>  target/arm/tcg/vec_helper.c|  64 +++
>  5 files changed, 241 insertions(+), 64 deletions(-)

Somewhere in this patch we break the setting of the QC bit
for SUQADD vector insns. (I was misreading my test insn
when I made my remark in the other thread, didn't
notice it was vector not scalar). Here's an isolated C
test example:

#include <stdio.h>
#include <stdint.h>

/*
 * Execute one vector SUQADD (four halfword lanes) and capture its effects.
 *
 * o:    destination buffer; 16 bytes of v0 are stored here after the insn.
 * a:    16 bytes loaded into v0 (the register SUQADD both reads and writes).
 * b:    16 bytes loaded into v1 (the other SUQADD operand).
 * fpsr: receives the FPSR value read back immediately after the insn.
 *
 * FPSR is written with zero before the instruction, so any bit found set
 * in *fpsr afterwards was set by the SUQADD itself.
 */
static void do_op(void *o, const void *a, const void *b, uint64_t *fpsr)
{
asm volatile(
"ld1 { v0.16b }, [%1]\n"
"ld1 { v1.16b }, [%2]\n"
"msr fpsr, xzr\n"
"suqadd v0.4h, v1.4h\n"
"mrs x5, fpsr\n"
"str x5, [%3]\n"
"st1 { v0.16b }, [%0]\n"
/* No outputs: all results go through memory, hence the "memory" clobber. */
: : "r"(o), "r"(a), "r"(b), "r"(fpsr) : "v0", "v1", "x5", "memory");
}

/*
 * Reproducer driver: feeds fixed operands to do_op() and prints the
 * operands, the resulting vector and the FPSR value read back after the
 * instruction, so the result and the FPSR bits can be inspected.
 *
 * Returns 0 unconditionally; the output is meant to be checked by eye.
 */
int main(void)
{
    uint64_t a[] = { 0xc000, 0x0 };
    uint64_t b[] = { 0x5000, 0x0 };
    uint64_t c[] = { 0, 0 };
    uint64_t fpsr = 0;

    /* %lx matches uint64_t on the LP64 aarch64-linux target this is built for. */
    printf("a: 0x%lx : %lx\n", a[1], a[0]);
    printf("b: 0x%lx : %lx\n", b[1], b[0]);

    /* Argument order follows do_op(o, a, b, fpsr): output buffer first. */
    do_op(&c, &a, &b, &fpsr);

    printf("result: 0x%lx : %lx\n", c[1], c[0]);
    printf("fpsr: 0x%lx\n", fpsr);
    return 0;
}
(build with aarch64-linux-gnu-gcc -o suqadd suqadd.c -static
and run with qemu-aarch64 suqadd)

After this patch we still generate the right result, but we
start setting the QC bit.

(Did you do much risu testing on this series?)

thanks
-- PMM



[PATCH 31/57] target/arm: Convert SUQADD and USQADD to gvec

2024-05-05 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/arm/helper.h|  16 +
 target/arm/tcg/translate-a64.h |   6 ++
 target/arm/tcg/gengvec64.c | 106 +++
 target/arm/tcg/translate-a64.c | 113 ++---
 target/arm/tcg/vec_helper.c|  64 +++
 5 files changed, 241 insertions(+), 64 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index f830531dd3..de2c5c9aef 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -836,6 +836,22 @@ DEF_HELPER_FLAGS_5(gvec_sqsub_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_sqsub_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_usqadd_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_usqadd_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_usqadd_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_usqadd_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_suqadd_b, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_suqadd_h, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_suqadd_s, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_suqadd_d, TCG_CALL_NO_RWG,
+   void, ptr, ptr, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_5(gvec_fmlal_a32, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/tcg/translate-a64.h b/target/arm/tcg/translate-a64.h
index 91750f0ca9..b5cb26f8a2 100644
--- a/target/arm/tcg/translate-a64.h
+++ b/target/arm/tcg/translate-a64.h
@@ -197,6 +197,12 @@ void gen_gvec_eor3(unsigned vece, uint32_t d, uint32_t n, 
uint32_t m,
uint32_t a, uint32_t oprsz, uint32_t maxsz);
 void gen_gvec_bcax(unsigned vece, uint32_t d, uint32_t n, uint32_t m,
uint32_t a, uint32_t oprsz, uint32_t maxsz);
+void gen_gvec_suqadd_qc(unsigned vece, uint32_t rd_ofs,
+uint32_t rn_ofs, uint32_t rm_ofs,
+uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_usqadd_qc(unsigned vece, uint32_t rd_ofs,
+uint32_t rn_ofs, uint32_t rm_ofs,
+uint32_t opr_sz, uint32_t max_sz);
 
 void gen_sve_ldr(DisasContext *s, TCGv_ptr, int vofs, int len, int rn, int 
imm);
 void gen_sve_str(DisasContext *s, TCGv_ptr, int vofs, int len, int rn, int 
imm);
diff --git a/target/arm/tcg/gengvec64.c b/target/arm/tcg/gengvec64.c
index 093b498b13..201a719bc1 100644
--- a/target/arm/tcg/gengvec64.c
+++ b/target/arm/tcg/gengvec64.c
@@ -188,3 +188,109 @@ void gen_gvec_bcax(unsigned vece, uint32_t d, uint32_t n, 
uint32_t m,
 tcg_gen_gvec_4(d, n, m, a, oprsz, maxsz, &op);
 }
 
+static void gen_suqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
+   TCGv_vec a, TCGv_vec b)
+{
+TCGv_vec max =
+tcg_constant_vec_matching(t, vece, (1ull << ((8 << vece) - 1)) - 1);
+TCGv_vec u = tcg_temp_new_vec_matching(t);
+
+/* Maximum value that can be added to @a without overflow. */
+tcg_gen_sub_vec(vece, u, max, a);
+
+/* Constrain addend so that the next addition never overflows. */
+tcg_gen_umin_vec(vece, t, u, b);
+tcg_gen_add_vec(vece, t, t, a);
+
+/* Compute QC by comparing the adjusted @b. */
+tcg_gen_xor_vec(vece, u, u, b);
+tcg_gen_or_vec(vece, qc, qc, u);
+}
+
+void gen_gvec_suqadd_qc(unsigned vece, uint32_t rd_ofs,
+uint32_t rn_ofs, uint32_t rm_ofs,
+uint32_t opr_sz, uint32_t max_sz)
+{
+static const TCGOpcode vecop_list[] = {
+INDEX_op_add_vec, INDEX_op_sub_vec, INDEX_op_umin_vec, 0
+};
+static const GVecGen4 ops[4] = {
+{ .fniv = gen_suqadd_vec,
+  .fno = gen_helper_gvec_suqadd_b,
+  .opt_opc = vecop_list,
+  .write_aofs = true,
+  .vece = MO_8 },
+{ .fniv = gen_suqadd_vec,
+  .fno = gen_helper_gvec_suqadd_h,
+  .opt_opc = vecop_list,
+  .write_aofs = true,
+  .vece = MO_16 },
+{ .fniv = gen_suqadd_vec,
+  .fno = gen_helper_gvec_suqadd_s,
+  .opt_opc = vecop_list,
+  .write_aofs = true,
+  .vece = MO_32 },
+{ .fniv = gen_suqadd_vec,
+  .fno = gen_helper_gvec_suqadd_d,
+  .opt_opc = vecop_list,
+  .write_aofs = true,
+  .vece = MO_64 },
+};
+tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
+   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
+}
+
+static void gen_usqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
+   TCGv_vec a, TCGv_vec b)
+{
+TCGv_vec u = tcg_temp_new_vec_matching(t);
+TCGv_vec z =