Re: [Qemu-devel] [PATCH v2 07/32] arm/translate-a64: implement half-precision F(MIN|MAX)(V|NMV)

2018-02-08 Thread Richard Henderson
On 02/08/2018 09:31 AM, Alex Bennée wrote:
> +DEF_HELPER_3(advsimd_maxh, f16, f16, f16, ptr)
> +DEF_HELPER_3(advsimd_minh, f16, f16, f16, ptr)
> +DEF_HELPER_3(advsimd_maxnumh, f16, f16, f16, ptr)
> +DEF_HELPER_3(advsimd_minnumh, f16, f16, f16, ptr)

DEF_HELPER_FLAGS_3 with TCG_CALL_NO_RWG.


r~



Re: [Qemu-devel] [PATCH v2 07/32] arm/translate-a64: implement half-precision F(MIN|MAX)(V|NMV)

2018-02-08 Thread Richard Henderson
On 02/08/2018 09:31 AM, Alex Bennée wrote:
> This implements the half-precision variants of the across vector
> reduction operations. This involves a re-factor of the reduction code
> which more closely matches the ARM ARM order (and handles 8 element
> reductions).
> 
> Signed-off-by: Alex Bennée 
> 
> --
> v1
>   - dropped the advsimd_2a stuff
> v2
>   - fixed up checkpatch
> ---
>  target/arm/helper-a64.c|  18 ++
>  target/arm/helper-a64.h|   4 ++
>  target/arm/translate-a64.c | 144 -
>  3 files changed, 111 insertions(+), 55 deletions(-)

Reviewed-by: Richard Henderson 


r~




[Qemu-devel] [PATCH v2 07/32] arm/translate-a64: implement half-precision F(MIN|MAX)(V|NMV)

2018-02-08 Thread Alex Bennée
This implements the half-precision variants of the across vector
reduction operations. This involves a re-factor of the reduction code
which more closely matches the ARM ARM order (and handles 8 element
reductions).

Signed-off-by: Alex Bennée 

--
v1
  - dropped the advsimd_2a stuff
v2
  - fixed up checkpatch
---
 target/arm/helper-a64.c|  18 ++
 target/arm/helper-a64.h|   4 ++
 target/arm/translate-a64.c | 144 -
 3 files changed, 111 insertions(+), 55 deletions(-)

diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c
index 10e08bdc1f..fddd5d242b 100644
--- a/target/arm/helper-a64.c
+++ b/target/arm/helper-a64.c
@@ -572,3 +572,21 @@ uint64_t HELPER(paired_cmpxchg64_be_parallel)(CPUARMState *env, uint64_t addr,
 {
 return do_paired_cmpxchg64_be(env, addr, new_lo, new_hi, true, GETPC());
 }
+
+/*
+ * AdvSIMD half-precision
+ */
+
+#define ADVSIMD_HELPER(name, suffix) HELPER(glue(glue(advsimd_, name), suffix))
+
+#define ADVSIMD_HALFOP(name) \
+float16 ADVSIMD_HELPER(name, h)(float16 a, float16 b, void *fpstp) \
+{ \
+float_status *fpst = fpstp; \
+return float16_ ## name(a, b, fpst);\
+}
+
+ADVSIMD_HALFOP(min)
+ADVSIMD_HALFOP(max)
+ADVSIMD_HALFOP(minnum)
+ADVSIMD_HALFOP(maxnum)
diff --git a/target/arm/helper-a64.h b/target/arm/helper-a64.h
index 85d86741db..b69a557241 100644
--- a/target/arm/helper-a64.h
+++ b/target/arm/helper-a64.h
@@ -48,3 +48,7 @@ DEF_HELPER_FLAGS_4(paired_cmpxchg64_le_parallel, TCG_CALL_NO_WG,
 DEF_HELPER_FLAGS_4(paired_cmpxchg64_be, TCG_CALL_NO_WG, i64, env, i64, i64, i64)
 DEF_HELPER_FLAGS_4(paired_cmpxchg64_be_parallel, TCG_CALL_NO_WG,
i64, env, i64, i64, i64)
+DEF_HELPER_3(advsimd_maxh, f16, f16, f16, ptr)
+DEF_HELPER_3(advsimd_minh, f16, f16, f16, ptr)
+DEF_HELPER_3(advsimd_maxnumh, f16, f16, f16, ptr)
+DEF_HELPER_3(advsimd_minnumh, f16, f16, f16, ptr)
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 531ac5999c..f778886abc 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -5650,26 +5650,75 @@ static void disas_simd_zip_trn(DisasContext *s, uint32_t insn)
 tcg_temp_free_i64(tcg_resh);
 }
 
-static void do_minmaxop(DisasContext *s, TCGv_i32 tcg_elt1, TCGv_i32 tcg_elt2,
-int opc, bool is_min, TCGv_ptr fpst)
-{
-/* Helper function for disas_simd_across_lanes: do a single precision
- * min/max operation on the specified two inputs,
- * and return the result in tcg_elt1.
- */
-if (opc == 0xc) {
-if (is_min) {
-gen_helper_vfp_minnums(tcg_elt1, tcg_elt1, tcg_elt2, fpst);
-} else {
-gen_helper_vfp_maxnums(tcg_elt1, tcg_elt1, tcg_elt2, fpst);
-}
+/*
+ * do_reduction_op helper
+ *
+ * This mirrors the Reduce() pseudocode in the ARM ARM. It is
+ * important for correct NaN propagation that we do these
+ * operations in exactly the order specified by the pseudocode.
+ *
+ * This is a recursive function, TCG temps should be freed by the
+ * calling function once it is done with the values.
+ */
+static TCGv_i32 do_reduction_op(DisasContext *s, int fpopcode, int rn,
+int esize, int size, int vmap, TCGv_ptr fpst)
+{
+if (esize == size) {
+int element;
+TCGMemOp msize = esize == 16 ? MO_16 : MO_32;
+TCGv_i32 tcg_elem;
+
+/* We should have one register left here */
+assert(ctpop8(vmap) == 1);
+element = ctz32(vmap);
+assert(element < 8);
+
+tcg_elem = tcg_temp_new_i32();
+read_vec_element_i32(s, tcg_elem, rn, element, msize);
+return tcg_elem;
 } else {
-assert(opc == 0xf);
-if (is_min) {
-gen_helper_vfp_mins(tcg_elt1, tcg_elt1, tcg_elt2, fpst);
-} else {
-gen_helper_vfp_maxs(tcg_elt1, tcg_elt1, tcg_elt2, fpst);
+int bits = size / 2;
+int shift = ctpop8(vmap) / 2;
+int vmap_lo = (vmap >> shift) & vmap;
+int vmap_hi = (vmap & ~vmap_lo);
+TCGv_i32 tcg_hi, tcg_lo, tcg_res;
+
+tcg_hi = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_hi, fpst);
+tcg_lo = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_lo, fpst);
+tcg_res = tcg_temp_new_i32();
+
+switch (fpopcode) {
+case 0x0c: /* fmaxnmv half-precision */
+gen_helper_advsimd_maxnumh(tcg_res, tcg_lo, tcg_hi, fpst);
+break;
+case 0x0f: /* fmaxv half-precision */
+gen_helper_advsimd_maxh(tcg_res, tcg_lo, tcg_hi, fpst);
+break;
+case 0x1c: /* fminnmv half-precision */
+gen_helper_advsimd_minnumh(tcg_res, tcg_lo, tcg_hi, fpst);
+break;
+case 0x1f: /* fminv half-precision */
+gen_helper_advsimd_minh(tcg_res, tcg_lo, tcg_hi, fpst);
+break;
+case 0x2c: /* fmaxnmv */
+gen_helper_vfp_maxnums(tcg_res,