Re: [Qemu-devel] [PATCH 1/4] target/arm: Add helpers for FMLAL and FMLSL
On Thu, Feb 14, 2019 at 3:56 PM Richard Henderson wrote: > > On 2/14/19 1:16 AM, Laurent Desnogues wrote: > > Hello, > > > > On Thu, Feb 14, 2019 at 5:00 AM Richard Henderson > > wrote: > >> > >> Note that float16_to_float32 rightly squashes SNaN to QNaN. > >> But of course pickNaNMulAdd, for ARM, selects SNaNs first. > >> So we have to preserve SNaN long enough for the correct NaN > >> to be selected. Thus float16_to_float32_by_bits. > >> > >> Signed-off-by: Richard Henderson > >> --- > >> target/arm/helper.h | 9 +++ > >> target/arm/vec_helper.c | 154 > >> 2 files changed, 163 insertions(+) > >> > >> diff --git a/target/arm/helper.h b/target/arm/helper.h > >> index 53a38188c6..0302e13604 100644 > >> --- a/target/arm/helper.h > >> +++ b/target/arm/helper.h > >> @@ -653,6 +653,15 @@ DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG, > >> DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG, > >> void, ptr, ptr, ptr, ptr, ptr, i32) > >> > >> +DEF_HELPER_FLAGS_5(gvec_fmlal_h, TCG_CALL_NO_RWG, > >> + void, ptr, ptr, ptr, ptr, i32) > >> +DEF_HELPER_FLAGS_5(gvec_fmlsl_h, TCG_CALL_NO_RWG, > >> + void, ptr, ptr, ptr, ptr, i32) > >> +DEF_HELPER_FLAGS_5(gvec_fmlal_idx_h, TCG_CALL_NO_RWG, > >> + void, ptr, ptr, ptr, ptr, i32) > >> +DEF_HELPER_FLAGS_5(gvec_fmlsl_idx_h, TCG_CALL_NO_RWG, > >> + void, ptr, ptr, ptr, ptr, i32) > >> + > >> #ifdef TARGET_AARCH64 > >> #include "helper-a64.h" > >> #include "helper-sve.h" > >> diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c > >> index 37f338732e..0c3b3de961 100644 > >> --- a/target/arm/vec_helper.c > >> +++ b/target/arm/vec_helper.c > >> @@ -766,3 +766,157 @@ DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) > >> DO_FMLA_IDX(gvec_fmla_idx_d, float64, ) > >> > >> #undef DO_FMLA_IDX > >> + > >> +/* > >> + * Convert float16 to float32, raising no exceptions and > >> + * preserving exceptional values, including SNaN. > >> + * This is effectively an unpack+repack operation. 
> >> + */ > >> +static float32 float16_to_float32_by_bits(uint32_t f16) > >> +{ > >> +const int f16_bias = 15; > >> +const int f32_bias = 127; > >> +uint32_t sign = extract32(f16, 15, 1); > >> +uint32_t exp = extract32(f16, 10, 5); > >> +uint32_t frac = extract32(f16, 0, 10); > >> + > >> +if (exp == 0x1f) { > >> +/* Inf or NaN */ > >> +exp = 0xff; > >> +} else if (exp == 0) { > >> +/* Zero or denormal. */ > >> +if (frac != 0) { > >> +/* > >> + * Denormal; these are all normal float32. > >> + * Shift the fraction so that the msb is at bit 11, > >> + * then remove bit 11 as the implicit bit of the > >> + * normalized float32. Note that we still go through > >> + * the shift for normal numbers below, to put the > >> + * float32 fraction at the right place. > >> + */ > >> +int shift = clz32(frac) - 21; > >> +frac = (frac << shift) & 0x3ff; > >> +exp = f32_bias - f16_bias - shift + 1; > > > > If FZ16 is set, this should flush to zero. > > Ho, hum, yes it should. > > > This means you will have to use both fp_status (for the muladd) and > > fp_status_f16 (for this function) and so you should pass cpu_env to > > the helpers rather than the fp_status. > > It's not quite as simple as that, because aa32 mode would pass > standard_fp_status. I'll figure something out... Ha yes, I only looked at AArch64... as usual :-( Thanks, Laurent > > r~
Re: [Qemu-devel] [PATCH 1/4] target/arm: Add helpers for FMLAL and FMLSL
On 2/14/19 1:16 AM, Laurent Desnogues wrote: > Hello, > > On Thu, Feb 14, 2019 at 5:00 AM Richard Henderson > wrote: >> >> Note that float16_to_float32 rightly squashes SNaN to QNaN. >> But of course pickNaNMulAdd, for ARM, selects SNaNs first. >> So we have to preserve SNaN long enough for the correct NaN >> to be selected. Thus float16_to_float32_by_bits. >> >> Signed-off-by: Richard Henderson >> --- >> target/arm/helper.h | 9 +++ >> target/arm/vec_helper.c | 154 >> 2 files changed, 163 insertions(+) >> >> diff --git a/target/arm/helper.h b/target/arm/helper.h >> index 53a38188c6..0302e13604 100644 >> --- a/target/arm/helper.h >> +++ b/target/arm/helper.h >> @@ -653,6 +653,15 @@ DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG, >> DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG, >> void, ptr, ptr, ptr, ptr, ptr, i32) >> >> +DEF_HELPER_FLAGS_5(gvec_fmlal_h, TCG_CALL_NO_RWG, >> + void, ptr, ptr, ptr, ptr, i32) >> +DEF_HELPER_FLAGS_5(gvec_fmlsl_h, TCG_CALL_NO_RWG, >> + void, ptr, ptr, ptr, ptr, i32) >> +DEF_HELPER_FLAGS_5(gvec_fmlal_idx_h, TCG_CALL_NO_RWG, >> + void, ptr, ptr, ptr, ptr, i32) >> +DEF_HELPER_FLAGS_5(gvec_fmlsl_idx_h, TCG_CALL_NO_RWG, >> + void, ptr, ptr, ptr, ptr, i32) >> + >> #ifdef TARGET_AARCH64 >> #include "helper-a64.h" >> #include "helper-sve.h" >> diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c >> index 37f338732e..0c3b3de961 100644 >> --- a/target/arm/vec_helper.c >> +++ b/target/arm/vec_helper.c >> @@ -766,3 +766,157 @@ DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) >> DO_FMLA_IDX(gvec_fmla_idx_d, float64, ) >> >> #undef DO_FMLA_IDX >> + >> +/* >> + * Convert float16 to float32, raising no exceptions and >> + * preserving exceptional values, including SNaN. >> + * This is effectively an unpack+repack operation. 
>> + */ >> +static float32 float16_to_float32_by_bits(uint32_t f16) >> +{ >> +const int f16_bias = 15; >> +const int f32_bias = 127; >> +uint32_t sign = extract32(f16, 15, 1); >> +uint32_t exp = extract32(f16, 10, 5); >> +uint32_t frac = extract32(f16, 0, 10); >> + >> +if (exp == 0x1f) { >> +/* Inf or NaN */ >> +exp = 0xff; >> +} else if (exp == 0) { >> +/* Zero or denormal. */ >> +if (frac != 0) { >> +/* >> + * Denormal; these are all normal float32. >> + * Shift the fraction so that the msb is at bit 11, >> + * then remove bit 11 as the implicit bit of the >> + * normalized float32. Note that we still go through >> + * the shift for normal numbers below, to put the >> + * float32 fraction at the right place. >> + */ >> +int shift = clz32(frac) - 21; >> +frac = (frac << shift) & 0x3ff; >> +exp = f32_bias - f16_bias - shift + 1; > > If FZ16 is set, this should flush to zero. Ho, hum, yes it should. > This means you will have to use both fp_status (for the muladd) and > fp_status_f16 (for this function) and so you should pass cpu_env to > the helpers rather than the fp_status. It's not quite as simple as that, because aa32 mode would pass standard_fp_status. I'll figure something out... r~
Re: [Qemu-devel] [PATCH 1/4] target/arm: Add helpers for FMLAL and FMLSL
Hello, On Thu, Feb 14, 2019 at 5:00 AM Richard Henderson wrote: > > Note that float16_to_float32 rightly squashes SNaN to QNaN. > But of course pickNaNMulAdd, for ARM, selects SNaNs first. > So we have to preserve SNaN long enough for the correct NaN > to be selected. Thus float16_to_float32_by_bits. > > Signed-off-by: Richard Henderson > --- > target/arm/helper.h | 9 +++ > target/arm/vec_helper.c | 154 > 2 files changed, 163 insertions(+) > > diff --git a/target/arm/helper.h b/target/arm/helper.h > index 53a38188c6..0302e13604 100644 > --- a/target/arm/helper.h > +++ b/target/arm/helper.h > @@ -653,6 +653,15 @@ DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG, > DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG, > void, ptr, ptr, ptr, ptr, ptr, i32) > > +DEF_HELPER_FLAGS_5(gvec_fmlal_h, TCG_CALL_NO_RWG, > + void, ptr, ptr, ptr, ptr, i32) > +DEF_HELPER_FLAGS_5(gvec_fmlsl_h, TCG_CALL_NO_RWG, > + void, ptr, ptr, ptr, ptr, i32) > +DEF_HELPER_FLAGS_5(gvec_fmlal_idx_h, TCG_CALL_NO_RWG, > + void, ptr, ptr, ptr, ptr, i32) > +DEF_HELPER_FLAGS_5(gvec_fmlsl_idx_h, TCG_CALL_NO_RWG, > + void, ptr, ptr, ptr, ptr, i32) > + > #ifdef TARGET_AARCH64 > #include "helper-a64.h" > #include "helper-sve.h" > diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c > index 37f338732e..0c3b3de961 100644 > --- a/target/arm/vec_helper.c > +++ b/target/arm/vec_helper.c > @@ -766,3 +766,157 @@ DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) > DO_FMLA_IDX(gvec_fmla_idx_d, float64, ) > > #undef DO_FMLA_IDX > + > +/* > + * Convert float16 to float32, raising no exceptions and > + * preserving exceptional values, including SNaN. > + * This is effectively an unpack+repack operation. 
> + */ > +static float32 float16_to_float32_by_bits(uint32_t f16) > +{ > +const int f16_bias = 15; > +const int f32_bias = 127; > +uint32_t sign = extract32(f16, 15, 1); > +uint32_t exp = extract32(f16, 10, 5); > +uint32_t frac = extract32(f16, 0, 10); > + > +if (exp == 0x1f) { > +/* Inf or NaN */ > +exp = 0xff; > +} else if (exp == 0) { > +/* Zero or denormal. */ > +if (frac != 0) { > +/* > + * Denormal; these are all normal float32. > + * Shift the fraction so that the msb is at bit 11, > + * then remove bit 11 as the implicit bit of the > + * normalized float32. Note that we still go through > + * the shift for normal numbers below, to put the > + * float32 fraction at the right place. > + */ > +int shift = clz32(frac) - 21; > +frac = (frac << shift) & 0x3ff; > +exp = f32_bias - f16_bias - shift + 1; If FZ16 is set, this should flush to zero. This means you will have to use both fp_status (for the muladd) and fp_status_f16 (for this function) and so you should pass cpu_env to the helpers rather than the fp_status. Thanks, Laurent > +} > +} else { > +/* Normal number; adjust the bias. */ > +exp += f32_bias - f16_bias; > +} > +sign <<= 31; > +exp <<= 23; > +frac <<= 23 - 10; > + > +return sign | exp | frac; > +} > + > +static float32 fmlal(float32 a, float16 n16, float16 m16, float_status *fpst) > +{ > +float32 n = float16_to_float32_by_bits(n16); > +float32 m = float16_to_float32_by_bits(m16); > +return float32_muladd(n, m, a, 0, fpst); > +} > + > +static float32 fmlsl(float32 a, float16 n16, float16 m16, float_status *fpst) > +{ > +float32 n = float16_to_float32_by_bits(n16); > +float32 m = float16_to_float32_by_bits(m16); > +return float32_muladd(float32_chs(n), m, a, 0, fpst); > +} > + > +static inline uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) > +{ > +/* > + * Branchless load of u32[0], u64[0], u32[1], or u64[1]. > + * Load the 2nd qword iff is_q & is_2. > + * Shift to the 2nd dword iff !is_q & is_2. 
> + * For !is_q & !is_2, the upper bits of the result are garbage. > + */ > +return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); > +} > + > +/* > + * Note that FMLAL and FMLSL require oprsz == 8 or oprsz == 16, > + * as there are not yet SVE versions that might use blocking. > + */ > + > +void HELPER(gvec_fmlal_h)(void *vd, void *vn, void *vm, > + void *fpst, uint32_t desc) > +{ > +intptr_t i, oprsz = simd_oprsz(desc); > +int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1); > +int is_q = oprsz == 16; > +float32 *d = vd; > +uint64_t n_4, m_4; > + > +/* Pre-load all of the f16 data, avoiding overlap issues. */ > +n_4 = load4_f16(vn, is_q, is_2); > +m_4 = load4_f16(vm, is_q, is_2); > + > +for (i = 0; i < oprsz / 4; i++) { > +d[H4(i)] = fmlal(d[H4(i)], extract64(n_4, i*16, 16), > + extract64(m_4, i*16, 16), fpst); > +