On Mon, 20 Jun 2022 at 19:07, Richard Henderson <richard.hender...@linaro.org> wrote: > > Signed-off-by: Richard Henderson <richard.hender...@linaro.org>
> +void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn, > + void *vpm, void *vst, uint32_t desc) > +{ > + intptr_t row, col, oprsz = simd_maxsz(desc); > + uint32_t neg = simd_data(desc) << 31; > + uint16_t *pn = vpn, *pm = vpm; > + > + bool save_dn = get_default_nan_mode(vst); > + set_default_nan_mode(true, vst); > + > + for (row = 0; row < oprsz; ) { > + uint16_t pa = pn[H2(row >> 4)]; > + do { > + if (pa & 1) { > + void *vza_row = vza + row * sizeof(ARMVectorReg); > + uint32_t n = *(uint32_t *)(vzn + row) ^ neg; > + > + for (col = 0; col < oprsz; ) { > + uint16_t pb = pm[H2(col >> 4)]; > + do { > + if (pb & 1) { > + uint32_t *a = vza_row + col; > + uint32_t *m = vzm + col; > + *a = float32_muladd(n, *m, *a, 0, vst); > + } > + col += 4; > + pb >>= 4; > + } while (col & 15); > + } > + } > + row += 4; > + pa >>= 4; > + } while (row & 15); > + } The code for the double version seems straightforward: row counts from 0 up to the number of rows, and we do something per row. Why is the single precision version doing something with an unrolled loop here? It's confusing that 'oprsz' in the two functions isn't the same thing -- in the double version we divide by the element size, but here we don't. 
> + > + set_default_nan_mode(save_dn, vst); > +} > + > +void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn, > + void *vpm, void *vst, uint32_t desc) > +{ > + intptr_t row, col, oprsz = simd_oprsz(desc) / 8; > + uint64_t neg = (uint64_t)simd_data(desc) << 63; > + uint64_t *za = vza, *zn = vzn, *zm = vzm; > + uint8_t *pn = vpn, *pm = vpm; > + > + bool save_dn = get_default_nan_mode(vst); > + set_default_nan_mode(true, vst); > + > + for (row = 0; row < oprsz; ++row) { > + if (pn[H1(row)] & 1) { > + uint64_t *za_row = &za[row * sizeof(ARMVectorReg)]; > + uint64_t n = zn[row] ^ neg; > + > + for (col = 0; col < oprsz; ++col) { > + if (pm[H1(col)] & 1) { > + uint64_t *a = &za_row[col]; > + *a = float64_muladd(n, zm[col], *a, 0, vst); > + } > + } > + } > + } > + > + set_default_nan_mode(save_dn, vst); > +} The pseudocode says that we ignore floating-point exceptions (i.e. do not accumulate them in the FPSR) -- it passes fpexc == false to FPMulAdd(). Don't we need to do something special to arrange for that? thanks -- PMM