On 10/13/2017 09:24 AM, Alex Bennée wrote: > +float16 float16_rem(float16 a, float16 b, float_status *status) > +{ > + flag aSign, zSign; > + int aExp, bExp, expDiff; > + uint32_t aSig, bSig; > + uint32_t q; > + uint64_t aSig64, bSig64, q64; > + uint32_t alternateASig; > + int32_t sigMean; > + a = float16_squash_input_denormal(a, status); > + b = float16_squash_input_denormal(b, status); > + > + aSig = extractFloat32Frac( a ); > + aExp = extractFloat32Exp( a ); > + aSign = extractFloat32Sign( a ); > + bSig = extractFloat32Frac( b ); > + bExp = extractFloat32Exp( b ); > + if ( aExp == 0xFF ) { > + if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { > + return propagateFloat32NaN(a, b, status); > + } > + float_raise(float_flag_invalid, status); > + return float16_default_nan(status); > + } > + if ( bExp == 0xFF ) { > + if (bSig) { > + return propagateFloat32NaN(a, b, status); > + } > + return a; > + }
s/0xff/0x1f/. > + if ( bExp == 0 ) { > + if ( bSig == 0 ) { > + float_raise(float_flag_invalid, status); > + return float16_default_nan(status); > + } > + normalizeFloat32Subnormal( bSig, &bExp, &bSig ); > + } > + if ( aExp == 0 ) { > + if ( aSig == 0 ) return a; > + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); > + } > + expDiff = aExp - bExp; > + aSig |= 0x00800000; > + bSig |= 0x00800000; These implicit bits are set for float32. > + if ( expDiff < 32 ) { > + aSig <<= 8; > + bSig <<= 8; This algorithm isn't going to work unless the fractions are normalized to 0b1xxx_xxxx_xxx0_0000, just like for float32. Indeed, I think you should actually share code. > + return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); ... but you really will have to pack into the correct format. > + if (((aExp == 0xff) && aSig) || > + ((bExp == 0xff) && bSig) || > + ((cExp == 0xff) && cSig)) { 0x1f, lots more times. > + /* Calculate the actual result a * b + c */ > + > + /* Multiply first; this is easy. */ > + /* NB: we subtract 0x7e where float16_mul() subtracts 0x7f > + * because we want the true exponent, not the "one-less-than" > + * flavour that roundAndPackFloat16() takes. > + */ > + pExp = aExp + bExp - 0x7e; > + aSig = (aSig | 0x00800000) << 7; > + bSig = (bSig | 0x00800000) << 8; All of these constants are for float32, not float16. r~