rs6000: inline ldouble __gcc_qsub While performing some tests of IEEE 128 float for PPC64LE, Michael Meissner noticed that __gcc_qsub is substantially slower than __gcc_qadd. __gcc_qsub calls __gcc_qadd with the second operand negated. Because the functions normally are invoked through the libgcc shared object, the extra PLT overhead has a large impact on the overall time of the function. Instead of trying to be fancy with function decorations to prevent interposition, this patch inlines the definition of __gcc_qadd into __gcc_qsub with the negation propagated through the function.
libgcc/ChangeLog: * config/rs6000/ibm-ldouble.c (__gcc_qsub): Inline negated __gcc_qadd. diff --git a/libgcc/config/rs6000/ibm-ldouble.c b/libgcc/config/rs6000/ibm-ldouble.c index 4c13453f975..ed74900e5c3 100644 --- a/libgcc/config/rs6000/ibm-ldouble.c +++ b/libgcc/config/rs6000/ibm-ldouble.c @@ -158,9 +158,42 @@ __gcc_qadd (double a, double aa, double c, double cc) } IBM128_TYPE -__gcc_qsub (double a, double b, double c, double d) +__gcc_qsub (double a, double aa, double c, double cc) { - return __gcc_qadd (a, b, -c, -d); + double xh, xl, z, q, zz; + + z = a - c; + + if (nonfinite (z)) + { + if (fabs (z) != inf()) + return z; + z = -cc + aa - c + a; + if (nonfinite (z)) + return z; + xh = z; /* Will always be DBL_MAX. */ + zz = aa - cc; + if (fabs(a) > fabs(c)) + xl = a - z - c + zz; + else + xl = -c - z + a + zz; + } + else + { + q = a - z; + zz = q - c + (a - (q + z)) + aa - cc; + + /* Keep -0 result. */ + if (zz == 0.0) + return z; + + xh = z + zz; + if (nonfinite (xh)) + return xh; + + xl = z - xh + zz; + } + return pack_ldouble (xh, xl); } #ifdef __NO_FPRS__