rs6000: inline ldouble __gcc_qsub

    While performing some tests of IEEE 128 float for PPC64LE, Michael
    Meissner noticed that __gcc_qsub is substantially slower than
    __gcc_qadd.  __gcc_qsub calls __gcc_qadd with the second operand
    negated.  Because the functions normally are invoked through the
    libgcc shared object, the extra PLT overhead has a large impact
    on the overall time of the function.  Instead of trying to be
    fancy with function decorations to prevent interposition, this
    patch inlines the definition of __gcc_qadd into __gcc_qsub with
    the negation propagated through the function.
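
    For reference, the pre-patch definition (removed in the hunk
    below) is a one-line forwarding call, and it is this call that
    goes through the PLT on every invocation when libgcc is used as
    a shared object:

        IBM128_TYPE
        __gcc_qsub (double a, double b, double c, double d)
        {
          return __gcc_qadd (a, b, -c, -d);
        }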

    libgcc/ChangeLog:

            * config/rs6000/ibm-ldouble.c (__gcc_qsub): Inline negated
            __gcc_qadd.

diff --git a/libgcc/config/rs6000/ibm-ldouble.c b/libgcc/config/rs6000/ibm-ldouble.c
index 4c13453f975..ed74900e5c3 100644
--- a/libgcc/config/rs6000/ibm-ldouble.c
+++ b/libgcc/config/rs6000/ibm-ldouble.c
@@ -158,9 +158,42 @@ __gcc_qadd (double a, double aa, double c, double cc)
 }

 IBM128_TYPE
-__gcc_qsub (double a, double b, double c, double d)
+__gcc_qsub (double a, double aa, double c, double cc)
 {
-  return __gcc_qadd (a, b, -c, -d);
+  double xh, xl, z, q, zz;
+
+  z = a - c;
+
+  if (nonfinite (z))
+    {
+      if (fabs (z) != inf ())
+       return z;
+      z = -cc + aa - c + a;
+      if (nonfinite (z))
+       return z;
+      xh = z;  /* Will always be DBL_MAX.  */
+      zz = aa - cc;
+      if (fabs (a) > fabs (c))
+       xl = a - z - c + zz;
+      else
+       xl = -c - z + a + zz;
+    }
+  else
+    {
+      q = a - z;
+      zz = q - c + (a - (q + z)) + aa - cc;
+
+      /* Keep -0 result.  */
+      if (zz == 0.0)
+       return z;
+
+      xh = z + zz;
+      if (nonfinite (xh))
+       return xh;
+
+      xl = z - xh + zz;
+    }
+  return pack_ldouble (xh, xl);
 }

 #ifdef __NO_FPRS__

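As a side note on what the non-exceptional path is doing: z = a - c is
the rounded high part, and zz recovers the rounding error of that
subtraction (plus the low halves aa and cc) in the style of a two-sum
error-free transformation.  A minimal standalone sketch of that
recovery step, not part of the patch, written for a generic sum x + y:

#include <stdio.h>

/* Compute hi = fl(x + y) and lo carrying the rounding error,
   mirroring the patch's q = a - z and q - c + (a - (q + z))
   with y = -c.  Illustration only; not part of the patch.  */
static void
two_sum (double x, double y, double *hi, double *lo)
{
  double z = x + y;             /* rounded sum; z = a - c in the patch  */
  double q = x - z;             /* q = a - z in the patch  */
  *lo = q + y + (x - (q + z));  /* q - c + (a - (q + z)) in the patch  */
  *hi = z;
}

int
main (void)
{
  double hi, lo;
  two_sum (1.0, 1e-20, &hi, &lo);
  /* hi is 1.0; lo carries the 1e-20 that did not fit into hi.  */
  printf ("hi = %g, lo = %g\n", hi, lo);
  return 0;
}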