Re: [arm] Improve longlong.h umul_ppmm, count_trailing_zeros

2012-03-20 Thread Richard Earnshaw
On 01/02/12 13:23, Richard Earnshaw wrote:
> On 31/01/12 05:15, Richard Henderson wrote:
>> Despite how trivial this is, I assume this must wait for stage1.
>> Ok?
>>
>>
>> r~
>>
>>
>>   * longlong.h [arm] (umul_ppmm): Use umull.
>>   [arm] (count_trailing_zeros): Use __builtin_ctz.
>>
> armv3m also has the widening multiply operation (it's what the M stands
> for).
>
> Otherwise ok for stage1
 

And it's a good job we did.  I've just noticed that it's broken thumb1
builds of libgcc.

 __ctzsi2:
   0:   b508        push    {r3, lr}
   2:   f7ff fffe   bl      0 <__ctzsi2>
                        2: R_ARM_THM_CALL   __ctzsi2
   6:   bc08        pop     {r3}
   8:   bc02        pop     {r1}
   a:   4708        bx      r1

R.



Re: [arm] Improve longlong.h umul_ppmm, count_trailing_zeros

2012-02-01 Thread Richard Earnshaw
On 31/01/12 05:15, Richard Henderson wrote:
> I noticed this accidentally, while looking for something else.
> There are significant improvements in the DImode multiplication
> and division routines for armv4+.
>
> Despite how trivial this is, I assume this must wait for stage1.
> Ok?
>
>
> r~
>
>
>   * longlong.h [arm] (umul_ppmm): Use umull.
>   [arm] (count_trailing_zeros): Use __builtin_ctz.

armv3m also has the widening multiply operation (it's what the M stands
for).

Otherwise ok for stage1

R.

 
> diff --git a/libgcc/longlong.h b/libgcc/longlong.h
> index 30cc2e3..7204679 100644
> --- a/libgcc/longlong.h
> +++ b/libgcc/longlong.h
> @@ -220,9 +220,12 @@ UDItype __umulsidi3 (USItype, USItype);
>  "rI" ((USItype) (bh)), \
>  "r" ((USItype) (al)),  \
>  "rI" ((USItype) (bl)) __CLOBBER_CC)
> -#define umul_ppmm(xh, xl, a, b) \
> -{register USItype __t0, __t1, __t2;  \
> -  __asm__ ("%@ Inlined umul_ppmm\n"  \
> +# if defined(__ARM_ARCH_2__) || defined(__ARM_ARCH_2A__) \
> + || defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__)
> +#  define umul_ppmm(xh, xl, a, b)  \
> +  do {  \
> +    register USItype __t0, __t1, __t2;  \
> +    __asm__ ("%@ Inlined umul_ppmm\n"  \
>   "mov %2, %5, lsr #16\n"  \
>   "mov %0, %6, lsr #16\n"  \
>   "bic %3, %5, %2, lsl #16\n"  \
> @@ -239,14 +242,26 @@ UDItype __umulsidi3 (USItype, USItype);
>  "=r" ((USItype) (xl)), \
>  "=&r" (__t0), "=&r" (__t1), "=r" (__t2)\
>   : "r" ((USItype) (a)),   \
> -  "r" ((USItype) (b)) __CLOBBER_CC );}
> -#define UMUL_TIME 20
> -#define UDIV_TIME 100
> +  "r" ((USItype) (b)) __CLOBBER_CC );\
> +  } while (0)
> +#  define UMUL_TIME 20
> +# else
> +#  define umul_ppmm(xh, xl, a, b)  \
> +  do {  \
> +    /* Generate umull, under compiler control.  */  \
> +    register UDItype __t0 = (UDItype)(USItype)(a) * (USItype)(b);  \
> +    (xl) = (USItype)__t0;  \
> +    (xh) = (USItype)(__t0 >> 32);  \
> +  } while (0)
> +#  define UMUL_TIME 3
> +# endif
> +# define UDIV_TIME 100
>  #endif /* __arm__ */
>  
>  #if defined(__arm__)
>  /* Let gcc decide how best to implement count_leading_zeros.  */
>  #define count_leading_zeros(COUNT,X) ((COUNT) = __builtin_clz (X))
> +#define count_trailing_zeros(COUNT,X)   ((COUNT) = __builtin_ctz (X))
>  #define COUNT_LEADING_ZEROS_0 32
>  #endif
  
 




[arm] Improve longlong.h umul_ppmm, count_trailing_zeros

2012-01-30 Thread Richard Henderson
I noticed this accidentally, while looking for something else.
There are significant improvements in the DImode multiplication
and division routines for armv4+.

Despite how trivial this is, I assume this must wait for stage1.
Ok?


r~


* longlong.h [arm] (umul_ppmm): Use umull.
[arm] (count_trailing_zeros): Use __builtin_ctz.

diff --git a/libgcc/longlong.h b/libgcc/longlong.h
index 30cc2e3..7204679 100644
--- a/libgcc/longlong.h
+++ b/libgcc/longlong.h
@@ -220,9 +220,12 @@ UDItype __umulsidi3 (USItype, USItype);
 "rI" ((USItype) (bh)), \
 "r" ((USItype) (al)),  \
 "rI" ((USItype) (bl)) __CLOBBER_CC)
-#define umul_ppmm(xh, xl, a, b) \
-{register USItype __t0, __t1, __t2;\
-  __asm__ ("%@ Inlined umul_ppmm\n"\
+# if defined(__ARM_ARCH_2__) || defined(__ARM_ARCH_2A__) \
+ || defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__)
+#  define umul_ppmm(xh, xl, a, b)  \
+  do { \
+register USItype __t0, __t1, __t2; \
+__asm__ ("%@ Inlined umul_ppmm\n"  \
   "mov %2, %5, lsr #16\n"  \
   "mov %0, %6, lsr #16\n"  \
   "bic %3, %5, %2, lsl #16\n"  \
@@ -239,14 +242,26 @@ UDItype __umulsidi3 (USItype, USItype);
 "=r" ((USItype) (xl)), \
 "=&r" (__t0), "=&r" (__t1), "=r" (__t2)\
   : "r" ((USItype) (a)),   \
-"r" ((USItype) (b)) __CLOBBER_CC );}
-#define UMUL_TIME 20
-#define UDIV_TIME 100
+"r" ((USItype) (b)) __CLOBBER_CC );\
+  } while (0)
+#  define UMUL_TIME 20
+# else
+#  define umul_ppmm(xh, xl, a, b)  \
+  do { \
+/* Generate umull, under compiler control.  */ \
+register UDItype __t0 = (UDItype)(USItype)(a) * (USItype)(b);  \
+(xl) = (USItype)__t0;  \
+(xh) = (USItype)(__t0 >> 32);  \
+  } while (0)
+#  define UMUL_TIME 3
+# endif
+# define UDIV_TIME 100
 #endif /* __arm__ */
 
 #if defined(__arm__)
 /* Let gcc decide how best to implement count_leading_zeros.  */
 #define count_leading_zeros(COUNT,X)   ((COUNT) = __builtin_clz (X))
+#define count_trailing_zeros(COUNT,X)   ((COUNT) = __builtin_ctz (X))
 #define COUNT_LEADING_ZEROS_0 32
 #endif