Re: [arm] Improve longlong.h umul_ppmm, count_trailing_zeros
On 01/02/12 13:23, Richard Earnshaw wrote: On 31/01/12 05:15, Richard Henderson wrote: Despite how trivial this is, I assume this must wait for stage1. Ok? r~ * longlong.h [arm] (umul_ppmm): Use umull. [arm] (count_trailing_zeros): Use __builtin_ctz. armv3m also has the widening multiply operation (it's what the M stands for). Otherwise ok for stage1. And it's a good job we did. I've just noticed that it's broken thumb1 builds of libgcc. __ctzsi2: 0: b508 push {r3, lr} 2: f7ff fffe bl 0 <__ctzsi2> 2: R_ARM_THM_CALL __ctzsi2 6: bc08 pop {r3} 8: bc02 pop {r1} a: 4708 bx r1 R.
Re: [arm] Improve longlong.h umul_ppmm, count_trailing_zeros
On 31/01/12 05:15, Richard Henderson wrote: I noticed this accidentally, while looking for something else. There are significant improvements in the DImode multiplication and division routines for armv4+. Despite how trivial this is, I assume this must wait for stage1. Ok? r~ * longlong.h [arm] (umul_ppmm): Use umull. [arm] (count_trailing_zeros): Use __builtin_ctz. armv3m also has the widening multiply operation (it's what the M stands for). Otherwise ok for stage1. R. diff --git a/libgcc/longlong.h b/libgcc/longlong.h index 30cc2e3..7204679 100644 --- a/libgcc/longlong.h +++ b/libgcc/longlong.h @@ -220,9 +220,12 @@ UDItype __umulsidi3 (USItype, USItype); rI ((USItype) (bh)), \ r ((USItype) (al)), \ rI ((USItype) (bl)) __CLOBBER_CC) -#define umul_ppmm(xh, xl, a, b) \ -{register USItype __t0, __t1, __t2; \ - __asm__ (%@ Inlined umul_ppmm\n \ +# if defined(__ARM_ARCH_2__) || defined(__ARM_ARCH_2A__) \ + || defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__) +# define umul_ppmm(xh, xl, a, b)\ + do { \ +register USItype __t0, __t1, __t2; \ +__asm__ (%@ Inlined umul_ppmm\n \ mov %2, %5, lsr #16\n \ mov %0, %6, lsr #16\n \ bic %3, %5, %2, lsl #16\n \ @@ -239,14 +242,26 @@ UDItype __umulsidi3 (USItype, USItype); =r ((USItype) (xl)), \ =r (__t0), =r (__t1), =r (__t2)\ : r ((USItype) (a)), \ - r ((USItype) (b)) __CLOBBER_CC );} -#define UMUL_TIME 20 -#define UDIV_TIME 100 + r ((USItype) (b)) __CLOBBER_CC );\ + } while (0) +# define UMUL_TIME 20 +# else +# define umul_ppmm(xh, xl, a, b)\ + do { \ +/* Generate umull, under compiler control. */ \ +register UDItype __t0 = (UDItype)(USItype)(a) * (USItype)(b);\ +(xl) = (USItype)__t0;\ +(xh) = (USItype)(__t0 >> 32);\ + } while (0) +# define UMUL_TIME 3 +# endif +# define UDIV_TIME 100 #endif /* __arm__ */ #if defined(__arm__) /* Let gcc decide how best to implement count_leading_zeros. 
*/ #define count_leading_zeros(COUNT,X) ((COUNT) = __builtin_clz (X)) +#define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctz (X)) #define COUNT_LEADING_ZEROS_0 32 #endif
[arm] Improve longlong.h umul_ppmm, count_trailing_zeros
I noticed this accidentally, while looking for something else. There are significant improvements in the DImode multiplication and division routines for armv4+. Despite how trivial this is, I assume this must wait for stage1. Ok? r~ * longlong.h [arm] (umul_ppmm): Use umull. [arm] (count_trailing_zeros): Use __builtin_ctz. diff --git a/libgcc/longlong.h b/libgcc/longlong.h index 30cc2e3..7204679 100644 --- a/libgcc/longlong.h +++ b/libgcc/longlong.h @@ -220,9 +220,12 @@ UDItype __umulsidi3 (USItype, USItype); rI ((USItype) (bh)), \ r ((USItype) (al)), \ rI ((USItype) (bl)) __CLOBBER_CC) -#define umul_ppmm(xh, xl, a, b) \ -{register USItype __t0, __t1, __t2;\ - __asm__ (%@ Inlined umul_ppmm\n\ +# if defined(__ARM_ARCH_2__) || defined(__ARM_ARCH_2A__) \ + || defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__) +# define umul_ppmm(xh, xl, a, b) \ + do { \ +register USItype __t0, __t1, __t2; \ +__asm__ (%@ Inlined umul_ppmm\n \ mov %2, %5, lsr #16\n \ mov %0, %6, lsr #16\n \ bic %3, %5, %2, lsl #16\n \ @@ -239,14 +242,26 @@ UDItype __umulsidi3 (USItype, USItype); =r ((USItype) (xl)), \ =r (__t0), =r (__t1), =r (__t2)\ : r ((USItype) (a)), \ -r ((USItype) (b)) __CLOBBER_CC );} -#define UMUL_TIME 20 -#define UDIV_TIME 100 +r ((USItype) (b)) __CLOBBER_CC );\ + } while (0) +# define UMUL_TIME 20 +# else +# define umul_ppmm(xh, xl, a, b) \ + do { \ +/* Generate umull, under compiler control. */ \ +register UDItype __t0 = (UDItype)(USItype)(a) * (USItype)(b); \ +(xl) = (USItype)__t0; \ +(xh) = (USItype)(__t0 >> 32); \ + } while (0) +# define UMUL_TIME 3 +# endif +# define UDIV_TIME 100 #endif /* __arm__ */ #if defined(__arm__) /* Let gcc decide how best to implement count_leading_zeros. */ #define count_leading_zeros(COUNT,X) ((COUNT) = __builtin_clz (X)) +#define count_trailing_zeros(COUNT,X) ((COUNT) = __builtin_ctz (X)) #define COUNT_LEADING_ZEROS_0 32 #endif