On 31/01/12 05:15, Richard Henderson wrote:
> I noticed this accidentally, while looking for something else.
> There are significant improvements in the DImode multiplication
> and division routines for armv4+.
> 
> Despite how trivial this is, I assume this must wait for stage1.
> Ok?
> 
> 
> r~
> 
> 
>       * longlong.h [arm] (umul_ppmm): Use umull.
>       [arm] (count_trailing_zeros): Use __builtin_ctz.

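ARMv3M also has the widening multiply operation (it's what the M stands
for), so __ARM_ARCH_3M__ should not be in the list of architectures that
fall back to the inline-asm umul_ppmm.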

Otherwise OK for stage1.

R.

> 
> diff --git a/libgcc/longlong.h b/libgcc/longlong.h
> index 30cc2e3..7204679 100644
> --- a/libgcc/longlong.h
> +++ b/libgcc/longlong.h
> @@ -220,9 +220,12 @@ UDItype __umulsidi3 (USItype, USItype);
>            "rI" ((USItype) (bh)),                                     \
>            "r" ((USItype) (al)),                                      \
>            "rI" ((USItype) (bl)) __CLOBBER_CC)
> -#define umul_ppmm(xh, xl, a, b) \
> -{register USItype __t0, __t1, __t2;                                  \
> -  __asm__ ("%@ Inlined umul_ppmm\n"                                  \
> +# if defined(__ARM_ARCH_2__) || defined(__ARM_ARCH_2A__) \
> +     || defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__)
> +#  define umul_ppmm(xh, xl, a, b)                                    \
> +  do {                                                               \
> +    register USItype __t0, __t1, __t2;                               \
> +    __asm__ ("%@ Inlined umul_ppmm\n"                                \
>          "    mov     %2, %5, lsr #16\n"                              \
>          "    mov     %0, %6, lsr #16\n"                              \
>          "    bic     %3, %5, %2, lsl #16\n"                          \
> @@ -239,14 +242,26 @@ UDItype __umulsidi3 (USItype, USItype);
>            "=r" ((USItype) (xl)),                                     \
>            "=&r" (__t0), "=&r" (__t1), "=r" (__t2)                    \
>          : "r" ((USItype) (a)),                                       \
> -          "r" ((USItype) (b)) __CLOBBER_CC );}
> -#define UMUL_TIME 20
> -#define UDIV_TIME 100
> +          "r" ((USItype) (b)) __CLOBBER_CC );                        \
> +  } while (0)
> +#  define UMUL_TIME 20
> +# else
> +#  define umul_ppmm(xh, xl, a, b)                                    \
> +  do {                                                               \
> +    /* Generate umull, under compiler control.  */                   \
> +    register UDItype __t0 = (UDItype)(USItype)(a) * (USItype)(b);    \
> +    (xl) = (USItype)__t0;                                            \
> +    (xh) = (USItype)(__t0 >> 32);                                    \
> +  } while (0)
> +#  define UMUL_TIME 3
> +# endif
> +# define UDIV_TIME 100
>  #endif /* __arm__ */
>  
>  #if defined(__arm__)
>  /* Let gcc decide how best to implement count_leading_zeros.  */
>  #define count_leading_zeros(COUNT,X) ((COUNT) = __builtin_clz (X))
> +#define count_trailing_zeros(COUNT,X)   ((COUNT) = __builtin_ctz (X))
>  #define COUNT_LEADING_ZEROS_0 32
>  #endif
>  
> 


Reply via email to