+CC Claudiu: ARC gcc expert (please scroll to bottom)

On Friday 30 October 2015 06:56 AM, Nicolas Pitre wrote:
> On Wed, 28 Oct 2015, Nicolas Pitre wrote:
>
>> On Thu, 29 Oct 2015, Alexey Brodkin wrote:
>>
>>> Fortunately we already have a much better __div64_32() for 32-bit ARM.
>>> There, in case of division by a constant, the compiler computes a
>>> so-called "magic number" which is then used in multiplications instead
>>> of divisions.
>> It's not magic, it is science.  :-)
>>
>>> It's really nice and nearly optimal, but it obviously works only for ARM
>>> because ARM assembly is involved.
>>>
>>> Now why don't we extend the same approach to all other 32-bit arches,
>>> with the multiplication part implemented in pure C? With a good compiler
>>> the resulting assembly will be quite close to manually written assembly.
> Well... not quite as close, at least on ARM.  Maybe 2x to 3x more costly 
> than the assembly version.  Still much better than the 100x or so without 
> this optimization.
>
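
For anyone unfamiliar with the trick: division by a compile-time constant
b is replaced by multiplication with a precomputed reciprocal m =
ceil(2^s / b) followed by a right shift by s. A minimal standalone 32-bit
sketch, not taken from the patch (0xCCCCCCCD is the well-known reciprocal
for b = 10 with s = 35):

    #include <stdint.h>

    /* n / 10 without a division instruction:
     * 0xCCCCCCCD == ceil(2^35 / 10), so for every 32-bit n
     * (n * 0xCCCCCCCD) >> 35 == n / 10.
     */
    static inline uint32_t div10(uint32_t n)
    {
            return (uint32_t)(((uint64_t)n * 0xCCCCCCCDu) >> 35);
    }

The 64-bit case discussed below is the same idea, except that m no longer
fits in a machine word, so the multiplication has to be decomposed into
32-bit cross products; that is where most of the complexity comes from.
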
>>> But there's at least one problem which I don't know how to solve.
>>> The magic only happens if __div64_32() is inlined (that's
>>> obvious - the compiler has to see whether the divisor is constant or not).
>>>
>>> But __div64_32() is already marked as a weak function (which in turn
>>> is required to allow some architectures to provide their own optimal
>>> implementations). I.e. adding "inline" to __div64_32() is not an
>>> option.
>> You can't inline __div64_32().  It should remain as is and used only for 
>> the slow path.
>>
>> For the constant based optimization to work, you need to modify do_div() 
>> in include/asm-generic/div64.h directly.
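
Side note on why this has to live in the do_div() macro itself:
__builtin_constant_p() only folds to 1 when the compiler can see the
constant at the expansion site, which never happens inside an out-of-line
weak function. A minimal sketch of the dispatch shape, with made-up names
(fast_div32/slow_div32 are not kernel symbols):

    #include <stdint.h>

    uint32_t slow_div32(uint32_t n, uint32_t b); /* out-of-line fallback */

    /* When b is a compile-time constant the first branch is selected and
     * constant-folded away; otherwise the out-of-line call survives. */
    #define fast_div32(n, b) \
            (__builtin_constant_p(b) ? (n) / (b) : slow_div32((n), (b)))
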
> OK... I was intrigued, so I adapted my ARM code to the generic case, 
> including the overflow avoidance optimizations.  Please have a look and 
> tell me how this works for you.
>
> If this patch is accepted upstream, then it could be possible to 
> abstract only the actual multiplication part with some 
> architecture-specific assembly.
>
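One possible shape for that abstraction: funnel the cross multiplication
through a single helper that architectures may override. The name
arch_xprod_64 is made up here; this is just the plain C fallback as a
sketch:

    #include <stdint.h>

    /* High 64 bits of the 64x64 -> 128-bit product m * n, built from
     * four 32x32 -> 64 partial products with carry propagation.  An
     * arch with a fast widening multiply could supply its own copy. */
    #ifndef arch_xprod_64
    static inline uint64_t arch_xprod_64(uint64_t m, uint64_t n)
    {
            uint32_t m_lo = m, m_hi = m >> 32;
            uint32_t n_lo = n, n_hi = n >> 32;
            uint64_t x = (uint64_t)m_lo * n_lo;
            uint64_t y = (uint64_t)m_lo * n_hi + (x >> 32);
            uint64_t z = (uint64_t)m_hi * n_lo + (uint32_t)y;

            return (uint64_t)m_hi * n_hi + (y >> 32) + (z >> 32);
    }
    #endif
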
> diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h
> index 8f4e319334..43c3b21dca 100644
> --- a/include/asm-generic/div64.h
> +++ b/include/asm-generic/div64.h
> @@ -32,6 +32,149 @@
>  
>  #elif BITS_PER_LONG == 32
>  
> +/* macroized fls implementation to ensure proper constant propagation */
> +#define __div64_fls(bits)                                            \
> +({                                                           \
> +     unsigned int __left = (bits), __nr = 0;                 \
> +     if (__left & 0xffff0000) __nr += 16, __left >>= 16;     \
> +     if (__left & 0x0000ff00) __nr +=  8, __left >>=  8;     \
> +     if (__left & 0x000000f0) __nr +=  4, __left >>=  4;     \
> +     if (__left & 0x0000000c) __nr +=  2, __left >>=  2;     \
> +     if (__left & 0x00000002) __nr +=  1;                    \
> +     __nr;                                                   \
> +})
> +
> +/*
> + * If the divisor happens to be constant, we determine the appropriate
> + * inverse at compile time to turn the division into a few inline
> + * multiplications which ought to be much faster. And yet only if compiling
> + * with a sufficiently recent gcc version to perform proper 64-bit constant
> + * propagation.
> + *
> + * (It is unfortunate that gcc doesn't perform all this internally.)
> + */
> +#define __div64_const32(n, ___b)                                     \
> +({                                                                   \
> +     /*                                                              \
> +      * Multiplication by reciprocal of b: n / b = n * (p / b) / p   \
> +      *                                                              \
> +      * We rely on the fact that most of this code gets optimized    \
> +      * away at compile time due to constant propagation and only    \
> +      * a few multiplication instructions should remain.             \
> +      * Hence this monstrous macro (static inline doesn't always     \
> +      * do the trick for some reason).                               \
> +      */                                                             \
> +     uint64_t ___res, ___x, ___t, ___m, ___n = (n);                  \
> +     uint32_t ___c, ___p, ___m_lo, ___m_hi, ___n_lo, ___n_hi;        \
> +                                                                     \
> +     /* determine number of bits to represent b */                   \
> +     ___p = 1 << __div64_fls(___b);                                  \
> +                                                                     \
> +     /* compute m = ((p << 64) + b - 1) / b */                       \
> +     ___m = (~0ULL / ___b) * ___p;                                   \
> +     ___m += (((~0ULL % ___b + 1) * ___p) + ___b - 1) / ___b;        \
> +                                                                     \
> +     /* dividend that produces one less than the highest result */   \
> +     ___x = ~0ULL / ___b * ___b - 1;                                 \
> +                                                                     \
> +     /* test our m with res = m * x / (p << 64) */                   \
> +     ___res = ((___m & 0xffffffff) * (___x & 0xffffffff)) >> 32;     \
> +     ___t = ___res += (___m & 0xffffffff) * (___x >> 32);            \
> +     ___res += (___x & 0xffffffff) * (___m >> 32);                   \
> +     ___t = (___res < ___t) ? (1ULL << 32) : 0;                      \
> +     ___res = (___res >> 32) + ___t;                                 \
> +     ___res += (___m >> 32) * (___x >> 32);                          \
> +     ___res /= ___p;                                                 \
> +                                                                     \
> +     /* Now sanitize and optimize what we've got. */                 \
> +     if (~0ULL % (___b / (___b & -___b)) == 0) {                     \
> +             /* those cases can be simplified with: */               \
> +             ___n /= (___b & -___b);                                 \
> +             ___m = ~0ULL / (___b / (___b & -___b));                 \
> +             ___p = 1;                                               \
> +             ___c = 1;                                               \
> +     } else if (___res != ___x / ___b) {                             \
> +             /*                                                      \
> +              * We can't get away without a correction to compensate \
> +              * for bit truncation errors.  To avoid it we'd need an \
> +              * additional bit to represent m which would overflow   \
> +              * our 64-bit variable.                                 \
> +              *                                                      \
> +              * Instead we do m = p / b and n / b = (n * m + m) / p. \
> +              */                                                     \
> +             ___c = 1;                                               \
> +             /* Compute m = (p << 64) / b */                         \
> +             ___m = (~0ULL / ___b) * ___p;                           \
> +             ___m += ((~0ULL % ___b + 1) * ___p) / ___b;             \
> +     } else {                                                        \
> +             /*                                                      \
> +              * Reduce m / p, and try to clear bit 31 of m when      \
> +              * possible, otherwise that'll need extra overflow      \
> +              * handling later.                                      \
> +              */                                                     \
> +             unsigned int ___bits = -(___m & -___m);                 \
> +             ___bits |= ___m >> 32;                                  \
> +             ___bits = (~___bits) << 1;                              \
> +             /*                                                      \
> +              * If ___bits == 0 then setting bit 31 is  unavoidable. \
> +              * Simply apply the maximum possible reduction in that  \
> +              * case. Otherwise the MSB of ___bits indicates the     \
> +              * best reduction we should apply.                      \
> +              */                                                     \
> +             if (!___bits) {                                         \
> +                     ___p /= (___m & -___m);                         \
> +                     ___m /= (___m & -___m);                         \
> +             } else {                                                \
> +                     ___p >>= __div64_fls(___bits);                  \
> +                     ___m >>= __div64_fls(___bits);                  \
> +             }                                                       \
> +             /* No correction needed. */                             \
> +             ___c = 0;                                               \
> +     }                                                               \
> +                                                                     \
> +     /*                                                              \
> +      * Now we have a combination of 2 conditions:                   \
> +      *                                                              \
> +      * 1) whether or not we need a correction (___c), and           \
> +      *                                                              \
> +      * 2) whether or not there might be an overflow in the cross    \
> +      *    product determined by (___m & ((1 << 63) | (1 << 31))).   \
> +      *                                                              \
> +      * Select the best way to do the m * n / (p << 64) operation.   \
> +      * From here on there will be actual runtime code generated.    \
> +      */                                                             \
> +                                                                     \
> +     ___m_lo = ___m;                                                 \
> +     ___m_hi = ___m >> 32;                                           \
> +     ___n_lo = ___n;                                                 \
> +     ___n_hi = ___n >> 32;                                           \
> +                                                                     \
> +     if (!___c) {                                                    \
> +             ___res = ((uint64_t)___m_lo * ___n_lo) >> 32;           \
> +     } else if (!(___m & ((1ULL << 63) | (1ULL << 31)))) {           \
> +             ___res = (___m + (uint64_t)___m_lo * ___n_lo) >> 32;    \
> +     } else {                                                        \
> +             ___res = ___m + (uint64_t)___m_lo * ___n_lo;            \
> +             ___t = (___res < ___m) ? (1ULL << 32) : 0;              \
> +             ___res = (___res >> 32) + ___t;                         \
> +     }                                                               \
> +                                                                     \
> +     if (!(___m & ((1ULL << 63) | (1ULL << 31)))) {                  \
> +             ___res += (uint64_t)___m_lo * ___n_hi;                  \
> +             ___res += (uint64_t)___m_hi * ___n_lo;                  \
> +             ___res >>= 32;                                          \
> +     } else {                                                        \
> +             ___t = ___res += (uint64_t)___m_lo * ___n_hi;           \
> +             ___res += (uint64_t)___m_hi * ___n_lo;                  \
> +             ___t = (___res < ___t) ? (1ULL << 32) : 0;              \
> +             ___res = (___res >> 32) + ___t;                         \
> +     }                                                               \
> +                                                                     \
> +     ___res += (uint64_t)___m_hi * ___n_hi;                          \
> +                                                                     \
> +     ___res / ___p;                                                  \
> +})
> +
>  extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
>  
>  /* The unnecessary pointer compare is there
> @@ -41,7 +184,20 @@ extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);
>       uint32_t __base = (base);                       \
>       uint32_t __rem;                                 \
>       (void)(((typeof((n)) *)0) == ((uint64_t *)0));  \
> -     if (likely(((n) >> 32) == 0)) {                 \
> +     if (__builtin_constant_p(__base) &&             \
> +         (__base & (__base - 1)) == 0) {             \
> +             /* constant power of 2: gcc is fine */  \
> +             __rem = (n) & (__base - 1);             \
> +             (n) /= __base;                          \
> +     } else if ((__GNUC__ >= 4) &&                   \
> +         __builtin_constant_p(__base) &&             \
> +         __base != 0) {                              \
> +             uint32_t __res_lo, __n_lo = (n);        \
> +             (n) = __div64_const32(n, __base);       \
> +             /* the remainder can be computed with 32-bit regs */ \
> +             __res_lo = (n);                         \
> +             __rem = __n_lo - __res_lo * __base;     \
> +     } else if (likely(((n) >> 32) == 0)) {          \
>               __rem = (uint32_t)(n) % __base;         \
>               (n) = (uint32_t)(n) / __base;           \
>       } else                                          \
>
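
To make the constant machinery concrete, tracing the macro for
__base = 10: __div64_fls(10) is 3, so ___p = 8 and the initial ___m is
ceil((8 << 64) / 10) = 0xCCCCCCCCCCCCCCCD. Since 10 = 2 * 5 and ~0ULL is
divisible by 5, the first "sanitize" branch fires: the dividend gets
pre-divided by 2 and ___m collapses to ~0ULL / 5 = 0x3333333333333333
with ___p = 1 and the correction term enabled. A standalone model of the
code that remains after constant folding (my own reconstruction for
illustration, not the patch code):

    #include <assert.h>
    #include <stdint.h>

    /* n / 10 on a 32-bit machine: shift out the factor of 2, then
     * compute (m * n' + m) >> 64 with m = ~0ULL / 5 via 32x32 -> 64
     * partial products.  Bits 31 and 63 of m are clear, so none of the
     * intermediate sums can overflow and no carry handling is needed. */
    static uint64_t div64_by_10(uint64_t n)
    {
            const uint64_t m = 0x3333333333333333ULL;   /* ~0ULL / 5 */
            uint32_t m_lo = m, m_hi = m >> 32, n_lo, n_hi;
            uint64_t res;

            n >>= 1;                    /* divide out the factor of 2 */
            n_lo = n;
            n_hi = n >> 32;

            res = (m + (uint64_t)m_lo * n_lo) >> 32;  /* correction folded in */
            res += (uint64_t)m_lo * n_hi;
            res += (uint64_t)m_hi * n_lo;
            res >>= 32;
            res += (uint64_t)m_hi * n_hi;
            return res;
    }

    int main(void)
    {
            assert(div64_by_10(12345678901234567ULL) == 1234567890123456ULL);
            assert(div64_by_10(~0ULL) == ~0ULL / 10);
            return 0;
    }

The remainder then follows with 32-bit arithmetic only, exactly as in the
patched do_div(): rem = n_lo - (uint32_t)result * 10.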

Claudiu, can some of this not be done in gcc itself?

-Vineet