Re: [PATCH] sched/cputime: make scale_stime() more precise

Peter Zijlstra Fri, 19 Jul 2019 04:04:55 -0700

On Thu, Jul 18, 2019 at 03:18:34PM +0200, Oleg Nesterov wrote:
> People report that utime and stime from /proc/<pid>/stat become very wrong
> when the numbers are big enough. In particular, the monitored application
> can run all the time in user-space but only stime grows.
> 
> This is because scale_stime() is very inaccurate. It tries to minimize the
> relative error, but the absolute error can be huge.
> 
> Andrew wrote the test-case:
> 
>       int main(int argc, char **argv)
>       {
>           struct task_cputime c;
>           struct prev_cputime p;
>           u64 st, pst, cst;
>           u64 ut, put, cut;
>           u64 x;
>           int i = -1; // one step not printed
> 
>           if (argc != 2)
>           {
>               printf("usage: %s <start_in_seconds>\n", argv[0]);
>               return 1;
>           }
>           x = strtoull(argv[1], NULL, 0) * SEC;
>           printf("start=%lld\n", x);
> 
>           p.stime = 0;
>           p.utime = 0;
> 
>           while (i++ < NSTEPS)
>           {
>               x += STEP;
>               c.stime = x;
>               c.utime = x;
>               c.sum_exec_runtime = x + x;
>               pst = cputime_to_clock_t(p.stime);
>               put = cputime_to_clock_t(p.utime);
>               cputime_adjust(&c, &p, &ut, &st);
>               cst = cputime_to_clock_t(st);
>               cut = cputime_to_clock_t(ut);
>               if (i)
>                   printf("ut(diff)/st(diff): %20lld (%4lld)  %20lld 
> (%4lld)\n",
>                       cut, cut - put, cst, cst - pst);
>           }
>       }
> 
> For example,
> 
>       $ ./stime 300000
>       start=300000000000000
>       ut(diff)/st(diff):            299994875 (   0)             300009124 
> (2000)
>       ut(diff)/st(diff):            299994875 (   0)             300011124 
> (2000)
>       ut(diff)/st(diff):            299994875 (   0)             300013124 
> (2000)
>       ut(diff)/st(diff):            299994875 (   0)             300015124 
> (2000)
>       ut(diff)/st(diff):            299994875 (   0)             300017124 
> (2000)
>       ut(diff)/st(diff):            299994875 (   0)             300019124 
> (2000)
>       ut(diff)/st(diff):            299994875 (   0)             300021124 
> (2000)
>       ut(diff)/st(diff):            299994875 (   0)             300023124 
> (2000)
>       ut(diff)/st(diff):            299994875 (   0)             300025124 
> (2000)
>       ut(diff)/st(diff):            299994875 (   0)             300027124 
> (2000)
>       ut(diff)/st(diff):            299994875 (   0)             300029124 
> (2000)
>       ut(diff)/st(diff):            299996875 (2000)             300029124 (  
>  0)
>       ut(diff)/st(diff):            299998875 (2000)             300029124 (  
>  0)
>       ut(diff)/st(diff):            300000875 (2000)             300029124 (  
>  0)
>       ut(diff)/st(diff):            300002875 (2000)             300029124 (  
>  0)
>       ut(diff)/st(diff):            300004875 (2000)             300029124 (  
>  0)
>       ut(diff)/st(diff):            300006875 (2000)             300029124 (  
>  0)
>       ut(diff)/st(diff):            300008875 (2000)             300029124 (  
>  0)
>       ut(diff)/st(diff):            300010875 (2000)             300029124 (  
>  0)
>       ut(diff)/st(diff):            300012055 (1180)             300029944 ( 
> 820)
>       ut(diff)/st(diff):            300012055 (   0)             300031944 
> (2000)
>       ut(diff)/st(diff):            300012055 (   0)             300033944 
> (2000)
>       ut(diff)/st(diff):            300012055 (   0)             300035944 
> (2000)
>       ut(diff)/st(diff):            300012055 (   0)             300037944 
> (2000)
> 
> shows the problem even when sum_exec_runtime is not that big: 300000 secs.
> 
> The new implementation of scale_stime() does the additional div64_u64_rem()
> in a loop but see the comment, as long it is used by cputime_adjust() this
> can happen only once.


That only shows something after long long staring :/ There's no words on
what the output actually means or what would've been expected.

Also, your example is incomplete; the below is a test for scale_stime();
from this we can see that the division results in too large a number,
but, important for our use-case in cputime_adjust(), it is a step
function (due to loss in precision) and for every plateau we shift
runtime into the wrong bucket.

Your proposed function works; but is atrocious, esp. on 32bit. That
said, before we 'fixed' it, it had similar horrible divisions in, see
commit 55eaa7c1f511 ("sched: Avoid cputime scaling overflow").

Included below is also an x86_64 implementation in 2 instructions.

I'm still trying see if there's anything saner we can do...

---
#include <stdio.h>
#include <stdlib.h>

#define   noinline                      __attribute__((__noinline__))

typedef unsigned long long u64;
typedef unsigned int u32;

static noinline u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
{
        u64 q;
        asm ("mulq %2; divq %3" : "=a" (q) : "a" (a), "rm" (b), "rm" (c) : 
"rdx");
        return q;
}

static u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder);

static inline u64 div_u64(u64 dividend, u32 divisor)
{
        u32 remainder;
        return div_u64_rem(dividend, divisor, &remainder);
}

static __always_inline int fls(unsigned int x)
{
        return x ? sizeof(x) * 8 - __builtin_clz(x) : 0;
}

#if 0
static u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
        union {
                u64 v64;
                u32 v32[2];
        } d = { dividend };
        u32 upper;

        upper = d.v32[1];
        d.v32[1] = 0;
        if (upper >= divisor) {
                d.v32[1] = upper / divisor;
                upper %= divisor;
        }
        asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) :
                "rm" (divisor), "0" (d.v32[0]), "1" (upper));
        return d.v64;
}
static u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
        u32 high = divisor >> 32;
        u64 quot;

        if (high == 0) {
                u32 rem32;
                quot = div_u64_rem(dividend, divisor, &rem32);
                *remainder = rem32;
        } else {
                int n = fls(high);
                quot = div_u64(dividend >> n, divisor >> n);

                if (quot != 0)
                        quot--;

                *remainder = dividend - quot * divisor;
                if (*remainder >= divisor) {
                        quot++;
                        *remainder -= divisor;
                }
        }

        return quot;
}
static u64 div64_u64(u64 dividend, u64 divisor)
{
        u32 high = divisor >> 32;
        u64 quot;

        if (high == 0) {
                quot = div_u64(dividend, divisor);
        } else {
                int n = fls(high);
                quot = div_u64(dividend >> n, divisor >> n);

                if (quot != 0)
                        quot--;
                if ((dividend - quot * divisor) >= divisor)
                        quot++;
        }

        return quot;
}
#else
static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
        *remainder = dividend % divisor;
        return dividend / divisor;
}
static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
        *remainder = dividend % divisor;
        return dividend / divisor;
}
static inline u64 div64_u64(u64 dividend, u64 divisor)
{
        return dividend / divisor;
}
#endif

static __always_inline int fls64(u64 x)
{
        int bitpos = -1;
        /*
         * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
         * dest reg is undefined if x==0, but their CPU architect says its
         * value is written to set it to the same as before.
         */
        asm("bsrq %1,%q0"
            : "+r" (bitpos)
            : "rm" (x));
        return bitpos + 1;
}

static inline int ilog2(u64 n)
{
        return fls64(n) - 1;
}

#define swap(a, b) \
        do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

static noinline u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
        u64 scaled;

        for (;;) {
                /* Make sure "rtime" is the bigger of stime/rtime */
                if (stime > rtime)
                        swap(rtime, stime);

                /* Make sure 'total' fits in 32 bits */
                if (total >> 32)
                        goto drop_precision;

                /* Does rtime (and thus stime) fit in 32 bits? */
                if (!(rtime >> 32))
                        break;

                /* Can we just balance rtime/stime rather than dropping bits? */
                if (stime >> 31)
                        goto drop_precision;

                /* We can grow stime and shrink rtime and try to make them both 
fit */
                stime <<= 1;
                rtime >>= 1;
                continue;

drop_precision:
                /* We drop from rtime, it has more bits than stime */
                rtime >>= 1;
                total >>= 1;
        }

        /*
         * Make sure gcc understands that this is a 32x32->64 multiply,
         * followed by a 64/32->64 divide.
         */
        scaled = div_u64((stime * rtime), total);
        return scaled;
}

static noinline u64 oleg(u64 stime, u64 rtime, u64 total)
{
        u64 res = 0, div, rem;
        /* can stime * rtime overflow ? */
        while (ilog2(stime) + ilog2(rtime) > 62) {
                if (stime > rtime)
                        swap(rtime, stime);
                if (rtime >= total) {
                        /*
                         * (rtime * stime) / total is equal to
                         *
                         *      (rtime / total) * stime +
                         *      (rtime % total) * stime / total
                         *
                         * if nothing overflows. Can the 1st multiplication
                         * overflow? Yes, but we do not care: this can only
                         * happen if the end result can't fit in u64 anyway.
                         *
                         * So the code below does
                         *
                         *      res += (rtime / total) * stime;
                         *      rtime = rtime % total;
                         */
                        div = div64_u64_rem(rtime, total, &rem);
                        res += div * stime;
                        rtime = rem;
                        continue;
                }
                /* drop precision */
                rtime >>= 1;
                total >>= 1;
                if (!total)
                        return res;
        }
        return res + div64_u64(stime * rtime, total);
}

#define SEC     1000000000ULL

int main(int argc, char **argv)
{
        u64 u, s;
        u64 x;
        int i = -1; // one step not printed
        if (argc != 2)
        {
                printf("usage: %s <start_in_seconds>\n", argv[0]);
                return 1;
        }
        x = strtoull(argv[1], NULL, 0) * SEC;
        printf("start=%lld\n", x);

        for (i=0; i<50; i++, x += 2000) {
                printf("%lld = %lld * %lld / %lld\n", mul_u64_u64_div_u64(x, 
x+x, x+x), x, x+x, x+x);
                printf("%lld = %lld * %lld / %lld\n", scale_stime(x, x+x, x+x), 
x, x+x, x+x);
                printf("%lld = %lld * %lld / %lld\n", oleg(x, x+x, x+x), x, 
x+x, x+x);
                printf("---\n");
        }
}

Re: [PATCH] sched/cputime: make scale_stime() more precise

Reply via email to