I want to give this a more thorough read, but here's a dump of my comments so far...

> 1) Cascading is avoided (except for extreme long time timers)

> + * Note: This implementation might be suboptimal vs. timers enqueued in the
> + *    cascade level because we do not look at the timers to figure out when
> + *    they really expire. So for now, we just treat the cascading timers
> + *    like any other timer. If each cascading bucket has a timer, we wake
> + *    up with the granularity of the last level.

You've eliminated cascading entirely, so these comments are stale, no?

> +# define BASE_RND_DN(n)        ((n) & ~BASE_MASK)
> +# define BASE_RND_UP(n)        (BASE_RND_DN(n) + BASE_INCR)

Er... is this correct?  Usually I'd expect the result of rounding up
to occasionally be equal to the original (e.g. BASE_RND_UP(0) == 0), but
this doesn't have that property.

Given that you don't use BASE_RND_DN anywhere, maybe shrink this to one 
definition?
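
If you want the usual round-up semantics, one definition would do it
(a sketch, assuming BASE_MASK == BASE_INCR - 1 as the names suggest):

/* Rounds multiples to themselves: BASE_RND_UP(0) == 0 */
# define BASE_RND_UP(n)        (((n) + BASE_MASK) & ~BASE_MASK)

That way BASE_RND_UP(BASE_INCR) == BASE_INCR, while BASE_RND_UP(1) ==
BASE_INCR as before.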


Looking at the __next_timer_interrupt function, it seems that it does
a lot more work than necessary.  Once a timeout has been found in the
current level, the range which must be searched in the following level
is limited to 1/LVL_CLK_DIV of the range in the current level.

That quickly tapers off to zero and the search can stop.

In particular, if a timeout is found at level 0 between the immediately
next bucket and the next bucket which is a multiple of LVL_CLK_DIV,
inclusive (1 <= x <= 8 buckets depending on the lsbits of base->clk),
then the search can stop immediately.
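
(To put numbers on the taper: assuming LVL_CLK_DIV is 8 and 64 buckets
per level, a worst-case 64-bucket search at level 0 leaves at most 8
buckets to check at level 1, at most one at level 2, and nothing beyond
that.)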


This is hairy code and the following untested code is probably buggy,
but the basic idea is:

/*
 * Search span bits beginning at (offset + clk) for a set bit, wrapping
 * at the end of the level.  Return the position of the bit relative to
 * (offset + clk), or >= span if none.
 */
static unsigned next_pending_bucket(struct timer_base *base, unsigned offset,
        unsigned clk, unsigned span)
{
        unsigned pos;

        if (clk + span <= LVL_SIZE) {
                /* No wrap, simple search */
                clk += offset;
                pos = find_next_bit(base->pending_map, clk + span, clk);
                return pos - clk;
        } else {
                /* Search wraps */
                clk += offset;
                pos = find_next_bit(base->pending_map, offset + LVL_SIZE, clk);
                if (pos < offset + LVL_SIZE)
                        return pos - clk;
                /* Not found before the end of the level; wrap to its start */
                clk -= LVL_SIZE;
                pos = find_next_bit(base->pending_map, clk + span, offset);
                return pos - clk;
        }
}

/* Find the next expiring timer list >= base->clk */
static unsigned long __next_timer_interrupt(struct timer_base *base)
{
        unsigned long clk, end, next;
        unsigned lvl, offset, bit;

        /* Phase 1: Find the starting level */
        bit = find_first_bit(base->pending_map, WHEEL_SIZE);
        if (unlikely(bit >= WHEEL_SIZE)) {
                /* No pending timers */
                next = base->clk + NEXT_TIMER_MAX_DELTA;
                goto done;
        }
        lvl = (unsigned)bit / LVL_SIZE;
        clk = (base->clk + LVL_GRAN(lvl) - 1) >> LVL_SHIFT(lvl);
        offset = (bit | LVL_MASK) + 1;  /* End of the current level */

        /* Phase 2: Find the next-expiring list in this level */
        if ((clk & LVL_MASK) > (bit & LVL_MASK)) {
                unsigned b = offset - LVL_SIZE + (clk & LVL_MASK);

                b = find_next_bit(base->pending_map, offset, b);
                if (b < offset)
                        bit = b;
        }
        end = clk + ((bit - clk) & LVL_MASK);   /* The next expiration time */
        next = end << LVL_SHIFT(lvl);

        /*
         * At this point, clk is the current time, in units of the current
         * level's granularity, and rounded up.  end is the time of the
         * earliest expiration found so far, in the same units and rounded
         * down.  next is the unrounded expiration time in jiffies.
         *
         * Phase 3: Search higher levels for expirations in [clk, end).
         */
        while (++lvl < LVL_DEPTH) {
                unsigned b;

                clk = (clk + LVL_CLK_MASK) >> LVL_CLK_SHIFT;
                end >>= LVL_CLK_SHIFT;
                if (clk >= end)
                        break;
                b = next_pending_bucket(base, offset, clk & LVL_MASK, end - clk);
                if (b < end - clk) {
                        end = clk + b;
                        next = end << LVL_SHIFT(lvl);
                }
                offset += LVL_SIZE;
        }
done:
        /* Assumes the caller took base->lock before calling in */
        spin_unlock(&base->lock);
        return next;
}
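
For context, I'm assuming the caller pairs with that unlock roughly like
this (a guess at the shape, not the patch's actual code; BASE_STD and the
jiffies-to-nanoseconds conversion at the end are my assumptions):

u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
        unsigned long nextevt;

        spin_lock(&base->lock);
        nextevt = __next_timer_interrupt(base);  /* drops base->lock */

        if (time_before_eq(nextevt, basej))
                return basem;
        return basem + (u64)(nextevt - basej) * TICK_NSEC;
}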
