Re: [Qemu-devel] [PATCH V3 for-1.6 2/5] block: Modify the throttling code to implement the leaky bucket algorithm.

Fam Zheng Thu, 25 Jul 2013 20:42:10 -0700

On Tue, 07/23 18:29, Benoît Canet wrote:
> This patch replace the previous algorithm by the well described leaky bucket
> algorithm: A bucket is filled by the incoming IOs and a periodic timer 
> decrement
> the counter to make the bucket leak. When a given threshold is reached the
> bucket is full and the IOs are hold.
> 
> In this patch the threshold is set to a default value to make the code behave
> like the previous implementation.
> 
> In the next patch the threshold will be exposed in QMP to let the user control
> the burstiness of the throttling.
> 
> Signed-off-by: Benoit Canet <ben...@irqsave.net>
> ---
>  block.c                   |  454 
> +++++++++++++++++++++++++++------------------
>  blockdev.c                |   71 +++++--
>  include/block/block_int.h |   15 +-
>  3 files changed, 339 insertions(+), 201 deletions(-)
> 
> diff --git a/block.c b/block.c
> index dc72643..f1cd9c0 100644
> --- a/block.c
> +++ b/block.c
> @@ -86,13 +86,6 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque);
>  static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
>      int64_t sector_num, int nb_sectors);
>  
> -static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
> -        bool is_write, double elapsed_time, uint64_t *wait);
> -static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
> -        double elapsed_time, uint64_t *wait);
> -static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
> -        bool is_write, int64_t *wait);
> -
>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>  
> @@ -101,6 +94,8 @@ static QLIST_HEAD(, BlockDriver) bdrv_drivers =
>  
>  /* If non-zero, use only whitelisted block drivers */
>  static int use_bdrv_whitelist;
> +/* boolean used to inform the throttling code that a bdrv_drain_all is 
> issued */
> +static bool draining;
>  
>  #ifdef _WIN32
>  static int is_windows_drive_prefix(const char *filename)
> @@ -129,28 +124,170 @@ void bdrv_io_limits_disable(BlockDriverState *bs)
>  
>      while (qemu_co_enter_next(&bs->throttled_reqs)) {
>      }
> +}
>  
> -    if (bs->block_timer) {
> -        qemu_del_timer(bs->block_timer);
> -        qemu_free_timer(bs->block_timer);
> -        bs->block_timer = NULL;
> +static void bdrv_make_bps_buckets_leak(BlockDriverState *bs, int64_t delta)
> +{
> +    int64_t *bytes = bs->leaky_buckets.bytes;
> +    int64_t read_leak, write_leak;
> +
> +    /* the limit apply to both reads and writes */
> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
> +        /* compute half the total leak */
> +        int64_t leak = ((bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL] * delta) /
> +                       NANOSECONDS_PER_SECOND);
> +        int remain = leak % 2;
> +        leak /= 2;
> +
> +        /* the read bucket is smaller than half the quantity to leak so take
> +         * care adding the leak difference to write leak
> +         */
> +        if (bytes[BLOCK_IO_LIMIT_READ] <= leak) {
> +            read_leak = bytes[BLOCK_IO_LIMIT_READ];
> +            write_leak = 2 * leak + remain - bytes[BLOCK_IO_LIMIT_READ];
> +        /* symetric case */
> +        } else if (bytes[BLOCK_IO_LIMIT_WRITE] <= leak) {
> +            write_leak = bytes[BLOCK_IO_LIMIT_WRITE];
> +            read_leak = 2 * leak + remain - bytes[BLOCK_IO_LIMIT_WRITE];
> +        /* both bucket above leak count use half the total leak for both */
> +        } else {
> +            write_leak = leak;
> +            read_leak = leak + remain;
> +        }
> +    /* else we consider that limits are separated */
> +    } else {
> +        read_leak = (bs->io_limits.bps[BLOCK_IO_LIMIT_READ] * delta) /
> +                    NANOSECONDS_PER_SECOND;
> +        write_leak = (bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE] * delta) /
> +                     NANOSECONDS_PER_SECOND;
> +    }
> +
> +    /* make the buckets leak */
> +    bytes[BLOCK_IO_LIMIT_READ]  = MAX(bytes[BLOCK_IO_LIMIT_READ] - read_leak,
> +                                      0);
> +    bytes[BLOCK_IO_LIMIT_WRITE] = MAX(bytes[BLOCK_IO_LIMIT_WRITE] - 
> write_leak,
> +                                      0);
> +}
> +
> +static void bdrv_make_iops_buckets_leak(BlockDriverState *bs, int64_t delta)
> +{
> +    double *ios = bs->leaky_buckets.ios;
> +    int64_t read_leak, write_leak;
> +
> +    /* the limit apply to both reads and writes */
> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> +        /* compute half the total leak */
> +        int64_t leak = ((bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL] * delta) /
> +                       NANOSECONDS_PER_SECOND);


You have the total leak here...

> +        int remain = leak % 2;
> +        leak /= 2;
> +
> +        /* the read bucket is smaller than half the quantity to leak so take
> +         * care adding the leak difference to write leak
> +         */
> +        if (ios[BLOCK_IO_LIMIT_READ] <= leak) {
> +            read_leak = ios[BLOCK_IO_LIMIT_READ];
> +            write_leak = 2 * leak + remain - ios[BLOCK_IO_LIMIT_READ];
> +        /* symetric case */
> +        } else if (ios[BLOCK_IO_LIMIT_WRITE] <= leak) {
> +            write_leak = ios[BLOCK_IO_LIMIT_WRITE];
> +            read_leak = 2 * leak + remain - ios[BLOCK_IO_LIMIT_WRITE];
> +        /* both bucket above leak count use half the total leak for both */
> +        } else {
> +            write_leak = leak;
> +            read_leak = leak + remain;
> +        }


I think it is easier to understand written like this:

           int64_t total_leak = ((bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL] * delta) /
                                 NANOSECONDS_PER_SECOND);
           if (ios[BLOCK_IO_LIMIT_READ] <= total_leak / 2) {
              read_leak = ios[BLOCK_IO_LIMIT_READ];
              write_leak = total_leak - read_leak;
           /* symmetric case */
           } else if (ios[BLOCK_IO_LIMIT_WRITE] <= total_leak / 2) {
              write_leak = ios[BLOCK_IO_LIMIT_WRITE];
              read_leak = total_leak - write_leak;
           /* both bucket above leak count use half the total leak for both */
           } else {
              write_leak = total_leak / 2;
              read_leak = (total_leak + 1) / 2;
           }

> +    /* else we consider that limits are separated */
> +    } else {
> +        read_leak = (bs->io_limits.iops[BLOCK_IO_LIMIT_READ] * delta) /
> +                    NANOSECONDS_PER_SECOND;
> +        write_leak = (bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE] * delta) /
> +                     NANOSECONDS_PER_SECOND;
> +    }
> +
> +    /* make the buckets leak */
> +    ios[BLOCK_IO_LIMIT_READ]  = MAX(ios[BLOCK_IO_LIMIT_READ] - read_leak, 0);
> +    ios[BLOCK_IO_LIMIT_WRITE] = MAX(ios[BLOCK_IO_LIMIT_WRITE] - write_leak, 
> 0);
> +}
> +
> +static void bdrv_leak_if_needed(BlockDriverState *bs)
> +{
> +    int64_t now;
> +    int64_t delta;
> +
> +    if (!bs->must_leak) {
> +        return;
> +    }
> +
> +    bs->must_leak = false;
> +
> +    now = qemu_get_clock_ns(rt_clock);
> +    delta = now - bs->previous_leak;
> +    bs->previous_leak = now;
> +
> +    bdrv_make_bps_buckets_leak(bs, delta);
> +    bdrv_make_iops_buckets_leak(bs, delta);
> +}
> +
> +static void bdrv_block_timer_disable(BlockDriverState *bs)
> +{
> +    if (!bs->block_timer) {
> +        return;
>      }
>  
> -    bs->slice_start = 0;
> -    bs->slice_end   = 0;
> +    qemu_del_timer(bs->block_timer);
> +    qemu_free_timer(bs->block_timer);
> +    bs->block_timer = NULL;
> +}
> +
> +static bool bdrv_throttling_is_iddle(BlockDriverState *bs)

I don't quite understand the wording here: is "iddle" meant to be "idle"?

> +{
> +    int64_t delta = qemu_get_clock_ns(rt_clock) - bs->previous_leak;
> +
> +    if (delta < BLOCK_IO_THROTTLE_PERIOD * 2) {
> +        return false;
> +    }
> +
> +    /* iddle */
> +    return true;
>  }
>  
> +/* This callback is the timer in charge of making the leaky buckets leak */
>  static void bdrv_block_timer(void *opaque)

It would be more readable to me if you could rename it to
bdrv_clock_timer_cb.

>  {
>      BlockDriverState *bs = opaque;
>  
> +    /* disable throttling time on iddle for economy purpose */
> +    if (bdrv_throttling_is_iddle(bs)) {
> +        bdrv_block_timer_disable(bs);
> +        return;
> +    }
> +
> +    /* rearm the timer */
> +    qemu_mod_timer(bs->block_timer,
> +                   qemu_get_clock_ns(vm_clock) +
> +                   BLOCK_IO_THROTTLE_PERIOD);
> +
> +    bs->must_leak = true;
>      qemu_co_enter_next(&bs->throttled_reqs);
>  }
>  
> +static void bdrv_block_timer_enable(BlockDriverState *bs)
> +{
> +    if (bs->block_timer) {
> +        return;
> +    }
> +
> +    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
> +    bs->previous_leak = qemu_get_clock_ns(rt_clock);
> +    qemu_mod_timer(bs->block_timer,
> +                   qemu_get_clock_ns(vm_clock) +
> +                   BLOCK_IO_THROTTLE_PERIOD);
> +}
> +
>  void bdrv_io_limits_enable(BlockDriverState *bs)
>  {
>      qemu_co_queue_init(&bs->throttled_reqs);
> -    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
>      bs->io_limits_enabled = true;
>  }
>  
> @@ -165,15 +302,118 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs)
>           || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
>  }
>  
> +/* This function check if the correct bandwith threshold has been exceeded

What does the "correct bandwidth threshold" mean?

And s/bandwith/bandwidth/, series wide.

> + *
> + * @is_write: true if the current IO is a write, false if it's a read
> + * @ret:      true if threshold has been exceeded else false
> + */
> +static bool bdrv_is_bps_threshold_exceeded(BlockDriverState *bs, bool 
> is_write)
> +{
> +    /* limit is on total read + write bps : do the sum and compare with total
> +     * threshold
> +     */
> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
> +        int64_t bytes = bs->leaky_buckets.bytes[BLOCK_IO_LIMIT_READ] +
> +                        bs->leaky_buckets.bytes[BLOCK_IO_LIMIT_WRITE];
> +        return bs->io_limits.bps_threshold[BLOCK_IO_LIMIT_TOTAL] < bytes;
> +    }
> +
> +    /* check wether the threshold corresponding to the current io type (read,
> +     * write) has been exceeded
> +     */
> +    if (bs->io_limits.bps[is_write]) {

It looks dangerous to use is_write as an index into the array.

> +        return bs->io_limits.bps_threshold[is_write] <
> +               bs->leaky_buckets.bytes[is_write];
> +    }
> +
> +    /* no limit */
> +    return false;
> +}
> +
> +/* This function check if the correct iops threshold has been exceeded
> + *
> + * @is_write: true if the current IO is a write, false if it's a read
> + * @ret:      true if threshold has been exceeded else false
> + */
> +static bool bdrv_is_iops_threshold_exceeded(BlockDriverState *bs, bool 
> is_write)
> +{
> +    /* limit is on total read + write iops : do the sum and compare with 
> total
> +     * threshold
> +     */
> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> +        double ios = bs->leaky_buckets.ios[BLOCK_IO_LIMIT_READ] +
> +                     bs->leaky_buckets.ios[BLOCK_IO_LIMIT_WRITE];
> +        return bs->io_limits.iops_threshold[BLOCK_IO_LIMIT_TOTAL] < ios;
> +    }
> +
> +    /* check wether the threshold corresponding to the current io type (read,
> +     * write) has been exceeded
> +     */
> +    if (bs->io_limits.iops[is_write]) {
> +        return bs->io_limits.iops_threshold[is_write] <
> +               bs->leaky_buckets.ios[is_write];
> +    }
> +
> +    /* no limit */
> +    return false;
> +}
> +
> +/* This function check if any bandwith or iops threshold has been exceeded
> + *
> + * @nb_sectors: the number of sectors of the current IO
> + * @is_write:   true if the current IO is a write, false if it's a read
> + * @ret:        true if any threshold has been exceeded else false
> + */
> +static bool bdrv_is_any_threshold_exceeded(BlockDriverState *bs, int 
> nb_sectors,
> +                                           bool is_write)
> +{
> +    bool bps_ret, iops_ret;
> +
> +    /* check if any bandwith or per IO threshold has been exceeded */
> +    bps_ret = bdrv_is_bps_threshold_exceeded(bs, is_write);
> +    iops_ret = bdrv_is_iops_threshold_exceeded(bs, is_write);
> +
> +    /* if so the IO will be blocked so do not account it and return true
> +     * also return false if a bdrv_drain_all is in progress
> +     */
> +    if (!draining && (bps_ret || iops_ret)) {
> +        return true;
> +    }
> +
> +    /* NOTE: the counter can go above the threshold when authorizing an IO.
> +     *       At next call the code will punish the guest by blocking the
> +     *       next IO until the counter has been decremented below the 
> threshold.
> +     *       This way if a guest issue a jumbo IO bigger than the threshold 
> it
> +     *       will have a chance no be authorized and will not result in a 
> guest
> +     *       IO deadlock.
> +     */
> +
> +    /* the IO is authorized so do the accounting and return false */
> +    bs->leaky_buckets.bytes[is_write] += (int64_t)nb_sectors *
> +                                         BDRV_SECTOR_SIZE;
> +    bs->leaky_buckets.ios[is_write]++;
> +
> +    return false;
> +}
> +
>  static void bdrv_io_limits_intercept(BlockDriverState *bs,
>                                       bool is_write, int nb_sectors)
>  {
> -    int64_t wait_time = -1;
> +    /* enable block timer if needed when intercepting I/Os */
> +    if (!bs->block_timer) {

bdrv_block_timer_enable() already checks bs->block_timer, so this check is redundant.

> +        bdrv_block_timer_enable(bs);
> +    }
>  
> +    bdrv_leak_if_needed(bs);
> +    /* if some IOs are already queued because the bucket is full put the 
> current
> +     * IO at the end of the queue (FIFO)
> +     */
>      if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
>          qemu_co_queue_wait(&bs->throttled_reqs);
>      }
>  
> +    bdrv_leak_if_needed(bs);
> +
>      /* In fact, we hope to keep each request's timing, in FIFO mode. The next
>       * throttled requests will not be dequeued until the current request is
>       * allowed to be serviced. So if the current request still exceeds the
> @@ -181,13 +421,19 @@ static void bdrv_io_limits_intercept(BlockDriverState 
> *bs,
>       * be still in throttled_reqs queue.
>       */
>  
> -    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
> -        qemu_mod_timer(bs->block_timer,
> -                       wait_time + qemu_get_clock_ns(vm_clock));
> +    /* if a threshold is exceeded the leaky bucket is full so the code put 
> the
> +     * IO in the throttle_reqs queue until the bucket has leaked enough to be
> +     * not full
> +     */
> +    while (bdrv_is_any_threshold_exceeded(bs, nb_sectors, is_write)) {
> +        bdrv_leak_if_needed(bs);
>          qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
> +        bdrv_leak_if_needed(bs);
>      }
>  
> +    bdrv_leak_if_needed(bs);
>      qemu_co_queue_next(&bs->throttled_reqs);
> +    bdrv_leak_if_needed(bs);
>  }
>  
>  /* check if the path starts with "<protocol>:" */
> @@ -1439,6 +1685,9 @@ void bdrv_drain_all(void)
>      BlockDriverState *bs;
>      bool busy;
>  
> +    /* tell the throttling code we are draining */
> +    draining = true;
> +
>      do {
>          busy = qemu_aio_wait();
>  
> @@ -1457,6 +1706,8 @@ void bdrv_drain_all(void)
>          assert(QLIST_EMPTY(&bs->tracked_requests));
>          assert(qemu_co_queue_empty(&bs->throttled_reqs));
>      }
> +
> +    draining = false;
>  }
>  
>  /* make a BlockDriverState anonymous by removing from bdrv_state list.
> @@ -1492,9 +1743,7 @@ static void bdrv_move_feature_fields(BlockDriverState 
> *bs_dest,
>      bs_dest->enable_write_cache = bs_src->enable_write_cache;
>  
>      /* i/o timing parameters */
> -    bs_dest->slice_start        = bs_src->slice_start;
> -    bs_dest->slice_end          = bs_src->slice_end;
> -    bs_dest->slice_submitted    = bs_src->slice_submitted;
> +    bs_dest->leaky_buckets      = bs_src->leaky_buckets;
>      bs_dest->io_limits          = bs_src->io_limits;
>      bs_dest->throttled_reqs     = bs_src->throttled_reqs;
>      bs_dest->block_timer        = bs_src->block_timer;
> @@ -3551,169 +3800,6 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
>      acb->aiocb_info->cancel(acb);
>  }
>  
> -/* block I/O throttling */
> -static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
> -                 bool is_write, double elapsed_time, uint64_t *wait)
> -{
> -    uint64_t bps_limit = 0;
> -    uint64_t extension;
> -    double   bytes_limit, bytes_base, bytes_res;
> -    double   slice_time, wait_time;
> -
> -    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
> -        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
> -    } else if (bs->io_limits.bps[is_write]) {
> -        bps_limit = bs->io_limits.bps[is_write];
> -    } else {
> -        if (wait) {
> -            *wait = 0;
> -        }
> -
> -        return false;
> -    }
> -
> -    slice_time = bs->slice_end - bs->slice_start;
> -    slice_time /= (NANOSECONDS_PER_SECOND);
> -    bytes_limit = bps_limit * slice_time;
> -    bytes_base  = bs->slice_submitted.bytes[is_write];
> -    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
> -        bytes_base += bs->slice_submitted.bytes[!is_write];
> -    }
> -
> -    /* bytes_base: the bytes of data which have been read/written; and
> -     *             it is obtained from the history statistic info.
> -     * bytes_res: the remaining bytes of data which need to be read/written.
> -     * (bytes_base + bytes_res) / bps_limit: used to calcuate
> -     *             the total time for completing reading/writting all data.
> -     */
> -    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> -
> -    if (bytes_base + bytes_res <= bytes_limit) {
> -        if (wait) {
> -            *wait = 0;
> -        }
> -
> -        return false;
> -    }
> -
> -    /* Calc approx time to dispatch */
> -    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
> -
> -    /* When the I/O rate at runtime exceeds the limits,
> -     * bs->slice_end need to be extended in order that the current statistic
> -     * info can be kept until the timer fire, so it is increased and tuned
> -     * based on the result of experiment.
> -     */
> -    extension = wait_time * NANOSECONDS_PER_SECOND;
> -    extension = DIV_ROUND_UP(extension, BLOCK_IO_SLICE_TIME) *
> -                BLOCK_IO_SLICE_TIME;
> -    bs->slice_end += extension;
> -    if (wait) {
> -        *wait = wait_time * NANOSECONDS_PER_SECOND;
> -    }
> -
> -    return true;
> -}
> -
> -static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
> -                             double elapsed_time, uint64_t *wait)
> -{
> -    uint64_t iops_limit = 0;
> -    double   ios_limit, ios_base;
> -    double   slice_time, wait_time;
> -
> -    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> -        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
> -    } else if (bs->io_limits.iops[is_write]) {
> -        iops_limit = bs->io_limits.iops[is_write];
> -    } else {
> -        if (wait) {
> -            *wait = 0;
> -        }
> -
> -        return false;
> -    }
> -
> -    slice_time = bs->slice_end - bs->slice_start;
> -    slice_time /= (NANOSECONDS_PER_SECOND);
> -    ios_limit  = iops_limit * slice_time;
> -    ios_base   = bs->slice_submitted.ios[is_write];
> -    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> -        ios_base += bs->slice_submitted.ios[!is_write];
> -    }
> -
> -    if (ios_base + 1 <= ios_limit) {
> -        if (wait) {
> -            *wait = 0;
> -        }
> -
> -        return false;
> -    }
> -
> -    /* Calc approx time to dispatch, in seconds */
> -    wait_time = (ios_base + 1) / iops_limit;
> -    if (wait_time > elapsed_time) {
> -        wait_time = wait_time - elapsed_time;
> -    } else {
> -        wait_time = 0;
> -    }
> -
> -    /* Exceeded current slice, extend it by another slice time */
> -    bs->slice_end += BLOCK_IO_SLICE_TIME;
> -    if (wait) {
> -        *wait = wait_time * NANOSECONDS_PER_SECOND;
> -    }
> -
> -    return true;
> -}
> -
> -static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
> -                           bool is_write, int64_t *wait)
> -{
> -    int64_t  now, max_wait;
> -    uint64_t bps_wait = 0, iops_wait = 0;
> -    double   elapsed_time;
> -    int      bps_ret, iops_ret;
> -
> -    now = qemu_get_clock_ns(vm_clock);
> -    if (now > bs->slice_end) {
> -        bs->slice_start = now;
> -        bs->slice_end   = now + BLOCK_IO_SLICE_TIME;
> -        memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted));
> -    }
> -
> -    elapsed_time  = now - bs->slice_start;
> -    elapsed_time  /= (NANOSECONDS_PER_SECOND);
> -
> -    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
> -                                      is_write, elapsed_time, &bps_wait);
> -    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
> -                                      elapsed_time, &iops_wait);
> -    if (bps_ret || iops_ret) {
> -        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
> -        if (wait) {
> -            *wait = max_wait;
> -        }
> -
> -        now = qemu_get_clock_ns(vm_clock);
> -        if (bs->slice_end < now + max_wait) {
> -            bs->slice_end = now + max_wait;
> -        }
> -
> -        return true;
> -    }
> -
> -    if (wait) {
> -        *wait = 0;
> -    }
> -
> -    bs->slice_submitted.bytes[is_write] += (int64_t)nb_sectors *
> -                                           BDRV_SECTOR_SIZE;
> -    bs->slice_submitted.ios[is_write]++;
> -
> -    return false;
> -}
> -
>  /**************************************************************/
>  /* async block device emulation */
>  
> diff --git a/blockdev.c b/blockdev.c
> index c5abd65..491e4d0 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -280,10 +280,25 @@ static int parse_block_error_action(const char *buf, 
> bool is_read)
>      }
>  }
>  
> +static bool check_io_limit(int64_t limit)
> +{
> +    if (!limit) {
> +        return false;
> +    }
> +
> +    if (limit < (THROTTLE_HZ * 2)) {
> +        return true;
> +    }
> +
> +    return false;
> +}
> +
>  static bool do_check_io_limits(BlockIOLimit *io_limits, Error **errp)
>  {
>      bool bps_flag;
>      bool iops_flag;
> +    bool bps_threshold_flag;
> +    bool iops_threshold_flag;
>  
>      assert(io_limits);
>  
> @@ -299,13 +314,30 @@ static bool do_check_io_limits(BlockIOLimit *io_limits, 
> Error **errp)
>          return false;
>      }
>  
> -    if (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] < 0 ||
> -        io_limits->bps[BLOCK_IO_LIMIT_WRITE] < 0 ||
> -        io_limits->bps[BLOCK_IO_LIMIT_READ] < 0 ||
> -        io_limits->iops[BLOCK_IO_LIMIT_TOTAL] < 0 ||
> -        io_limits->iops[BLOCK_IO_LIMIT_WRITE] < 0 ||
> -        io_limits->iops[BLOCK_IO_LIMIT_READ] < 0) {
> -        error_setg(errp, "bps and iops values must be 0 or greater");
> +    bps_threshold_flag  =
> +        (io_limits->bps_threshold[BLOCK_IO_LIMIT_TOTAL] != 0)
> +         && ((io_limits->bps_threshold[BLOCK_IO_LIMIT_READ] != 0)
> +         || (io_limits->bps_threshold[BLOCK_IO_LIMIT_WRITE] != 0));
> +    iops_threshold_flag =
> +        (io_limits->iops_threshold[BLOCK_IO_LIMIT_TOTAL] != 0)
> +         && ((io_limits->iops_threshold[BLOCK_IO_LIMIT_READ] != 0)
> +         || (io_limits->iops_threshold[BLOCK_IO_LIMIT_WRITE] != 0));
> +    if (bps_threshold_flag || iops_threshold_flag) {
> +        error_setg(errp, "bps_threshold(iops_threshold) and "
> +            "bps_rd_threshold/bps_wr_threshold"
> +            "(iops_rd_threshold/iops_wr_threshold) "
> +            "cannot be used at the same time");
> +        return false;
> +    }
> +
> +    if (check_io_limit(io_limits->bps[BLOCK_IO_LIMIT_TOTAL]) ||
> +        check_io_limit(io_limits->bps[BLOCK_IO_LIMIT_WRITE]) ||
> +        check_io_limit(io_limits->bps[BLOCK_IO_LIMIT_READ]) ||
> +        check_io_limit(io_limits->iops[BLOCK_IO_LIMIT_TOTAL]) ||
> +        check_io_limit(io_limits->iops[BLOCK_IO_LIMIT_WRITE]) ||
> +        check_io_limit(io_limits->iops[BLOCK_IO_LIMIT_READ])) {
> +        error_setg(errp, "bps and iops values must be %i or greater",
> +                   THROTTLE_HZ * 2);
>          return false;
>      }
>  
> @@ -497,6 +529,18 @@ DriveInfo *drive_init(QemuOpts *all_opts, 
> BlockInterfaceType block_default_type)
>                             qemu_opt_get_number(opts, "iops_rd", 0);
>      io_limits.iops[BLOCK_IO_LIMIT_WRITE] =
>                             qemu_opt_get_number(opts, "iops_wr", 0);
> +    io_limits.bps_threshold[BLOCK_IO_LIMIT_TOTAL] =
> +                           io_limits.bps[BLOCK_IO_LIMIT_TOTAL] / THROTTLE_HZ;
> +    io_limits.bps_threshold[BLOCK_IO_LIMIT_READ]  =
> +                           io_limits.bps[BLOCK_IO_LIMIT_READ] / THROTTLE_HZ;
> +    io_limits.bps_threshold[BLOCK_IO_LIMIT_WRITE] =
> +                           io_limits.bps[BLOCK_IO_LIMIT_WRITE] / THROTTLE_HZ;
> +    io_limits.iops_threshold[BLOCK_IO_LIMIT_TOTAL] =
> +                           io_limits.iops[BLOCK_IO_LIMIT_TOTAL] / 
> THROTTLE_HZ;
> +    io_limits.iops_threshold[BLOCK_IO_LIMIT_READ]  =
> +                           io_limits.iops[BLOCK_IO_LIMIT_READ] / THROTTLE_HZ;
> +    io_limits.iops_threshold[BLOCK_IO_LIMIT_WRITE] =
> +                           io_limits.iops[BLOCK_IO_LIMIT_WRITE] / 
> THROTTLE_HZ;
>  
>      if (!do_check_io_limits(&io_limits, &error)) {
>          error_report("%s", error_get_pretty(error));
> @@ -1198,6 +1242,12 @@ void qmp_block_set_io_throttle(const char *device, 
> int64_t bps, int64_t bps_rd,
>      io_limits.iops[BLOCK_IO_LIMIT_TOTAL]= iops;
>      io_limits.iops[BLOCK_IO_LIMIT_READ] = iops_rd;
>      io_limits.iops[BLOCK_IO_LIMIT_WRITE]= iops_wr;
> +    io_limits.bps_threshold[BLOCK_IO_LIMIT_TOTAL] = bps / THROTTLE_HZ;
> +    io_limits.bps_threshold[BLOCK_IO_LIMIT_READ]  = bps_rd / THROTTLE_HZ;
> +    io_limits.bps_threshold[BLOCK_IO_LIMIT_WRITE] = bps_wr / THROTTLE_HZ;
> +    io_limits.iops_threshold[BLOCK_IO_LIMIT_TOTAL] = iops / THROTTLE_HZ;
> +    io_limits.iops_threshold[BLOCK_IO_LIMIT_READ]  = iops_rd / THROTTLE_HZ;
> +    io_limits.iops_threshold[BLOCK_IO_LIMIT_WRITE] = iops_wr / THROTTLE_HZ;
>  
>      if (!do_check_io_limits(&io_limits, errp)) {
>          return;
> @@ -1209,11 +1259,10 @@ void qmp_block_set_io_throttle(const char *device, 
> int64_t bps, int64_t bps_rd,
>          bdrv_io_limits_enable(bs);
>      } else if (bs->io_limits_enabled && !bdrv_io_limits_enabled(bs)) {
>          bdrv_io_limits_disable(bs);
> -    } else {
> -        if (bs->block_timer) {
> -            qemu_mod_timer(bs->block_timer, qemu_get_clock_ns(vm_clock));
> -        }
>      }
> +
> +    /* reset leaky bucket to get the system in a known state */
> +    memset(&bs->leaky_buckets, 0, sizeof(bs->leaky_buckets));
>  }
>  
>  int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index c6ac871..e32ad1f 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -43,8 +43,9 @@
>  #define BLOCK_IO_LIMIT_WRITE    1
>  #define BLOCK_IO_LIMIT_TOTAL    2
>  
> -#define BLOCK_IO_SLICE_TIME     100000000
>  #define NANOSECONDS_PER_SECOND  1000000000.0
> +#define THROTTLE_HZ 1
> +#define BLOCK_IO_THROTTLE_PERIOD (NANOSECONDS_PER_SECOND / THROTTLE_HZ)
>  
>  #define BLOCK_OPT_SIZE              "size"
>  #define BLOCK_OPT_ENCRYPT           "encryption"
> @@ -73,11 +74,13 @@ typedef struct BdrvTrackedRequest {
>  typedef struct BlockIOLimit {
>      int64_t bps[3];
>      int64_t iops[3];
> +    int64_t bps_threshold[3];
> +    int64_t iops_threshold[3];
>  } BlockIOLimit;
>  
>  typedef struct BlockIOBaseValue {
> -    uint64_t bytes[2];
> -    uint64_t ios[2];
> +    int64_t bytes[2];
> +    double  ios[2];
>  } BlockIOBaseValue;
>  
>  struct BlockDriver {
> @@ -264,10 +267,10 @@ struct BlockDriverState {
>      unsigned int copy_on_read_in_flight;
>  
>      /* the time for latest disk I/O */
> -    int64_t slice_start;
> -    int64_t slice_end;
>      BlockIOLimit io_limits;
> -    BlockIOBaseValue slice_submitted;
> +    BlockIOBaseValue leaky_buckets;
> +    int64_t      previous_leak;
> +    bool         must_leak;
>      CoQueue      throttled_reqs;
>      QEMUTimer    *block_timer;
>      bool         io_limits_enabled;
> -- 
> 1.7.10.4
> 
> 

-- 
Fam

Re: [Qemu-devel] [PATCH V3 for-1.6 2/5] block: Modify the throttling code to implement the leaky bucket algorithm.

Reply via email to