On Tue, 07/23 18:29, Benoît Canet wrote: > This patch replace the previous algorithm by the well described leaky bucket > algorithm: A bucket is filled by the incoming IOs and a periodic timer > decrement > the counter to make the bucket leak. When a given threshold is reached the > bucket is full and the IOs are hold. > > In this patch the threshold is set to a default value to make the code behave > like the previous implementation. > > In the next patch the threshold will be exposed in QMP to let the user control > the burstiness of the throttling. > > Signed-off-by: Benoit Canet <ben...@irqsave.net> > --- > block.c | 454 > +++++++++++++++++++++++++++------------------ > blockdev.c | 71 +++++-- > include/block/block_int.h | 15 +- > 3 files changed, 339 insertions(+), 201 deletions(-) > > diff --git a/block.c b/block.c > index dc72643..f1cd9c0 100644 > --- a/block.c > +++ b/block.c > @@ -86,13 +86,6 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque); > static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, > int64_t sector_num, int nb_sectors); > > -static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, > - bool is_write, double elapsed_time, uint64_t *wait); > -static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, > - double elapsed_time, uint64_t *wait); > -static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, > - bool is_write, int64_t *wait); > - > static QTAILQ_HEAD(, BlockDriverState) bdrv_states = > QTAILQ_HEAD_INITIALIZER(bdrv_states); > > @@ -101,6 +94,8 @@ static QLIST_HEAD(, BlockDriver) bdrv_drivers = > > /* If non-zero, use only whitelisted block drivers */ > static int use_bdrv_whitelist; > +/* boolean used to inform the throttling code that a bdrv_drain_all is > issued */ > +static bool draining; > > #ifdef _WIN32 > static int is_windows_drive_prefix(const char *filename) > @@ -129,28 +124,170 @@ void bdrv_io_limits_disable(BlockDriverState *bs) > > while 
(qemu_co_enter_next(&bs->throttled_reqs)) { > } > +} > > - if (bs->block_timer) { > - qemu_del_timer(bs->block_timer); > - qemu_free_timer(bs->block_timer); > - bs->block_timer = NULL; > +static void bdrv_make_bps_buckets_leak(BlockDriverState *bs, int64_t delta) > +{ > + int64_t *bytes = bs->leaky_buckets.bytes; > + int64_t read_leak, write_leak; > + > + /* the limit apply to both reads and writes */ > + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { > + /* compute half the total leak */ > + int64_t leak = ((bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL] * delta) / > + NANOSECONDS_PER_SECOND); > + int remain = leak % 2; > + leak /= 2; > + > + /* the read bucket is smaller than half the quantity to leak so take > + * care adding the leak difference to write leak > + */ > + if (bytes[BLOCK_IO_LIMIT_READ] <= leak) { > + read_leak = bytes[BLOCK_IO_LIMIT_READ]; > + write_leak = 2 * leak + remain - bytes[BLOCK_IO_LIMIT_READ]; > + /* symetric case */ > + } else if (bytes[BLOCK_IO_LIMIT_WRITE] <= leak) { > + write_leak = bytes[BLOCK_IO_LIMIT_WRITE]; > + read_leak = 2 * leak + remain - bytes[BLOCK_IO_LIMIT_WRITE]; > + /* both bucket above leak count use half the total leak for both */ > + } else { > + write_leak = leak; > + read_leak = leak + remain; > + } > + /* else we consider that limits are separated */ > + } else { > + read_leak = (bs->io_limits.bps[BLOCK_IO_LIMIT_READ] * delta) / > + NANOSECONDS_PER_SECOND; > + write_leak = (bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE] * delta) / > + NANOSECONDS_PER_SECOND; > + } > + > + /* make the buckets leak */ > + bytes[BLOCK_IO_LIMIT_READ] = MAX(bytes[BLOCK_IO_LIMIT_READ] - read_leak, > + 0); > + bytes[BLOCK_IO_LIMIT_WRITE] = MAX(bytes[BLOCK_IO_LIMIT_WRITE] - > write_leak, > + 0); > +} > + > +static void bdrv_make_iops_buckets_leak(BlockDriverState *bs, int64_t delta) > +{ > + double *ios = bs->leaky_buckets.ios; > + int64_t read_leak, write_leak; > + > + /* the limit apply to both reads and writes */ > + if 
(bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { > + /* compute half the total leak */ > + int64_t leak = ((bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL] * delta) / > + NANOSECONDS_PER_SECOND);
You have the total leak here... > + int remain = leak % 2; > + leak /= 2; > + > + /* the read bucket is smaller than half the quantity to leak so take > + * care adding the leak difference to write leak > + */ > + if (ios[BLOCK_IO_LIMIT_READ] <= leak) { > + read_leak = ios[BLOCK_IO_LIMIT_READ]; > + write_leak = 2 * leak + remain - ios[BLOCK_IO_LIMIT_READ]; > + /* symetric case */ > + } else if (ios[BLOCK_IO_LIMIT_WRITE] <= leak) { > + write_leak = ios[BLOCK_IO_LIMIT_WRITE]; > + read_leak = 2 * leak + remain - ios[BLOCK_IO_LIMIT_WRITE]; > + /* both bucket above leak count use half the total leak for both */ > + } else { > + write_leak = leak; > + read_leak = leak + remain; > + } I think it is easier to understand written like this: int64_t total_leak = ((bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL] * delta) / NANOSECONDS_PER_SECOND); if (ios[BLOCK_IO_LIMIT_READ] <= total_leak / 2) { read_leak = ios[BLOCK_IO_LIMIT_READ]; write_leak = total_leak - read_leak; /* symetric case */ } else if (ios[BLOCK_IO_LIMIT_WRITE] <= total_leak / 2) { write_leak = ios[BLOCK_IO_LIMIT_WRITE]; read_leak = total_leak - write_leak; /* both bucket above leak count use half the total leak for both */ } else { write_leak = total_leak / 2; read_leak = (total_leak + 1) / 2; } > + /* else we consider that limits are separated */ > + } else { > + read_leak = (bs->io_limits.iops[BLOCK_IO_LIMIT_READ] * delta) / > + NANOSECONDS_PER_SECOND; > + write_leak = (bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE] * delta) / > + NANOSECONDS_PER_SECOND; > + } > + > + /* make the buckets leak */ > + ios[BLOCK_IO_LIMIT_READ] = MAX(ios[BLOCK_IO_LIMIT_READ] - read_leak, 0); > + ios[BLOCK_IO_LIMIT_WRITE] = MAX(ios[BLOCK_IO_LIMIT_WRITE] - write_leak, > 0); > +} > + > +static void bdrv_leak_if_needed(BlockDriverState *bs) > +{ > + int64_t now; > + int64_t delta; > + > + if (!bs->must_leak) { > + return; > + } > + > + bs->must_leak = false; > + > + now = qemu_get_clock_ns(rt_clock); > + delta = now - bs->previous_leak; > + 
bs->previous_leak = now; > + > + bdrv_make_bps_buckets_leak(bs, delta); > + bdrv_make_iops_buckets_leak(bs, delta); > +} > + > +static void bdrv_block_timer_disable(BlockDriverState *bs) > +{ > + if (!bs->block_timer) { > + return; > } > > - bs->slice_start = 0; > - bs->slice_end = 0; > + qemu_del_timer(bs->block_timer); > + qemu_free_timer(bs->block_timer); > + bs->block_timer = NULL; > +} > + > +static bool bdrv_throttling_is_iddle(BlockDriverState *bs) I don't quite understand the wording here; is "iddle" meant to be "idle"? > +{ > + int64_t delta = qemu_get_clock_ns(rt_clock) - bs->previous_leak; > + > + if (delta < BLOCK_IO_THROTTLE_PERIOD * 2) { > + return false; > + } > + > + /* iddle */ > + return true; > } > > +/* This callback is the timer in charge of making the leaky buckets leak */ > static void bdrv_block_timer(void *opaque) It would be more readable for me if you could rename it to bdrv_clock_timer_cb. > { > BlockDriverState *bs = opaque; > > + /* disable throttling time on iddle for economy purpose */ > + if (bdrv_throttling_is_iddle(bs)) { > + bdrv_block_timer_disable(bs); > + return; > + } > + > + /* rearm the timer */ > + qemu_mod_timer(bs->block_timer, > + qemu_get_clock_ns(vm_clock) + > + BLOCK_IO_THROTTLE_PERIOD); > + > + bs->must_leak = true; > qemu_co_enter_next(&bs->throttled_reqs); > } > > +static void bdrv_block_timer_enable(BlockDriverState *bs) > +{ > + if (bs->block_timer) { > + return; > + } > + > + bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs); > + bs->previous_leak = qemu_get_clock_ns(rt_clock); > + qemu_mod_timer(bs->block_timer, > + qemu_get_clock_ns(vm_clock) + > + BLOCK_IO_THROTTLE_PERIOD); > +} > + > void bdrv_io_limits_enable(BlockDriverState *bs) > { > qemu_co_queue_init(&bs->throttled_reqs); > - bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs); > bs->io_limits_enabled = true; > } > > @@ -165,15 +302,118 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs) > ||
io_limits->iops[BLOCK_IO_LIMIT_TOTAL]; > } > > +/* This function check if the correct bandwith threshold has been exceeded What does the "correct bandwidth threshold" mean? And s/bandwith/bandwidth/, series-wide. > + * > + * @is_write: true if the current IO is a write, false if it's a read > + * @ret: true if threshold has been exceeded else false > + */ > +static bool bdrv_is_bps_threshold_exceeded(BlockDriverState *bs, bool > is_write) > +{ > + /* limit is on total read + write bps : do the sum and compare with total > + * threshold > + */ > + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { > + int64_t bytes = bs->leaky_buckets.bytes[BLOCK_IO_LIMIT_READ] + > + bs->leaky_buckets.bytes[BLOCK_IO_LIMIT_WRITE]; > + return bs->io_limits.bps_threshold[BLOCK_IO_LIMIT_TOTAL] < bytes; > + } > + > + /* check wether the threshold corresponding to the current io type (read, > + * write) has been exceeded > + */ > + if (bs->io_limits.bps[is_write]) { It looks dangerous to use is_write (a bool) as an index into the array.
> + return bs->io_limits.bps_threshold[is_write] < > + bs->leaky_buckets.bytes[is_write]; > + } > + > + /* no limit */ > + return false; > +} > + > +/* This function check if the correct iops threshold has been exceeded > + * > + * @is_write: true if the current IO is a write, false if it's a read > + * @ret: true if threshold has been exceeded else false > + */ > +static bool bdrv_is_iops_threshold_exceeded(BlockDriverState *bs, bool > is_write) > +{ > + /* limit is on total read + write iops : do the sum and compare with > total > + * threshold > + */ > + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { > + double ios = bs->leaky_buckets.ios[BLOCK_IO_LIMIT_READ] + > + bs->leaky_buckets.ios[BLOCK_IO_LIMIT_WRITE]; > + return bs->io_limits.iops_threshold[BLOCK_IO_LIMIT_TOTAL] < ios; > + } > + > + /* check wether the threshold corresponding to the current io type (read, > + * write) has been exceeded > + */ > + if (bs->io_limits.iops[is_write]) { > + return bs->io_limits.iops_threshold[is_write] < > + bs->leaky_buckets.ios[is_write]; > + } > + > + /* no limit */ > + return false; > +} > + > +/* This function check if any bandwith or iops threshold has been exceeded > + * > + * @nb_sectors: the number of sectors of the current IO > + * @is_write: true if the current IO is a write, false if it's a read > + * @ret: true if any threshold has been exceeded else false > + */ > +static bool bdrv_is_any_threshold_exceeded(BlockDriverState *bs, int > nb_sectors, > + bool is_write) > +{ > + bool bps_ret, iops_ret; > + > + /* check if any bandwith or per IO threshold has been exceeded */ > + bps_ret = bdrv_is_bps_threshold_exceeded(bs, is_write); > + iops_ret = bdrv_is_iops_threshold_exceeded(bs, is_write); > + > + /* if so the IO will be blocked so do not account it and return true > + * also return false if a bdrv_drain_all is in progress > + */ > + if (!draining && (bps_ret || iops_ret)) { > + return true; > + } > + > + /* NOTE: the counter can go above the threshold when 
authorizing an IO. > + * At next call the code will punish the guest by blocking the > + * next IO until the counter has been decremented below the > threshold. > + * This way if a guest issue a jumbo IO bigger than the threshold > it > + * will have a chance no be authorized and will not result in a > guest > + * IO deadlock. > + */ > + > + /* the IO is authorized so do the accounting and return false */ > + bs->leaky_buckets.bytes[is_write] += (int64_t)nb_sectors * > + BDRV_SECTOR_SIZE; > + bs->leaky_buckets.ios[is_write]++; > + > + return false; > +} > + > static void bdrv_io_limits_intercept(BlockDriverState *bs, > bool is_write, int nb_sectors) > { > - int64_t wait_time = -1; > + /* enable block timer if needed when intercepting I/Os */ > + if (!bs->block_timer) { bdrv_block_timer_enable() already checks bs->block_timer, so this outer check is redundant. > + bdrv_block_timer_enable(bs); > + } > > + bdrv_leak_if_needed(bs); > + /* if some IOs are already queued because the bucket is full put the > current > + * IO at the end of the queue (FIFO) > + */ > if (!qemu_co_queue_empty(&bs->throttled_reqs)) { > qemu_co_queue_wait(&bs->throttled_reqs); > } > > + bdrv_leak_if_needed(bs); > + > /* In fact, we hope to keep each request's timing, in FIFO mode. The next > * throttled requests will not be dequeued until the current request is > * allowed to be serviced. So if the current request still exceeds the > @@ -181,13 +421,19 @@ static void bdrv_io_limits_intercept(BlockDriverState > *bs, > * be still in throttled_reqs queue.
> */ > > - while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) { > - qemu_mod_timer(bs->block_timer, > - wait_time + qemu_get_clock_ns(vm_clock)); > + /* if a threshold is exceeded the leaky bucket is full so the code put > the > + * IO in the throttle_reqs queue until the bucket has leaked enough to be > + * not full > + */ > + while (bdrv_is_any_threshold_exceeded(bs, nb_sectors, is_write)) { > + bdrv_leak_if_needed(bs); > qemu_co_queue_wait_insert_head(&bs->throttled_reqs); > + bdrv_leak_if_needed(bs); > } > > + bdrv_leak_if_needed(bs); > qemu_co_queue_next(&bs->throttled_reqs); > + bdrv_leak_if_needed(bs); > } > > /* check if the path starts with "<protocol>:" */ > @@ -1439,6 +1685,9 @@ void bdrv_drain_all(void) > BlockDriverState *bs; > bool busy; > > + /* tell the throttling code we are draining */ > + draining = true; > + > do { > busy = qemu_aio_wait(); > > @@ -1457,6 +1706,8 @@ void bdrv_drain_all(void) > assert(QLIST_EMPTY(&bs->tracked_requests)); > assert(qemu_co_queue_empty(&bs->throttled_reqs)); > } > + > + draining = false; > } > > /* make a BlockDriverState anonymous by removing from bdrv_state list. 
> @@ -1492,9 +1743,7 @@ static void bdrv_move_feature_fields(BlockDriverState > *bs_dest, > bs_dest->enable_write_cache = bs_src->enable_write_cache; > > /* i/o timing parameters */ > - bs_dest->slice_start = bs_src->slice_start; > - bs_dest->slice_end = bs_src->slice_end; > - bs_dest->slice_submitted = bs_src->slice_submitted; > + bs_dest->leaky_buckets = bs_src->leaky_buckets; > bs_dest->io_limits = bs_src->io_limits; > bs_dest->throttled_reqs = bs_src->throttled_reqs; > bs_dest->block_timer = bs_src->block_timer; > @@ -3551,169 +3800,6 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb) > acb->aiocb_info->cancel(acb); > } > > -/* block I/O throttling */ > -static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, > - bool is_write, double elapsed_time, uint64_t *wait) > -{ > - uint64_t bps_limit = 0; > - uint64_t extension; > - double bytes_limit, bytes_base, bytes_res; > - double slice_time, wait_time; > - > - if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { > - bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; > - } else if (bs->io_limits.bps[is_write]) { > - bps_limit = bs->io_limits.bps[is_write]; > - } else { > - if (wait) { > - *wait = 0; > - } > - > - return false; > - } > - > - slice_time = bs->slice_end - bs->slice_start; > - slice_time /= (NANOSECONDS_PER_SECOND); > - bytes_limit = bps_limit * slice_time; > - bytes_base = bs->slice_submitted.bytes[is_write]; > - if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { > - bytes_base += bs->slice_submitted.bytes[!is_write]; > - } > - > - /* bytes_base: the bytes of data which have been read/written; and > - * it is obtained from the history statistic info. > - * bytes_res: the remaining bytes of data which need to be read/written. > - * (bytes_base + bytes_res) / bps_limit: used to calcuate > - * the total time for completing reading/writting all data. 
> - */ > - bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE; > - > - if (bytes_base + bytes_res <= bytes_limit) { > - if (wait) { > - *wait = 0; > - } > - > - return false; > - } > - > - /* Calc approx time to dispatch */ > - wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time; > - > - /* When the I/O rate at runtime exceeds the limits, > - * bs->slice_end need to be extended in order that the current statistic > - * info can be kept until the timer fire, so it is increased and tuned > - * based on the result of experiment. > - */ > - extension = wait_time * NANOSECONDS_PER_SECOND; > - extension = DIV_ROUND_UP(extension, BLOCK_IO_SLICE_TIME) * > - BLOCK_IO_SLICE_TIME; > - bs->slice_end += extension; > - if (wait) { > - *wait = wait_time * NANOSECONDS_PER_SECOND; > - } > - > - return true; > -} > - > -static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, > - double elapsed_time, uint64_t *wait) > -{ > - uint64_t iops_limit = 0; > - double ios_limit, ios_base; > - double slice_time, wait_time; > - > - if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { > - iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; > - } else if (bs->io_limits.iops[is_write]) { > - iops_limit = bs->io_limits.iops[is_write]; > - } else { > - if (wait) { > - *wait = 0; > - } > - > - return false; > - } > - > - slice_time = bs->slice_end - bs->slice_start; > - slice_time /= (NANOSECONDS_PER_SECOND); > - ios_limit = iops_limit * slice_time; > - ios_base = bs->slice_submitted.ios[is_write]; > - if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { > - ios_base += bs->slice_submitted.ios[!is_write]; > - } > - > - if (ios_base + 1 <= ios_limit) { > - if (wait) { > - *wait = 0; > - } > - > - return false; > - } > - > - /* Calc approx time to dispatch, in seconds */ > - wait_time = (ios_base + 1) / iops_limit; > - if (wait_time > elapsed_time) { > - wait_time = wait_time - elapsed_time; > - } else { > - wait_time = 0; > - } > - > - /* Exceeded current slice, extend it by 
another slice time */ > - bs->slice_end += BLOCK_IO_SLICE_TIME; > - if (wait) { > - *wait = wait_time * NANOSECONDS_PER_SECOND; > - } > - > - return true; > -} > - > -static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, > - bool is_write, int64_t *wait) > -{ > - int64_t now, max_wait; > - uint64_t bps_wait = 0, iops_wait = 0; > - double elapsed_time; > - int bps_ret, iops_ret; > - > - now = qemu_get_clock_ns(vm_clock); > - if (now > bs->slice_end) { > - bs->slice_start = now; > - bs->slice_end = now + BLOCK_IO_SLICE_TIME; > - memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted)); > - } > - > - elapsed_time = now - bs->slice_start; > - elapsed_time /= (NANOSECONDS_PER_SECOND); > - > - bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors, > - is_write, elapsed_time, &bps_wait); > - iops_ret = bdrv_exceed_iops_limits(bs, is_write, > - elapsed_time, &iops_wait); > - if (bps_ret || iops_ret) { > - max_wait = bps_wait > iops_wait ? bps_wait : iops_wait; > - if (wait) { > - *wait = max_wait; > - } > - > - now = qemu_get_clock_ns(vm_clock); > - if (bs->slice_end < now + max_wait) { > - bs->slice_end = now + max_wait; > - } > - > - return true; > - } > - > - if (wait) { > - *wait = 0; > - } > - > - bs->slice_submitted.bytes[is_write] += (int64_t)nb_sectors * > - BDRV_SECTOR_SIZE; > - bs->slice_submitted.ios[is_write]++; > - > - return false; > -} > - > /**************************************************************/ > /* async block device emulation */ > > diff --git a/blockdev.c b/blockdev.c > index c5abd65..491e4d0 100644 > --- a/blockdev.c > +++ b/blockdev.c > @@ -280,10 +280,25 @@ static int parse_block_error_action(const char *buf, > bool is_read) > } > } > > +static bool check_io_limit(int64_t limit) > +{ > + if (!limit) { > + return false; > + } > + > + if (limit < (THROTTLE_HZ * 2)) { > + return true; > + } > + > + return false; > +} > + > static bool do_check_io_limits(BlockIOLimit *io_limits, Error **errp) > { > bool bps_flag; > bool 
iops_flag; > + bool bps_threshold_flag; > + bool iops_threshold_flag; > > assert(io_limits); > > @@ -299,13 +314,30 @@ static bool do_check_io_limits(BlockIOLimit *io_limits, > Error **errp) > return false; > } > > - if (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] < 0 || > - io_limits->bps[BLOCK_IO_LIMIT_WRITE] < 0 || > - io_limits->bps[BLOCK_IO_LIMIT_READ] < 0 || > - io_limits->iops[BLOCK_IO_LIMIT_TOTAL] < 0 || > - io_limits->iops[BLOCK_IO_LIMIT_WRITE] < 0 || > - io_limits->iops[BLOCK_IO_LIMIT_READ] < 0) { > - error_setg(errp, "bps and iops values must be 0 or greater"); > + bps_threshold_flag = > + (io_limits->bps_threshold[BLOCK_IO_LIMIT_TOTAL] != 0) > + && ((io_limits->bps_threshold[BLOCK_IO_LIMIT_READ] != 0) > + || (io_limits->bps_threshold[BLOCK_IO_LIMIT_WRITE] != 0)); > + iops_threshold_flag = > + (io_limits->iops_threshold[BLOCK_IO_LIMIT_TOTAL] != 0) > + && ((io_limits->iops_threshold[BLOCK_IO_LIMIT_READ] != 0) > + || (io_limits->iops_threshold[BLOCK_IO_LIMIT_WRITE] != 0)); > + if (bps_threshold_flag || iops_threshold_flag) { > + error_setg(errp, "bps_threshold(iops_threshold) and " > + "bps_rd_threshold/bps_wr_threshold" > + "(iops_rd_threshold/iops_wr_threshold) " > + "cannot be used at the same time"); > + return false; > + } > + > + if (check_io_limit(io_limits->bps[BLOCK_IO_LIMIT_TOTAL]) || > + check_io_limit(io_limits->bps[BLOCK_IO_LIMIT_WRITE]) || > + check_io_limit(io_limits->bps[BLOCK_IO_LIMIT_READ]) || > + check_io_limit(io_limits->iops[BLOCK_IO_LIMIT_TOTAL]) || > + check_io_limit(io_limits->iops[BLOCK_IO_LIMIT_WRITE]) || > + check_io_limit(io_limits->iops[BLOCK_IO_LIMIT_READ])) { > + error_setg(errp, "bps and iops values must be %i or greater", > + THROTTLE_HZ * 2); > return false; > } > > @@ -497,6 +529,18 @@ DriveInfo *drive_init(QemuOpts *all_opts, > BlockInterfaceType block_default_type) > qemu_opt_get_number(opts, "iops_rd", 0); > io_limits.iops[BLOCK_IO_LIMIT_WRITE] = > qemu_opt_get_number(opts, "iops_wr", 0); > + 
io_limits.bps_threshold[BLOCK_IO_LIMIT_TOTAL] = > + io_limits.bps[BLOCK_IO_LIMIT_TOTAL] / THROTTLE_HZ; > + io_limits.bps_threshold[BLOCK_IO_LIMIT_READ] = > + io_limits.bps[BLOCK_IO_LIMIT_READ] / THROTTLE_HZ; > + io_limits.bps_threshold[BLOCK_IO_LIMIT_WRITE] = > + io_limits.bps[BLOCK_IO_LIMIT_WRITE] / THROTTLE_HZ; > + io_limits.iops_threshold[BLOCK_IO_LIMIT_TOTAL] = > + io_limits.iops[BLOCK_IO_LIMIT_TOTAL] / > THROTTLE_HZ; > + io_limits.iops_threshold[BLOCK_IO_LIMIT_READ] = > + io_limits.iops[BLOCK_IO_LIMIT_READ] / THROTTLE_HZ; > + io_limits.iops_threshold[BLOCK_IO_LIMIT_WRITE] = > + io_limits.iops[BLOCK_IO_LIMIT_WRITE] / > THROTTLE_HZ; > > if (!do_check_io_limits(&io_limits, &error)) { > error_report("%s", error_get_pretty(error)); > @@ -1198,6 +1242,12 @@ void qmp_block_set_io_throttle(const char *device, > int64_t bps, int64_t bps_rd, > io_limits.iops[BLOCK_IO_LIMIT_TOTAL]= iops; > io_limits.iops[BLOCK_IO_LIMIT_READ] = iops_rd; > io_limits.iops[BLOCK_IO_LIMIT_WRITE]= iops_wr; > + io_limits.bps_threshold[BLOCK_IO_LIMIT_TOTAL] = bps / THROTTLE_HZ; > + io_limits.bps_threshold[BLOCK_IO_LIMIT_READ] = bps_rd / THROTTLE_HZ; > + io_limits.bps_threshold[BLOCK_IO_LIMIT_WRITE] = bps_wr / THROTTLE_HZ; > + io_limits.iops_threshold[BLOCK_IO_LIMIT_TOTAL] = iops / THROTTLE_HZ; > + io_limits.iops_threshold[BLOCK_IO_LIMIT_READ] = iops_rd / THROTTLE_HZ; > + io_limits.iops_threshold[BLOCK_IO_LIMIT_WRITE] = iops_wr / THROTTLE_HZ; > > if (!do_check_io_limits(&io_limits, errp)) { > return; > @@ -1209,11 +1259,10 @@ void qmp_block_set_io_throttle(const char *device, > int64_t bps, int64_t bps_rd, > bdrv_io_limits_enable(bs); > } else if (bs->io_limits_enabled && !bdrv_io_limits_enabled(bs)) { > bdrv_io_limits_disable(bs); > - } else { > - if (bs->block_timer) { > - qemu_mod_timer(bs->block_timer, qemu_get_clock_ns(vm_clock)); > - } > } > + > + /* reset leaky bucket to get the system in a known state */ > + memset(&bs->leaky_buckets, 0, sizeof(bs->leaky_buckets)); > } > > int 
do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data) > diff --git a/include/block/block_int.h b/include/block/block_int.h > index c6ac871..e32ad1f 100644 > --- a/include/block/block_int.h > +++ b/include/block/block_int.h > @@ -43,8 +43,9 @@ > #define BLOCK_IO_LIMIT_WRITE 1 > #define BLOCK_IO_LIMIT_TOTAL 2 > > -#define BLOCK_IO_SLICE_TIME 100000000 > #define NANOSECONDS_PER_SECOND 1000000000.0 > +#define THROTTLE_HZ 1 > +#define BLOCK_IO_THROTTLE_PERIOD (NANOSECONDS_PER_SECOND / THROTTLE_HZ) > > #define BLOCK_OPT_SIZE "size" > #define BLOCK_OPT_ENCRYPT "encryption" > @@ -73,11 +74,13 @@ typedef struct BdrvTrackedRequest { > typedef struct BlockIOLimit { > int64_t bps[3]; > int64_t iops[3]; > + int64_t bps_threshold[3]; > + int64_t iops_threshold[3]; > } BlockIOLimit; > > typedef struct BlockIOBaseValue { > - uint64_t bytes[2]; > - uint64_t ios[2]; > + int64_t bytes[2]; > + double ios[2]; > } BlockIOBaseValue; > > struct BlockDriver { > @@ -264,10 +267,10 @@ struct BlockDriverState { > unsigned int copy_on_read_in_flight; > > /* the time for latest disk I/O */ > - int64_t slice_start; > - int64_t slice_end; > BlockIOLimit io_limits; > - BlockIOBaseValue slice_submitted; > + BlockIOBaseValue leaky_buckets; > + int64_t previous_leak; > + bool must_leak; > CoQueue throttled_reqs; > QEMUTimer *block_timer; > bool io_limits_enabled; > -- > 1.7.10.4 > > -- Fam