On 1/19/21 8:56 PM, Dongdong Tao wrote:
> From: dongdong tao <dongdong....@canonical.com>
> 
> Current way to calculate the writeback rate only considered the
> dirty sectors, this usually works fine when the fragmentation
> is not high, but it will give us unreasonable small rate when
> we are under a situation that very few dirty sectors consumed
> a lot dirty buckets. In some case, the dirty bucekts can reached
> to CUTOFF_WRITEBACK_SYNC while the dirty data(sectors) not even
> reached the writeback_percent, the writeback rate will still
> be the minimum value (4k), thus it will cause all the writes to be
> stucked in a non-writeback mode because of the slow writeback.
> 
> We accelerate the rate in 3 stages with different aggressiveness,
> the first stage starts when dirty buckets percent reach above
> BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW (50), the second is
> BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID (57), the third is
> BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH (64). By default
> the first stage tries to writeback the amount of dirty data
> in one bucket (on average) in (1 / (dirty_buckets_percent - 50)) second,
> the second stage tries to writeback the amount of dirty data in one bucket
> in (1 / (dirty_buckets_percent - 57)) * 100 millisecond, the third
> stage tries to writeback the amount of dirty data in one bucket in
> (1 / (dirty_buckets_percent - 64)) millisecond. It is ok to be very
> aggressive in the last stage, as it is our last chance to pull it back.
> 
> Option writeback_consider_fragment to control whether we want
> this feature to be on or off, it's on by default.
> 
> Lastly, below is the performance data for all the testing result,
> including the data from production env:
> https://docs.google.com/document/d/
> 1AmbIEa_2MhB9bqhC3rfga9tp7n9YX9PLn0jSUxscVW0/edit?usp=sharing
> 
> Signed-off-by: dongdong tao <dongdong....@canonical.com>

Hi Dongdong,

Now overall the patch is OK for me, just minor comments  and please
check them inline.


> ---
>  drivers/md/bcache/bcache.h    |  4 ++++
>  drivers/md/bcache/sysfs.c     | 22 ++++++++++++++++++
>  drivers/md/bcache/writeback.c | 42 +++++++++++++++++++++++++++++++++++
>  drivers/md/bcache/writeback.h |  4 ++++
>  4 files changed, 72 insertions(+)
> 
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index 1d57f48307e6..d7a84327b7f1 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -373,6 +373,7 @@ struct cached_dev {
>       unsigned int            partial_stripes_expensive:1;
>       unsigned int            writeback_metadata:1;
>       unsigned int            writeback_running:1;
> +     unsigned int            writeback_consider_fragment:1;
>       unsigned char           writeback_percent;
>       unsigned int            writeback_delay;
>  
> @@ -385,6 +386,9 @@ struct cached_dev {
>       unsigned int            writeback_rate_update_seconds;
>       unsigned int            writeback_rate_i_term_inverse;
>       unsigned int            writeback_rate_p_term_inverse;
> +     unsigned int            writeback_rate_fp_term_low;
> +     unsigned int            writeback_rate_fp_term_mid;
> +     unsigned int            writeback_rate_fp_term_high;
>       unsigned int            writeback_rate_minimum;
>  
>       enum stop_on_failure    stop_when_cache_set_failed;
> diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
> index 00a520c03f41..136899beadba 100644
> --- a/drivers/md/bcache/sysfs.c
> +++ b/drivers/md/bcache/sysfs.c
> @@ -117,10 +117,14 @@ rw_attribute(writeback_running);
>  rw_attribute(writeback_percent);
>  rw_attribute(writeback_delay);
>  rw_attribute(writeback_rate);
> +rw_attribute(writeback_consider_fragment);
>  
>  rw_attribute(writeback_rate_update_seconds);
>  rw_attribute(writeback_rate_i_term_inverse);
>  rw_attribute(writeback_rate_p_term_inverse);
> +rw_attribute(writeback_rate_fp_term_low);
> +rw_attribute(writeback_rate_fp_term_mid);
> +rw_attribute(writeback_rate_fp_term_high);
>  rw_attribute(writeback_rate_minimum);
>  read_attribute(writeback_rate_debug);
>  
> @@ -195,6 +199,7 @@ SHOW(__bch_cached_dev)
>       var_printf(bypass_torture_test, "%i");
>       var_printf(writeback_metadata,  "%i");
>       var_printf(writeback_running,   "%i");
> +     var_printf(writeback_consider_fragment, "%i");
>       var_print(writeback_delay);
>       var_print(writeback_percent);
>       sysfs_hprint(writeback_rate,
> @@ -205,6 +210,9 @@ SHOW(__bch_cached_dev)
>       var_print(writeback_rate_update_seconds);
>       var_print(writeback_rate_i_term_inverse);
>       var_print(writeback_rate_p_term_inverse);
> +     var_print(writeback_rate_fp_term_low);
> +     var_print(writeback_rate_fp_term_mid);
> +     var_print(writeback_rate_fp_term_high);
>       var_print(writeback_rate_minimum);
>  
>       if (attr == &sysfs_writeback_rate_debug) {
> @@ -303,6 +311,7 @@ STORE(__cached_dev)
>       sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test);
>       sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata);
>       sysfs_strtoul_bool(writeback_running, dc->writeback_running);
> +     sysfs_strtoul_bool(writeback_consider_fragment, 
> dc->writeback_consider_fragment);
>       sysfs_strtoul_clamp(writeback_delay, dc->writeback_delay, 0, UINT_MAX);
>  
>       sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent,
> @@ -331,6 +340,15 @@ STORE(__cached_dev)
>       sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
>                           dc->writeback_rate_p_term_inverse,
>                           1, UINT_MAX);
> +     sysfs_strtoul_clamp(writeback_rate_fp_term_low,
> +                         dc->writeback_rate_fp_term_low,
> +                         1, UINT_MAX);
> +     sysfs_strtoul_clamp(writeback_rate_fp_term_mid,
> +                         dc->writeback_rate_fp_term_mid,
> +                         1, UINT_MAX);
> +     sysfs_strtoul_clamp(writeback_rate_fp_term_high,
> +                         dc->writeback_rate_fp_term_high,
> +                         1, UINT_MAX);


Here you should make sure the input value should always be in order,
  low < mid < high

Otherwise the calculation might be messed up.


>       sysfs_strtoul_clamp(writeback_rate_minimum,
>                           dc->writeback_rate_minimum,
>                           1, UINT_MAX);
> @@ -499,9 +517,13 @@ static struct attribute *bch_cached_dev_files[] = {
>       &sysfs_writeback_delay,
>       &sysfs_writeback_percent,
>       &sysfs_writeback_rate,
> +     &sysfs_writeback_consider_fragment,
>       &sysfs_writeback_rate_update_seconds,
>       &sysfs_writeback_rate_i_term_inverse,
>       &sysfs_writeback_rate_p_term_inverse,
> +     &sysfs_writeback_rate_fp_term_low,
> +     &sysfs_writeback_rate_fp_term_mid,
> +     &sysfs_writeback_rate_fp_term_high,
>       &sysfs_writeback_rate_minimum,
>       &sysfs_writeback_rate_debug,
>       &sysfs_io_errors,
> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> index a129e4d2707c..9d440166c6bf 100644
> --- a/drivers/md/bcache/writeback.c
> +++ b/drivers/md/bcache/writeback.c
> @@ -88,6 +88,44 @@ static void __update_writeback_rate(struct cached_dev *dc)
>       int64_t integral_scaled;
>       uint32_t new_rate;
>  
> +     /*
> +      * We need to consider the number of dirty buckets as well
> +      * when calculating the proportional_scaled, Otherwise we might
> +      * have an unreasonable small writeback rate at a highly fragmented 
> situation
> +      * when very few dirty sectors consumed a lot dirty buckets, the
> +      * worst case is when dirty buckets reached cutoff_writeback_sync and
> +      * dirty data is still not even reached to writeback percent, so the 
> rate
> +      * still will be at the minimum value, which will cause the write
> +      * stuck at a non-writeback mode.
> +      */
> +     struct cache_set *c = dc->disk.c;
> +
> +     int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets;
> +
> +     if (dc->writeback_consider_fragment &&
> +             c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && 
> dirty > 0) {
> +             int64_t fragment =
> +                     div_s64((dirty_buckets *  c->cache->sb.bucket_size), 
> dirty);
> +             int64_t fp_term;
> +             int64_t fps;
> +
> +             if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) 
> {
> +                     fp_term = dc->writeback_rate_fp_term_low *
> +                     (c->gc_stats.in_use - 
> BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW);
> +             } else if (c->gc_stats.in_use <= 
> BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) {
> +                     fp_term = dc->writeback_rate_fp_term_mid *
> +                     (c->gc_stats.in_use - 
> BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID);
> +             } else {
> +                     fp_term = dc->writeback_rate_fp_term_high *
> +                     (c->gc_stats.in_use - 
> BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH);
> +             }
> +             fps = div_s64(dirty, dirty_buckets) * fp_term;
> +             if (fragment > 3 && fps > proportional_scaled) {
> +                     //Only overrite the p when fragment > 3

The above "//" style is not recommended as bcache coding style.


> +                     proportional_scaled = fps;
> +             }
> +     }
> +
>       if ((error < 0 && dc->writeback_rate_integral > 0) ||
>           (error > 0 && time_before64(local_clock(),
>                        dc->writeback_rate.next + NSEC_PER_MSEC))) {
> @@ -977,6 +1015,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
>  
>       dc->writeback_metadata          = true;
>       dc->writeback_running           = false;
> +     dc->writeback_consider_fragment = true;
>       dc->writeback_percent           = 10;
>       dc->writeback_delay             = 30;
>       atomic_long_set(&dc->writeback_rate.rate, 1024);
> @@ -984,6 +1023,9 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
>  
>       dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
>       dc->writeback_rate_p_term_inverse = 40;
> +     dc->writeback_rate_fp_term_low = 1;
> +     dc->writeback_rate_fp_term_mid = 10;
> +     dc->writeback_rate_fp_term_high = 1000;

Could you please to explain a bit how the above 3 numbers are decided ?



>       dc->writeback_rate_i_term_inverse = 10000;
>  
>       WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
> diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
> index 3f1230e22de0..02b2f9df73f6 100644
> --- a/drivers/md/bcache/writeback.h
> +++ b/drivers/md/bcache/writeback.h
> @@ -16,6 +16,10 @@
>  
>  #define BCH_AUTO_GC_DIRTY_THRESHOLD  50
>  
> +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW 50
> +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID 57
> +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH 64
> +
>  #define BCH_DIRTY_INIT_THRD_MAX      64
>  /*
>   * 14 (16384ths) is chosen here as something that each backing device
> 

Thanks in advance.

Coly Li

Reply via email to