This is the core of the fsio-throttle controller: it defines the
interface to the cgroup subsystem and implements the I/O measurement and
throttling logic.

Signed-off-by: Andrea Righi <righi.and...@gmail.com>
---
 include/linux/cgroup_subsys.h |   4 +
 include/linux/fsio-throttle.h |  43 +++
 init/Kconfig                  |  11 +
 kernel/cgroup/Makefile        |   1 +
 kernel/cgroup/fsio-throttle.c | 501 ++++++++++++++++++++++++++++++++++
 5 files changed, 560 insertions(+)
 create mode 100644 include/linux/fsio-throttle.h
 create mode 100644 kernel/cgroup/fsio-throttle.c

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index acb77dcff3b4..33beb70c0eca 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -61,6 +61,10 @@ SUBSYS(pids)
 SUBSYS(rdma)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_FSIO_THROTTLE)
+SUBSYS(fsio)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
diff --git a/include/linux/fsio-throttle.h b/include/linux/fsio-throttle.h
new file mode 100644
index 000000000000..3a46df712475
--- /dev/null
+++ b/include/linux/fsio-throttle.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __FSIO_THROTTLE_H__
+#define __FSIO_THROTTLE_H__
+
+#include <linux/fs.h>
+#include <linux/genhd.h>
+
+#ifdef CONFIG_BLOCK
+/*
+ * Build a dev_t from the major number of the bdev inode's i_rdev and the
+ * first minor of the gendisk behind @bdev. Returns 0 when @bdev is NULL.
+ */
+static inline dev_t bdev_to_dev(struct block_device *bdev)
+{
+       if (!bdev)
+               return 0;
+
+       return MKDEV(MAJOR(bdev->bd_inode->i_rdev),
+                    bdev->bd_disk->first_minor);
+}
+
+/*
+ * Return the block device of the superblock that owns @mapping's host
+ * inode, or NULL when the mapping has no host inode or that superblock has
+ * no block device.
+ */
+static inline struct block_device *as_to_bdev(struct address_space *mapping)
+{
+       struct inode *host = mapping->host;
+
+       if (!host)
+               return NULL;
+
+       return host->i_sb->s_bdev;
+}
+#else /* CONFIG_BLOCK */
+/*
+ * !CONFIG_BLOCK stub: always reports device 0.
+ *
+ * Must be "static inline" like its CONFIG_BLOCK counterpart: a plain
+ * static function in a header is emitted (and flagged as unused) in every
+ * file that includes it.
+ */
+static inline dev_t bdev_to_dev(struct block_device *bdev)
+{
+       return 0;
+}
+
+/* !CONFIG_BLOCK stub: a mapping never resolves to a block device. */
+static inline struct block_device *as_to_bdev(struct address_space *mapping)
+{
+       return NULL;
+}
+#endif /* CONFIG_BLOCK */
+
+#ifdef CONFIG_CGROUP_FSIO_THROTTLE
+/*
+ * fsio_throttle - charge I/O to the current task's fsio cgroup
+ * @dev: device the I/O is directed to; 0 is ignored
+ * @bytes: amount of I/O to charge
+ * @state: task state used while sleeping; 0 only accounts, never sleeps
+ *
+ * Returns the sleep time computed for this request (in jiffies), or 0 when
+ * the caller was not throttled.
+ */
+int fsio_throttle(dev_t dev, ssize_t bytes, int state);
+#else /* CONFIG_CGROUP_FSIO_THROTTLE */
+/* Controller compiled out: nothing is accounted and nobody is throttled. */
+static inline int
+fsio_throttle(dev_t dev, ssize_t bytes, int state)
+{
+       return 0;
+}
+#endif /* CONFIG_CGROUP_FSIO_THROTTLE */
+
+#endif /* __FSIO_THROTTLE_H__ */
diff --git a/init/Kconfig b/init/Kconfig
index d47cb77a220e..95d7342801eb 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -775,6 +775,17 @@ config CGROUP_WRITEBACK
        depends on MEMCG && BLK_CGROUP
        default y
 
+config CGROUP_FSIO_THROTTLE
+       bool "Filesystem I/O throttling controller"
+       default n
+       depends on BLOCK
+       help
+         This option enables filesystem I/O throttling infrastructure.
+
+         This allows reads and writes to be throttled at the filesystem
+         level, without introducing I/O lock contention or priority
+         inversion problems.
+
 menuconfig CGROUP_SCHED
        bool "CPU controller"
        default n
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index bfcdae896122..12de828b36cd 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -2,6 +2,7 @@
 obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o
 
 obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
+obj-$(CONFIG_CGROUP_FSIO_THROTTLE) += fsio-throttle.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
 obj-$(CONFIG_CGROUP_RDMA) += rdma.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
diff --git a/kernel/cgroup/fsio-throttle.c b/kernel/cgroup/fsio-throttle.c
new file mode 100644
index 000000000000..46f3ffd4015b
--- /dev/null
+++ b/kernel/cgroup/fsio-throttle.c
@@ -0,0 +1,501 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * fsio-throttle.c - I/O cgroup controller
+ *
+ * Copyright (C) 2019 Andrea Righi <righi.and...@gmail.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/jiffies.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/moduleparam.h>
+#include <linux/genhd.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/signal.h>
+#include <linux/cgroup.h>
+#include <linux/fsio-throttle.h>
+
+/* Size helpers: each step multiplies its argument by another 1024. */
+#define KB(x)   ((x) * 1024)
+#define MB(x)   (KB(KB(x)))
+#define GB(x)   (MB(KB(x)))
+
+/* Non-zero: kernel threads are throttled like any other task. */
+static int throttle_kernel_threads __read_mostly;
+module_param(throttle_kernel_threads, int, 0644);
+MODULE_PARM_DESC(throttle_kernel_threads,
+                 "enable/disable I/O throttling for kernel threads");
+
+/*
+ * Sleeps shorter than this are not enforced, and timer expirations are
+ * rounded up to a multiple of it.
+ */
+static int throttle_timeslice_ms __read_mostly = 250;
+module_param(throttle_timeslice_ms, int, 0644);
+MODULE_PARM_DESC(throttle_timeslice_ms,
+                 "throttling time slice (default 250ms)");
+
+/* Upper bound applied to a single enforced sleep. */
+static int throttle_timeframe_ms __read_mostly = 2000;
+module_param(throttle_timeframe_ms, int, 0644);
+MODULE_PARM_DESC(throttle_timeframe_ms,
+                 "maximum sleep time enforced (default 2000ms)");
+
+/* Per-cgroup state of the fsio controller. */
+struct iothrottle {
+       struct cgroup_subsys_state css;
+       /* per-device limits: list of struct iothrottle_node */
+       struct list_head list;
+       /* protect the list of iothrottle_node elements (list) */
+       struct mutex lock;
+       /* throttled tasks of this cgroup sleep on this queue */
+       wait_queue_head_t wait;
+       /* runs iothrottle_timer_wakeup(), which wakes everyone on wait */
+       struct timer_list timer;
+       /* set when the css goes offline; sleepers then stop waiting */
+       bool timer_cancel;
+       /* protect the wait queue elements */
+       spinlock_t wait_lock;
+};
+
+/*
+ * Token bucket for one device. Amounts use the unit passed to
+ * iothrottle_limit_sleep() (bytes, for fsio_throttle() callers).
+ */
+struct iothrottle_limit {
+       /* tokens available; stored unsigned but read back as signed */
+       unsigned long long usage;
+       /* cap on the number of tokens that can accumulate */
+       unsigned long long bucket_size;
+       /* refill rate, in tokens per second */
+       unsigned long long limit;
+       /* 64-bit jiffies at init or at the last iothrottle_limit_sleep() */
+       unsigned long long timestamp;
+       /* protect all of the above */
+       spinlock_t lock;
+};
+
+/* Limit attached to one device inside a cgroup. */
+struct iothrottle_node {
+       /* linkage in iothrottle->list (traversed under RCU) */
+       struct list_head node;
+       /* used by call_rcu() to defer freeing a replaced/removed node */
+       struct rcu_head rcu;
+       /* token bucket state for this device */
+       struct iothrottle_limit bw;
+       /* device this limit applies to */
+       dev_t dev;
+};
+
+/* True when the fsio cgroup subsystem is not enabled. */
+static inline bool iothrottle_disabled(void)
+{
+       if (cgroup_subsys_enabled(fsio_cgrp_subsys))
+               return false;
+
+       return true;
+}
+
+/* Map a css to the iothrottle that embeds it; a NULL css maps to NULL. */
+static struct iothrottle *css_to_iothrottle(struct cgroup_subsys_state *css)
+{
+       if (!css)
+               return NULL;
+
+       return container_of(css, struct iothrottle, css);
+}
+
+/*
+ * Return the iothrottle of the fsio cgroup @p belongs to, or NULL when @p
+ * is NULL.
+ *
+ * File-local helper: it has no prototype in any header, so it is static
+ * rather than a global symbol.
+ */
+static struct iothrottle *task_to_iothrottle(struct task_struct *p)
+{
+       if (unlikely(!p))
+               return NULL;
+       return css_to_iothrottle(task_css(p, fsio_cgrp_id));
+}
+
+/* Jiffies elapsed since @res->timestamp was last set. */
+static inline unsigned long long
+iothrottle_limit_delta_t(struct iothrottle_limit *res)
+{
+       long long now = (long long)get_jiffies_64();
+       long long then = (long long)res->timestamp;
+
+       return now - then;
+}
+
+/*
+ * Set up @res with the given rate and bucket size: the bucket starts with
+ * zero usage and the current time as its reference timestamp.
+ */
+static void iothrottle_limit_init(struct iothrottle_limit *res,
+                                unsigned long long limit,
+                                unsigned long long bucket_size)
+{
+       spin_lock_init(&res->lock);
+
+       res->usage = 0;
+       res->limit = limit;
+       res->bucket_size = bucket_size;
+       res->timestamp = get_jiffies_64();
+}
+
+/*
+ * Charge @size to the bucket, refill it for the time elapsed since the
+ * previous call, and work out how long the caller should wait.
+ *
+ * The arithmetic is done in "tokens * MSEC_PER_SEC" so that the elapsed
+ * milliseconds can be multiplied directly by the per-second rate.
+ *
+ * Returns the number of jiffies needed to pay back the debt at the
+ * configured rate, or 0 when the bucket is not in debt.
+ */
+static unsigned long long
+iothrottle_limit_sleep(struct iothrottle_limit *res, unsigned long long size)
+{
+       unsigned long long delta, limit;
+       long long tok;
+       unsigned long flags;
+
+       spin_lock_irqsave(&res->lock, flags);
+       limit = res->limit;
+       /* usage may wrap below zero here; it is read back as signed */
+       res->usage -= size;
+       delta = jiffies_to_msecs(iothrottle_limit_delta_t(res));
+       res->timestamp = get_jiffies_64();
+       tok = (long long)res->usage * MSEC_PER_SEC;
+       if (delta) {
+               long long max = (long long)res->bucket_size * MSEC_PER_SEC;
+
+               tok += delta * limit;
+               tok = min_t(long long, tok, max);
+               res->usage = (unsigned long long)div_s64(tok, MSEC_PER_SEC);
+       }
+       spin_unlock_irqrestore(&res->lock, flags);
+
+       if (tok >= 0)
+               return 0;
+
+       /*
+        * The rate is a 64-bit value: div_u64() takes a 32-bit divisor and
+        * would silently truncate it (down to 0 for multiples of 4GB/s).
+        */
+       return msecs_to_jiffies(div64_u64(-tok, limit));
+}
+
+/* Clear the bucket usage, dropping any accumulated tokens or debt. */
+static void iothrottle_limit_reset(struct iothrottle_limit *res)
+{
+       unsigned long irqflags;
+
+       spin_lock_irqsave(&res->lock, irqflags);
+       res->usage = 0;
+       spin_unlock_irqrestore(&res->lock, irqflags);
+}
+
+/* Allocation size of one node; selects kmalloc vs vmalloc in alloc/free. */
+static inline int iothrottle_node_size(void)
+{
+       return sizeof(struct iothrottle_node);
+}
+
+/*
+ * Allocate a zero-filled node. Nodes smaller than a page come from the
+ * slab allocator using @flags; larger ones come from vmalloc space, where
+ * @flags is not used. Returns NULL on allocation failure.
+ */
+static struct iothrottle_node *iothrottle_node_alloc(gfp_t flags)
+{
+       int size = iothrottle_node_size();
+
+       /* zeroing allocators replace the open-coded alloc + memset */
+       if (size < PAGE_SIZE)
+               return kzalloc(size, flags);
+
+       return vzalloc(size);
+}
+
+/* Release a node with the allocator iothrottle_node_alloc() picked. */
+static void iothrottle_node_free(struct iothrottle_node *n)
+{
+       if (iothrottle_node_size() >= PAGE_SIZE)
+               vfree(n);
+       else
+               kfree(n);
+}
+
+/*
+ * Find the node of @iot that matches @dev, or NULL if there is none.
+ *
+ * Callers in this file invoke it either inside an RCU read-side section
+ * or with iot->lock held.
+ */
+static struct iothrottle_node *
+iothrottle_node_search(const struct iothrottle *iot, dev_t dev)
+{
+       struct iothrottle_node *pos, *found = NULL;
+
+       list_for_each_entry_rcu(pos, &iot->list, node) {
+               if (pos->dev == dev) {
+                       found = pos;
+                       break;
+               }
+       }
+
+       return found;
+}
+
+/* RCU callback: free the node that embeds @rp. */
+static void iothrottle_node_reclaim(struct rcu_head *rp)
+{
+       iothrottle_node_free(container_of(rp, struct iothrottle_node, rcu));
+}
+
+/*
+ * Parse a "MAJOR:MINOR LIMIT BUCKET" string.
+ *
+ * LIMIT and BUCKET are scaled with MB() before being stored in @io_limit
+ * and @bucket_size; @dev receives MKDEV(MAJOR, MINOR). The outputs are
+ * only written on success. @nbytes is not used.
+ *
+ * Returns 0 on success, -EINVAL when the string does not carry all four
+ * fields, -ENODEV when get_gendisk() finds no disk for the device or
+ * reports a non-zero partition number.
+ */
+static int iothrottle_parse_args(char *buf, size_t nbytes,
+                                dev_t *dev,
+                                unsigned long long *io_limit,
+                                unsigned long long *bucket_size)
+{
+       unsigned long long limit_mb, bucket_mb;
+       unsigned int major, minor;
+       struct gendisk *disk;
+       int part, err;
+
+       if (sscanf(buf, "%u:%u %llu %llu",
+                  &major, &minor, &limit_mb, &bucket_mb) != 4)
+               return -EINVAL;
+
+       disk = get_gendisk(MKDEV(major, minor), &part);
+       if (!disk)
+               return -ENODEV;
+
+       if (part) {
+               err = -ENODEV;
+       } else {
+               *dev = MKDEV(major, minor);
+               *io_limit = MB(limit_mb);
+               *bucket_size = MB(bucket_mb);
+               err = 0;
+       }
+       /* the disk was only needed for validation */
+       put_disk_and_module(disk);
+
+       return err;
+}
+
+/*
+ * Write handler of the per-cgroup limit file.
+ *
+ * Input is "MAJOR:MINOR LIMIT BUCKET" (see iothrottle_parse_args()).
+ * Depending on whether the device already has a node and whether LIMIT is
+ * zero, the rule is inserted, replaced, deleted, or the write is a no-op.
+ * A node taken off the list is freed after an RCU grace period.
+ *
+ * Returns @nbytes on success or a negative errno.
+ */
+static ssize_t iothrottle_write(struct kernfs_open_file *of,
+                               char *buffer, size_t nbytes, loff_t off)
+{
+       struct iothrottle *iot;
+       struct iothrottle_node *n, *newn = NULL;
+       unsigned long long io_limit, bucket_size;
+       dev_t dev;
+       char *buf;
+       int ret;
+
+       /*
+        * Parse from a private copy of the input, including the byte at
+        * buffer[nbytes].
+        */
+       buf = kmemdup(buffer, nbytes + 1, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+
+       ret = iothrottle_parse_args(buf, nbytes, &dev, &io_limit, &bucket_size);
+       if (ret)
+               goto out_free;
+
+       /* allocate up front: no allocation happens under iot->lock */
+       newn = iothrottle_node_alloc(GFP_KERNEL);
+       if (!newn) {
+               ret = -ENOMEM;
+               goto out_free;
+       }
+       newn->dev = dev;
+       iothrottle_limit_init(&newn->bw, io_limit, bucket_size);
+
+       iot = css_to_iothrottle(of_css(of));
+       if (WARN_ON_ONCE(!iot)) {
+               /* ret is still 0 here: do not report a zero-length write */
+               ret = -ENODEV;
+               goto out_free;
+       }
+       mutex_lock(&iot->lock);
+       n = iothrottle_node_search(iot, dev);
+       if (!n) {
+               /* Insert new node */
+               if (io_limit) {
+                       list_add_rcu(&newn->node, &iot->list);
+                       newn = NULL;
+               }
+       } else if (!io_limit) {
+               /* Delete existing node */
+               list_del_rcu(&n->node);
+       } else {
+               /* Update existing node */
+               list_replace_rcu(&n->node, &newn->node);
+               newn = NULL;
+       }
+       mutex_unlock(&iot->lock);
+       /* the old node may still be seen by RCU readers: defer the free */
+       if (n)
+               call_rcu(&n->rcu, iothrottle_node_reclaim);
+       ret = nbytes;
+out_free:
+       /* newn is non-NULL only when it was not linked into the list */
+       if (newn)
+               iothrottle_node_free(newn);
+       kfree(buf);
+       return ret;
+}
+
+/*
+ * Emit one line for a device limit:
+ *   "MAJOR:MINOR limit usage bucket_size elapsed"
+ * where elapsed is the time since the bucket timestamp, converted with
+ * jiffies_to_clock_t(). The fields of @res are read without taking its
+ * lock.
+ */
+static void iothrottle_show_limit(struct seq_file *m,
+                                 dev_t dev, struct iothrottle_limit *res)
+{
+       unsigned long long elapsed;
+
+       seq_put_decimal_ull(m, "", MAJOR(dev));
+       seq_put_decimal_ull(m, ":", MINOR(dev));
+       seq_put_decimal_ull(m, " ", res->limit);
+       seq_put_decimal_ull(m, " ", res->usage);
+       seq_put_decimal_ull(m, " ", res->bucket_size);
+
+       elapsed = jiffies_to_clock_t(iothrottle_limit_delta_t(res));
+       seq_put_decimal_ull(m, " ", elapsed);
+
+       seq_putc(m, '\n');
+}
+
+/*
+ * seq_show handler: print one line per device limit configured in the
+ * cgroup, walking the list under rcu_read_lock(). Always returns 0.
+ */
+static int iothrottle_read(struct seq_file *m, void *v)
+{
+       struct iothrottle *iot = css_to_iothrottle(seq_css(m));
+       struct iothrottle_node *pos;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(pos, &iot->list, node) {
+               iothrottle_show_limit(m, pos->dev, &pos->bw);
+       }
+       rcu_read_unlock();
+
+       return 0;
+}
+
+/*
+ * Control files of the controller. "max_mbs" is not created on the root
+ * cgroup (CFTYPE_NOT_ON_ROOT).
+ */
+static struct cftype iothrottle_files[] = {
+       {
+               .name = "max_mbs",
+               .seq_show = iothrottle_read,
+               .write = iothrottle_write,
+               .flags = CFTYPE_NOT_ON_ROOT,
+       },
+       { }     /* terminate: cgroup core walks the array up to an empty name */
+};
+
+/*
+ * Wake every task sleeping on @iot->wait. When @timer_cancel is true the
+ * cgroup is also flagged so that sleepers stop waiting; the flag is never
+ * cleared here.
+ */
+static void iothrottle_wakeup(struct iothrottle *iot, bool timer_cancel)
+{
+       spin_lock_bh(&iot->wait_lock);
+       if (timer_cancel)
+               iot->timer_cancel = true;
+       wake_up_all(&iot->wait);
+       spin_unlock_bh(&iot->wait_lock);
+}
+
+/* Timer callback: wake the sleepers of the cgroup that owns the timer. */
+static void iothrottle_timer_wakeup(struct timer_list *t)
+{
+       struct iothrottle *iot = from_timer(iot, t, timer);
+
+       iothrottle_wakeup(iot, false);
+}
+
+/*
+ * css_alloc callback: allocate and initialise the per-cgroup state (empty
+ * limit list, locks, wait queue and wake-up timer). @parent is not used.
+ * Returns the embedded css, or ERR_PTR(-ENOMEM).
+ */
+static struct cgroup_subsys_state *
+iothrottle_css_alloc(struct cgroup_subsys_state *parent)
+{
+       struct iothrottle *iot = kzalloc(sizeof(*iot), GFP_KERNEL);
+
+       if (!iot)
+               return ERR_PTR(-ENOMEM);
+
+       /* per-device limits */
+       mutex_init(&iot->lock);
+       INIT_LIST_HEAD(&iot->list);
+
+       /* sleep/wake-up machinery */
+       spin_lock_init(&iot->wait_lock);
+       init_waitqueue_head(&iot->wait);
+       iot->timer_cancel = false;
+       timer_setup(&iot->timer, iothrottle_timer_wakeup, 0);
+
+       return &iot->css;
+}
+
+/*
+ * css_offline callback: flag the cgroup as cancelled and wake up every
+ * throttled task.
+ *
+ * iothrottle_wakeup(iot, true) sets timer_cancel under wait_lock itself,
+ * so no separate lock/set/unlock round trip is needed first.
+ */
+static void iothrottle_css_offline(struct cgroup_subsys_state *css)
+{
+       iothrottle_wakeup(css_to_iothrottle(css), true);
+}
+
+/*
+ * css_free callback: stop the timer, free every node still on the limit
+ * list, then free the cgroup state itself.
+ */
+static void iothrottle_css_free(struct cgroup_subsys_state *css)
+{
+       struct iothrottle *iot = css_to_iothrottle(css);
+       struct iothrottle_node *pos, *next;
+
+       del_timer_sync(&iot->timer);
+
+       /*
+        * don't worry about locking here, at this point there's no reference
+        * to the list.
+        */
+       list_for_each_entry_safe(pos, next, &iot->list, node) {
+               iothrottle_node_free(pos);
+       }
+
+       kfree(iot);
+}
+
+/* True when current carries PF_KTHREAD or PF_KSWAPD. */
+static inline bool is_kernel_thread(void)
+{
+       return (current->flags & (PF_KTHREAD | PF_KSWAPD)) != 0;
+}
+
+/* True when the current task must not be throttled. */
+static inline bool is_urgent_task(void)
+{
+       /* Never throttle tasks that are going to exit */
+       if (current->flags & PF_EXITING)
+               return true;
+
+       /* Kernel threads are exempt unless throttle_kernel_threads is set */
+       if (throttle_kernel_threads)
+               return false;
+
+       return is_kernel_thread();
+}
+
+/*
+ * Look up the iothrottle of @p and take a reference on its css.
+ *
+ * Returns NULL when @p sits in the root fsio cgroup or no iothrottle can
+ * be resolved. Otherwise the lookup is retried until
+ * css_tryget_online() succeeds; the caller must drop the reference with
+ * css_put().
+ */
+static struct iothrottle *try_get_iothrottle_from_task(struct task_struct *p)
+{
+       struct iothrottle *iot = NULL;
+
+       rcu_read_lock();
+       if (task_css_is_root(p, fsio_cgrp_id))
+               goto unlock;
+
+       for (;;) {
+               iot = task_to_iothrottle(p);
+               if (unlikely(!iot))
+                       break;
+               if (css_tryget_online(&iot->css))
+                       break;
+       }
+unlock:
+       rcu_read_unlock();
+
+       return iot;
+}
+
+/*
+ * Charge @bytes to the limit of @dev in @iot, if one is configured, and
+ * decide whether the caller has to sleep.
+ *
+ * Returns the sleep time in jiffies, or 0 when there is no limit for
+ * @dev, when @state is 0, or when the computed sleep is shorter than
+ * throttle_timeslice_ms. When a sleep is going to be enforced the bucket
+ * usage is reset.
+ */
+static int iothrottle_evaluate_sleep(struct iothrottle *iot, dev_t dev,
+                                    ssize_t bytes, int state)
+{
+       unsigned long long sleep = 0;
+       struct iothrottle_node *node;
+
+       rcu_read_lock();
+       node = iothrottle_node_search(iot, dev);
+       if (!node)
+               goto unlock;
+
+       sleep = iothrottle_limit_sleep(&node->bw, bytes);
+       /*
+        * state == 0 is used to do only I/O accounting without
+        * enforcing sleeps.
+        */
+       if (!state || sleep < msecs_to_jiffies(throttle_timeslice_ms))
+               sleep = 0;
+       if (sleep)
+               iothrottle_limit_reset(&node->bw);
+unlock:
+       rcu_read_unlock();
+
+       return sleep;
+}
+
+/*
+ * Put the current task to sleep on the cgroup wait queue for about @sleep
+ * jiffies, using the cgroup timer as the wake-up source.
+ *
+ * The sleep is capped at throttle_timeframe_ms and its expiration is
+ * rounded up to a multiple of throttle_timeslice_ms. The loop ends when
+ * the expiration time is reached, when the cgroup has been flagged with
+ * timer_cancel, or when a fatal signal is pending.
+ *
+ * NOTE(review): the timer is armed before prepare_to_wait(); a timer that
+ * fires in between looks like it would go unnoticed until another task
+ * re-arms it - confirm whether that window matters.
+ */
+static noinline void iothrottle_force_sleep(struct iothrottle *iot,
+                                           unsigned long long sleep,
+                                           int state)
+{
+       unsigned long expire, now, slice;
+
+       /*
+        * Allow small IO bursts, by waking up the throttled task after a
+        * maximum sleep of throttle_timeframe millisec.
+        */
+       if (sleep > msecs_to_jiffies(throttle_timeframe_ms))
+               sleep = msecs_to_jiffies(throttle_timeframe_ms);
+
+       now = READ_ONCE(jiffies);
+       expire = now + sleep;
+
+       /*
+        * Round up the time to sleep to a multiple of the sleep timeslice.
+        *
+        * In this way we can strongly reduce timer softirqs and
+        * context switches in the system even when there are a lot of
+        * different cgroups.
+        *
+        * throttle_timeslice_ms is writable at run time: a value of 0
+        * converts to 0 jiffies and roundup() would divide by zero, so
+        * never round to less than one jiffy.
+        */
+       slice = msecs_to_jiffies(throttle_timeslice_ms);
+       if (!slice)
+               slice = 1;
+       expire = roundup(expire, slice);
+
+       /* Force sleep */
+       do {
+               DEFINE_WAIT(wait);
+
+               spin_lock_bh(&iot->wait_lock);
+               if (unlikely(iot->timer_cancel)) {
+                       spin_unlock_bh(&iot->wait_lock);
+                       break;
+               }
+               mod_timer(&iot->timer, expire);
+               spin_unlock_bh(&iot->wait_lock);
+
+               /*
+                * Do not enforce interruptible sleep if there are pending
+                * signals, otherwise we'll end up into a busy loop.
+                */
+               if (signal_pending(current))
+                       state = TASK_KILLABLE;
+
+               /* Send to sleep */
+               prepare_to_wait(&iot->wait, &wait, state);
+               schedule();
+               finish_wait(&iot->wait, &wait);
+       } while (!fatal_signal_pending(current) &&
+                time_is_after_jiffies(expire));
+}
+
+/*
+ * Entry point of the controller: charge @bytes of I/O on @dev to the fsio
+ * cgroup of the current task and, if required, put the task to sleep in
+ * task state @state.
+ *
+ * Nothing is done when the controller is disabled, the task is exempt
+ * (see is_urgent_task()), @dev is 0, or the task has no non-root fsio
+ * cgroup.
+ *
+ * Returns the sleep time computed for this request, in jiffies (0 when
+ * the task was not throttled).
+ */
+int fsio_throttle(dev_t dev, ssize_t bytes, int state)
+{
+       unsigned long long sleep;
+       struct iothrottle *iot;
+
+       if (iothrottle_disabled() || is_urgent_task() || !dev)
+               return 0;
+
+       iot = try_get_iothrottle_from_task(current);
+       if (!iot)
+               return 0;
+
+       sleep = iothrottle_evaluate_sleep(iot, dev, bytes, state);
+       if (unlikely(sleep))
+               iothrottle_force_sleep(iot, sleep, state);
+
+       /* drop the reference taken by try_get_iothrottle_from_task() */
+       css_put(&iot->css);
+
+       return sleep;
+}
+
+/* cgroup subsystem descriptor: css lifecycle callbacks and control files. */
+struct cgroup_subsys fsio_cgrp_subsys = {
+       .css_alloc      = iothrottle_css_alloc,
+       .css_offline    = iothrottle_css_offline,
+       .css_free       = iothrottle_css_free,
+       .dfl_cftypes    = iothrottle_files,
+};
-- 
2.17.1

Reply via email to