Allow limiting the network bandwidth of specific process containers (cgroups) by imposing additional delays on the sockets' sendmsg()/recvmsg() calls made by processes that exceed the limits defined in the control group filesystem.
Example: # mkdir /dev/cgroup # mount -t cgroup -onet net /dev/cgroup # cd /dev/cgroup # mkdir foo --> the cgroup foo has been created # /bin/echo $$ > foo/tasks # /bin/echo 1024 > foo/net.tcp # /bin/echo 2048 > foo/net.tot # sh --> the subshell 'sh' is running in cgroup "foo" that has a maximum network bandwidth for TCP traffic of 1MB/s and 2MB/s for total network activities. The netlimit approach can be easily extended to support additional network protocols or different socket families or types (PF_UNIX, PF_BLUETOOTH, SOCK_SEQPACKET, etc.). Signed-off-by: Andrea Righi <[EMAIL PROTECTED]> --- diff -urpN linux-2.6.24-rc8/include/linux/cgroup_netlimit.h linux-2.6.24-rc8-cgroup-netlimit/include/linux/cgroup_netlimit.h --- linux-2.6.24-rc8/include/linux/cgroup_netlimit.h 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.24-rc8-cgroup-netlimit/include/linux/cgroup_netlimit.h 2008-01-22 21:36:15.000000000 +0100 @@ -0,0 +1,29 @@ +#ifndef CGROUP_NETLIMIT_H +#define CGROUP_NETLIMIT_H + +enum { + CGROUP_NETLIMIT_TOT, + CGROUP_NETLIMIT_TCP, + CGROUP_NETLIMIT_UDP, + CGROUP_NETLIMIT_RAW, + /* This sets the size of the different netlimit types */ + CGROUP_NETLIMIT_END, +}; + +#define CGROUP_NETLIMIT_FILE(_x, _y) \ + { \ + .name = _x, \ + .read = netlimit_read, \ + .write_uint = netlimit_write_uint, \ + .private = _y, \ + } + +#ifdef CONFIG_CGROUP_NETLIMIT +extern void cgroup_nl_acct(int limit_id, size_t bytes); +extern void cgroup_nl_throttle(int limit_id, int interruptible); +#else +static inline void cgroup_nl_acct(int limit_id, size_t bytes) { } +static inline void cgroup_nl_throttle(int limit_id, int interruptible) { } +#endif /* CONFIG_CGROUP_NETLIMIT */ + +#endif diff -urpN linux-2.6.24-rc8/include/linux/cgroup_subsys.h linux-2.6.24-rc8-cgroup-netlimit/include/linux/cgroup_subsys.h --- linux-2.6.24-rc8/include/linux/cgroup_subsys.h 2008-01-16 05:22:48.000000000 +0100 +++ linux-2.6.24-rc8-cgroup-netlimit/include/linux/cgroup_subsys.h 2008-01-22 14:42:17.000000000 +0100 @@ 
-37,3 +37,9 @@ SUBSYS(cpuacct) /* */ +#ifdef CONFIG_CGROUP_NETLIMIT +SUBSYS(netlimit) +#endif + +/* */ + diff -urpN linux-2.6.24-rc8/init/Kconfig linux-2.6.24-rc8-cgroup-netlimit/init/Kconfig --- linux-2.6.24-rc8/init/Kconfig 2008-01-16 05:22:48.000000000 +0100 +++ linux-2.6.24-rc8-cgroup-netlimit/init/Kconfig 2008-01-23 00:44:04.000000000 +0100 @@ -313,6 +313,15 @@ config CGROUP_NS for instance virtual servers and checkpoint/restart jobs. +config CGROUP_NETLIMIT + bool "Enable cgroup network bandwidth limiting (EXPERIMENTAL)" + depends on EXPERIMENTAL && CGROUPS + help + This allows to define network bandwidth limiting/shaping rules for + specific cgroup(s). + + Say N if unsure. + config CPUSETS bool "Cpuset support" depends on SMP && CGROUPS diff -urpN linux-2.6.24-rc8/kernel/cgroup_netlimit.c linux-2.6.24-rc8-cgroup-netlimit/kernel/cgroup_netlimit.c --- linux-2.6.24-rc8/kernel/cgroup_netlimit.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.24-rc8-cgroup-netlimit/kernel/cgroup_netlimit.c 2008-01-22 21:35:53.000000000 +0100 @@ -0,0 +1,229 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) 2008 Andrea Righi <[EMAIL PROTECTED]> + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/cgroup.h> +#include <linux/slab.h> +#include <linux/gfp.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/jiffies.h> +#include <linux/spinlock.h> +#include <linux/cgroup_netlimit.h> + +struct netlimit { + struct cgroup_subsys_state css; + spinlock_t lock[CGROUP_NETLIMIT_END]; + unsigned long bandwidth[CGROUP_NETLIMIT_END]; + unsigned long req[CGROUP_NETLIMIT_END]; + unsigned long last_request[CGROUP_NETLIMIT_END]; +}; + +static inline struct netlimit *cgroup_to_netlimit(struct cgroup *cont) +{ + return container_of(cgroup_subsys_state(cont, netlimit_subsys_id), + struct netlimit, css); +} + +static inline struct netlimit *task_to_netlimit(struct task_struct *task) +{ + return container_of(task_subsys_state(task, netlimit_subsys_id), + struct netlimit, css); +} + +/* + * Rules: you can only create a cgroup if: + * 1. you are capable(CAP_SYS_ADMIN) + * 2. the target cgroup is a descendant of your own cgroup + * + * Note: called from kernel/cgroup.c with cgroup_lock() held. + */ +static struct cgroup_subsys_state *netlimit_create( + struct cgroup_subsys *ss, struct cgroup *cont) +{ + struct netlimit *nl; + int i; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + if (!cgroup_is_descendant(cont)) + return ERR_PTR(-EPERM); + + nl = kzalloc(sizeof(struct netlimit), GFP_KERNEL); + if (unlikely(!nl)) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < CGROUP_NETLIMIT_END; i++) { + spin_lock_init(&nl->lock[i]); + nl->last_request[i] = jiffies; + } + + return &nl->css; +} + +/* + * Note: called from kernel/cgroup.c with cgroup_lock() held. 
+ */ +static void netlimit_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +{ + kfree(cgroup_to_netlimit(cont)); +} + +static ssize_t netlimit_read(struct cgroup *cont, struct cftype *cft, + struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + ssize_t count, ret; + unsigned long delta, bandwidth, req, last_request; + struct netlimit *nl; + char *page; + + page = (char *)__get_free_page(GFP_TEMPORARY); + if (!page) + return -ENOMEM; + + cgroup_lock(); + if (cgroup_is_removed(cont)) { + cgroup_unlock(); + ret = -ENODEV; + goto out; + } + + nl = cgroup_to_netlimit(cont); + spin_lock_irq(&nl->lock[cft->private]); + + delta = (long)jiffies - (long)nl->last_request[cft->private]; + bandwidth = nl->bandwidth[cft->private]; + req = nl->req[cft->private]; + last_request = nl->last_request[cft->private]; + + spin_unlock_irq(&nl->lock[cft->private]); + cgroup_unlock(); + + /* print additional debugging stuff */ + count = sprintf(page, " type: %s\n" + "max-bandwidth: %lu KB/sec\n" + " requested: %lu KB\n" + " last request: %lu jiffies\n" + " delta: %lu jiffies\n", + cft->name, + bandwidth, req >> 10, last_request, delta); + + ret = simple_read_from_buffer(buf, nbytes, ppos, page, count); + +out: + free_page((unsigned long)page); + return ret; +} + +static int netlimit_write_uint(struct cgroup *cont, struct cftype *cft, + u64 val) +{ + struct netlimit *nl; + int ret = 0; + + cgroup_lock(); + if (cgroup_is_removed(cont)) { + ret = -ENODEV; + goto out; + } + + nl = cgroup_to_netlimit(cont); + + spin_lock_irq(&nl->lock[cft->private]); + nl->bandwidth[cft->private] = (unsigned long)val; + nl->req[cft->private] = 0; + nl->last_request[cft->private] = jiffies; + spin_unlock_irq(&nl->lock[cft->private]); + +out: + cgroup_unlock(); + return ret; +} + +static struct cftype files[] = { + CGROUP_NETLIMIT_FILE("tot", CGROUP_NETLIMIT_TOT), + CGROUP_NETLIMIT_FILE("tcp", CGROUP_NETLIMIT_TCP), + CGROUP_NETLIMIT_FILE("udp", CGROUP_NETLIMIT_UDP), + 
CGROUP_NETLIMIT_FILE("raw", CGROUP_NETLIMIT_RAW), +}; + +static int netlimit_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); +} + +struct cgroup_subsys netlimit_subsys = { + .name = "net", + .create = netlimit_create, + .destroy = netlimit_destroy, + .populate = netlimit_populate, + .subsys_id = netlimit_subsys_id, +}; + +void cgroup_nl_acct(int limit_id, size_t bytes) +{ + struct netlimit *nl; + + nl = task_to_netlimit(current); + if (!nl || !nl->bandwidth[limit_id]) + return; + + nl->req[limit_id] += bytes; +} +EXPORT_SYMBOL(cgroup_nl_acct); + +void cgroup_nl_throttle(int limit_id, int interruptible) +{ + struct netlimit *nl; + unsigned long delta, t; + long sleep; + + nl = task_to_netlimit(current); + if (!nl || !nl->bandwidth[limit_id]) + return; + + delta = (long)jiffies - (long)nl->last_request[limit_id]; + if (!delta) + return; + + t = msecs_to_jiffies(nl->req[limit_id] / nl->bandwidth[limit_id]); + if (!t) + return; + + sleep = t - delta; + if (sleep > 0) { + pr_debug("cgroup-netlimit(%s):" + " task %p (%s) must sleep %lu jiffies\n", + files[limit_id].name, + current, current->comm, sleep); + if (interruptible) + schedule_timeout_interruptible(sleep); + else + schedule_timeout_uninterruptible(sleep); + return; + } + + nl->req[limit_id] = 0; + nl->last_request[limit_id] = jiffies; +} +EXPORT_SYMBOL(cgroup_nl_throttle); diff -urpN linux-2.6.24-rc8/kernel/Makefile linux-2.6.24-rc8-cgroup-netlimit/kernel/Makefile --- linux-2.6.24-rc8/kernel/Makefile 2008-01-16 05:22:48.000000000 +0100 +++ linux-2.6.24-rc8-cgroup-netlimit/kernel/Makefile 2008-01-22 11:39:23.000000000 +0100 @@ -41,6 +41,7 @@ obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o +obj-$(CONFIG_CGROUP_NETLIMIT) += cgroup_netlimit.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_AUDIT) 
+= audit.o auditfilter.o diff -urpN linux-2.6.24-rc8/net/socket.c linux-2.6.24-rc8-cgroup-netlimit/net/socket.c --- linux-2.6.24-rc8/net/socket.c 2008-01-16 05:22:48.000000000 +0100 +++ linux-2.6.24-rc8-cgroup-netlimit/net/socket.c 2008-01-22 21:33:46.000000000 +0100 @@ -85,6 +85,7 @@ #include <linux/audit.h> #include <linux/wireless.h> #include <linux/nsproxy.h> +#include <linux/cgroup_netlimit.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -551,10 +552,33 @@ static inline int __sock_sendmsg(struct si->size = size; err = security_socket_sendmsg(sock, msg, size); - if (err) - return err; + if (!err) + err = sock->ops->sendmsg(iocb, sock, msg, size); - return sock->ops->sendmsg(iocb, sock, msg, size); + if (err >= 0 && sock->sk) { + switch (sock->sk->sk_family) { + case PF_INET: + case PF_INET6: + switch (sock->sk->sk_type) { + case SOCK_STREAM: + cgroup_nl_acct(CGROUP_NETLIMIT_TCP, size); + cgroup_nl_throttle(CGROUP_NETLIMIT_TCP, 1); + break; + case SOCK_DGRAM: + cgroup_nl_acct(CGROUP_NETLIMIT_UDP, size); + cgroup_nl_throttle(CGROUP_NETLIMIT_UDP, 1); + break; + case SOCK_RAW: + cgroup_nl_acct(CGROUP_NETLIMIT_RAW, size); + cgroup_nl_throttle(CGROUP_NETLIMIT_RAW, 1); + break; + } + cgroup_nl_acct(CGROUP_NETLIMIT_TOT, size); + cgroup_nl_throttle(CGROUP_NETLIMIT_TOT, 1); + } + } + + return err; } int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) @@ -633,10 +657,33 @@ static inline int __sock_recvmsg(struct si->flags = flags; err = security_socket_recvmsg(sock, msg, size, flags); - if (err) - return err; + if (!err) + err = sock->ops->recvmsg(iocb, sock, msg, size, flags); - return sock->ops->recvmsg(iocb, sock, msg, size, flags); + if (err >= 0 && sock->sk) { + switch (sock->sk->sk_family) { + case PF_INET: + case PF_INET6: + switch (sock->sk->sk_type) { + case SOCK_STREAM: + cgroup_nl_acct(CGROUP_NETLIMIT_TCP, size); + cgroup_nl_throttle(CGROUP_NETLIMIT_TCP, 1); + break; + case SOCK_DGRAM: + cgroup_nl_acct(CGROUP_NETLIMIT_UDP, size); + 
cgroup_nl_throttle(CGROUP_NETLIMIT_UDP, 1); + break; + case SOCK_RAW: + cgroup_nl_acct(CGROUP_NETLIMIT_RAW, size); + cgroup_nl_throttle(CGROUP_NETLIMIT_RAW, 1); + break; + } + cgroup_nl_acct(CGROUP_NETLIMIT_TOT, size); + cgroup_nl_throttle(CGROUP_NETLIMIT_TOT, 1); + } + } + + return err; } int sock_recvmsg(struct socket *sock, struct msghdr *msg, -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/