ChangeSet 1.2217.1.23, 2005/03/17 20:00:13-08:00, [EMAIL PROTECTED] [IPV4]: Make multipath algs into true drivers. This also makes them configurable on a per-route basis via rtnetlink route attributes. Based upon suggestions from Thomas Graf and Alexey Kuznetsov. Signed-off-by: David S. Miller <[EMAIL PROTECTED]>
include/linux/ip_mp_alg.h | 22 +++++ include/linux/rtnetlink.h | 1 include/net/ip_fib.h | 10 +- include/net/ip_mp_alg.h | 79 +++++++++++++++++++- include/net/route.h | 73 ------------------ net/ipv4/Kconfig | 26 ++---- net/ipv4/Makefile | 1 net/ipv4/fib_semantics.c | 19 ++++ net/ipv4/multipath.c | 54 +++++++++++++ net/ipv4/multipath_drr.c | 103 ++++++++++---------------- net/ipv4/multipath_random.c | 44 +++++++---- net/ipv4/multipath_rr.c | 34 +++++--- net/ipv4/multipath_wrandom.c | 109 +++++++++++---------------- net/ipv4/route.c | 169 +++++++++++++++++++------------------------ 14 files changed, 402 insertions(+), 342 deletions(-) diff -Nru a/include/linux/ip_mp_alg.h b/include/linux/ip_mp_alg.h --- /dev/null Wed Dec 31 16:00:00 196900 +++ b/include/linux/ip_mp_alg.h 2005-03-18 14:08:05 -08:00 @@ -0,0 +1,22 @@ +/* ip_mp_alg.h: IPV4 multipath algorithm support, user-visible values. + * + * Copyright (C) 2004, 2005 Einar Lueck <[EMAIL PROTECTED]> + * Copyright (C) 2005 David S. Miller <[EMAIL PROTECTED]> + */ + +#ifndef _LINUX_IP_MP_ALG_H +#define _LINUX_IP_MP_ALG_H + +enum ip_mp_alg { + IP_MP_ALG_NONE, + IP_MP_ALG_RR, + IP_MP_ALG_DRR, + IP_MP_ALG_RANDOM, + IP_MP_ALG_WRANDOM, + __IP_MP_ALG_MAX +}; + +#define IP_MP_ALG_MAX (__IP_MP_ALG_MAX - 1) + +#endif /* _LINUX_IP_MP_ALG_H */ + diff -Nru a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h --- a/include/linux/rtnetlink.h 2005-03-18 14:08:05 -08:00 +++ b/include/linux/rtnetlink.h 2005-03-18 14:08:05 -08:00 @@ -250,6 +250,7 @@ RTA_FLOW, RTA_CACHEINFO, RTA_SESSION, + RTA_MP_ALGO, __RTA_MAX }; diff -Nru a/include/net/ip_fib.h b/include/net/ip_fib.h --- a/include/net/ip_fib.h 2005-03-18 14:08:05 -08:00 +++ b/include/net/ip_fib.h 2005-03-18 14:08:05 -08:00 @@ -37,6 +37,7 @@ u32 *rta_flow; struct rta_cacheinfo *rta_ci; struct rta_session *rta_sess; + u32 *rta_mp_alg; }; struct fib_info; @@ -81,6 +82,9 @@ #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_power; #endif +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + u32 fib_mp_alg; 
+#endif struct fib_nh fib_nh[0]; #define fib_dev fib_nh[0].nh_dev }; @@ -95,7 +99,7 @@ unsigned char nh_sel; unsigned char type; unsigned char scope; -#ifdef CONFIG_IP_ROUTE_MULTIPATH_WRANDOM +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED __u32 network; __u32 netmask; #endif @@ -123,10 +127,10 @@ #define FIB_RES_DEV(res) (FIB_RES_NH(res).nh_dev) #define FIB_RES_OIF(res) (FIB_RES_NH(res).nh_oif) -#ifdef CONFIG_IP_ROUTE_MULTIPATH_WRANDOM +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED #define FIB_RES_NETWORK(res) ((res).network) #define FIB_RES_NETMASK(res) ((res).netmask) -#else /* CONFIG_IP_ROUTE_MULTIPATH_WRANDOM */ +#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ #define FIB_RES_NETWORK(res) (0) #define FIB_RES_NETMASK(res) (0) #endif /* CONFIG_IP_ROUTE_MULTIPATH_WRANDOM */ diff -Nru a/include/net/ip_mp_alg.h b/include/net/ip_mp_alg.h --- a/include/net/ip_mp_alg.h 2005-03-18 14:08:05 -08:00 +++ b/include/net/ip_mp_alg.h 2005-03-18 14:08:05 -08:00 @@ -4,13 +4,84 @@ * Copyright (C) 2005 David S. Miller <[EMAIL PROTECTED]> */ -#ifndef _IP_MP_ALG_H -#define _IP_MP_ALG_H +#ifndef _NET_IP_MP_ALG_H +#define _NET_IP_MP_ALG_H #include <linux/config.h> +#include <linux/ip_mp_alg.h> #include <net/flow.h> +#include <net/route.h> -static int inline multipath_comparekeys(const struct flowi *flp1, +struct fib_nh; + +struct ip_mp_alg_ops { + void (*mp_alg_select_route)(const struct flowi *flp, + struct rtable *rth, struct rtable **rp); + void (*mp_alg_flush)(void); + void (*mp_alg_set_nhinfo)(__u32 network, __u32 netmask, + unsigned char prefixlen, + const struct fib_nh *nh); + void (*mp_alg_remove)(struct rtable *rth); +}; + +extern int multipath_alg_register(struct ip_mp_alg_ops *, enum ip_mp_alg); +extern void multipath_alg_unregister(struct ip_mp_alg_ops *, enum ip_mp_alg); + +extern struct ip_mp_alg_ops *ip_mp_alg_table[]; + +static inline int multipath_select_route(const struct flowi *flp, + struct rtable *rth, + struct rtable **rp) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + struct 
ip_mp_alg_ops *ops = ip_mp_alg_table[rth->rt_multipath_alg]; + + if (ops && (rth->u.dst.flags & DST_BALANCED)) { + ops->mp_alg_select_route(flp, rth, rp); + return 1; + } +#endif + return 0; +} + +static inline void multipath_flush(void) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + int i; + + for (i = IP_MP_ALG_NONE; i <= IP_MP_ALG_MAX; i++) { + struct ip_mp_alg_ops *ops = ip_mp_alg_table[i]; + + if (ops && ops->mp_alg_flush) /* flush hook is optional; only wrandom installs one */ + ops->mp_alg_flush(); + } +#endif +} + +static inline void multipath_set_nhinfo(struct rtable *rth, + __u32 network, __u32 netmask, + unsigned char prefixlen, + const struct fib_nh *nh) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + struct ip_mp_alg_ops *ops = ip_mp_alg_table[rth->rt_multipath_alg]; + + if (ops && ops->mp_alg_set_nhinfo) /* nhinfo hook is optional; only wrandom installs one */ + ops->mp_alg_set_nhinfo(network, netmask, prefixlen, nh); +#endif +} + +static inline void multipath_remove(struct rtable *rth) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + struct ip_mp_alg_ops *ops = ip_mp_alg_table[rth->rt_multipath_alg]; + + if (ops && ops->mp_alg_remove && (rth->u.dst.flags & DST_BALANCED)) /* remove hook is optional; random and wrandom install none */ + ops->mp_alg_remove(rth); +#endif +} + +static inline int multipath_comparekeys(const struct flowi *flp1, const struct flowi *flp2) { return flp1->fl4_dst == flp2->fl4_dst && @@ -23,4 +94,4 @@ (IPTOS_RT_MASK | RTO_ONLINK)); } -#endif /* _IP_MP_ALG_H */ +#endif /* _NET_IP_MP_ALG_H */ diff -Nru a/include/net/route.h b/include/net/route.h --- a/include/net/route.h 2005-03-18 14:08:05 -08:00 +++ b/include/net/route.h 2005-03-18 14:08:05 -08:00 @@ -202,77 +202,4 @@ return rt->peer; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH_WRANDOM -extern void __multipath_flush(void); -static inline void multipath_flush(void) -{ - __multipath_flush(); -} -#else /* CONFIG_IP_ROUTE_MULTIPATH_WRANDOM */ -static inline void multipath_flush(void) -{ -} -#endif /* CONFIG_IP_ROUTE_MULTIPATH_WRANDOM */ - -#ifdef CONFIG_IP_ROUTE_MULTIPATH_WRANDOM -extern void __multipath_set_nhinfo(__u32 network, - __u32 netmask, - unsigned char prefixlen, - const struct fib_nh* nh); -static inline void 
multipath_set_nhinfo(__u32 network, - __u32 netmask, - unsigned char prefixlen, - const struct fib_nh* nh) -{ - __multipath_set_nhinfo(network, netmask, prefixlen, nh); -} -#else -static inline void multipath_set_nhinfo(__u32 network, - __u32 netmask, - unsigned char prefixlen, - const struct fib_nh* nh) -{ -} -#endif - - - -#if defined(CONFIG_IP_ROUTE_MULTIPATH_RR) || defined(CONFIG_IP_ROUTE_MULTIPATH_DRR) -extern void __multipath_remove(struct rtable *rt); -static inline void multipath_remove(struct rtable *rt) -{ - if ( rt->u.dst.flags & DST_BALANCED ) - __multipath_remove(rt); -} -#else /* CONFIG_IP_ROUTE_MULTIPATH_RR || CONFIG_IP_ROUTE_MULTIPATH_DRR */ -static inline void multipath_remove(struct rtable *rt) -{ -} -#endif /* CONFIG_IP_ROUTE_MULTIPATH_RR || CONFIG_IP_ROUTE_MULTIPATH_DRR */ - - -#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED -extern void __multipath_selectroute(const struct flowi *flp, - struct rtable *rth, - struct rtable **rp); -static inline int multipath_selectroute(const struct flowi *flp, - struct rtable *rth, - struct rtable **rp) -{ - if (rth->u.dst.flags & DST_BALANCED) { - __multipath_selectroute(flp, rth, rp); - return 1; - } else { - return 0; - } -} -#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ -static inline int multipath_selectroute(const struct flowi *flp, - struct rtable *rth, - struct rtable **rp) -{ - return 0; -} -#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ - #endif /* _ROUTE_H */ diff -Nru a/net/ipv4/Kconfig b/net/ipv4/Kconfig --- a/net/ipv4/Kconfig 2005-03-18 14:08:05 -08:00 +++ b/net/ipv4/Kconfig 2005-03-18 14:08:05 -08:00 @@ -100,43 +100,37 @@ If unsure, say N. 
-# -# multipath policy configuration -# -choice - prompt "Multipath policy" - depends on IP_ROUTE_MULTIPATH_CACHED - default IP_ROUTE_MULTIPATH_RANDOM - config IP_ROUTE_MULTIPATH_RR - bool "round robin (EXPERIMENTAL)" + tristate "MULTIPATH: round robin algorithm" + depends on IP_ROUTE_MULTIPATH_CACHED help Mulitpath routes are chosen according to Round Robin config IP_ROUTE_MULTIPATH_RANDOM - bool "random multipath (EXPERIMENTAL)" + tristate "MULTIPATH: random algorithm" + depends on IP_ROUTE_MULTIPATH_CACHED help Multipath routes are chosen in a random fashion. Actually, there is no weight for a route. The advantage of this policy is that it is implemented stateless and therefore introduces only a very small delay. + config IP_ROUTE_MULTIPATH_WRANDOM - bool "weighted random multipath (EXPERIMENTAL)" + tristate "MULTIPATH: weighted random algorithm" + depends on IP_ROUTE_MULTIPATH_CACHED help Multipath routes are chosen in a weighted random fashion. The per route weights are the weights visible via ip route 2. As the corresponding state management introduces some overhead routing delay is increased. + config IP_ROUTE_MULTIPATH_DRR - bool "interface round robin (EXPERIMENTAL)" + tristate "MULTIPATH: interface round robin algorithm" + depends on IP_ROUTE_MULTIPATH_CACHED help Connections are distributed in a round robin fashion over the available interfaces. This policy makes sense if the connections should be primarily distributed on interfaces and not on routes. 
-endchoice -# -# END OF multipath policy configuration -# config IP_ROUTE_VERBOSE bool "IP: verbose route monitoring" diff -Nru a/net/ipv4/Makefile b/net/ipv4/Makefile --- a/net/ipv4/Makefile 2005-03-18 14:08:05 -08:00 +++ b/net/ipv4/Makefile 2005-03-18 14:08:05 -08:00 @@ -27,6 +27,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o +obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff -Nru a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c --- a/net/ipv4/fib_semantics.c 2005-03-18 14:08:05 -08:00 +++ b/net/ipv4/fib_semantics.c 2005-03-18 14:08:05 -08:00 @@ -42,6 +42,7 @@ #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> +#include <net/ip_mp_alg.h> #include "fib_lookup.h" @@ -649,6 +650,9 @@ #else const int nhs = 1; #endif +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + u32 mp_alg = IP_MP_ALG_NONE; +#endif /* Fast check to catch the most weird cases */ if (fib_props[r->rtm_type].scope > r->rtm_scope) @@ -661,6 +665,15 @@ goto err_inval; } #endif +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + if (rta->rta_mp_alg) { + mp_alg = *rta->rta_mp_alg; + + if (mp_alg < IP_MP_ALG_NONE || + mp_alg > IP_MP_ALG_MAX) + goto err_inval; + } +#endif err = -ENOBUFS; if (fib_info_cnt >= fib_hash_size) { @@ -752,6 +765,10 @@ #endif } +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + fi->fib_mp_alg = mp_alg; +#endif + if (fib_props[r->rtm_type].error) { if (rta->rta_gw || rta->rta_oif || rta->rta_mp) goto err_inval; @@ -896,7 +913,7 @@ res->type = fa->fa_type; res->scope = fa->fa_scope; res->fi = fa->fa_info; -#ifdef CONFIG_IP_ROUTE_MULTIPATH_WRANDOM +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED res->netmask = mask; res->network = zone & (0xFFFFFFFF >> (32 - prefixlen)); diff -Nru a/net/ipv4/multipath.c b/net/ipv4/multipath.c --- /dev/null Wed Dec 31 16:00:00 196900 +++ b/net/ipv4/multipath.c 2005-03-18 14:08:05 -08:00 @@ -0,0 +1,54 @@ +/* 
multipath.c: IPV4 multipath algorithm support. + * + * Copyright (C) 2004, 2005 Einar Lueck <[EMAIL PROTECTED]> + * Copyright (C) 2005 David S. Miller <[EMAIL PROTECTED]> + */ + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/spinlock.h> + +#include <net/ip_mp_alg.h> + +static DEFINE_SPINLOCK(alg_table_lock); +struct ip_mp_alg_ops *ip_mp_alg_table[IP_MP_ALG_MAX + 1]; /* one slot per enum ip_mp_alg value, IP_MP_ALG_NONE..IP_MP_ALG_MAX inclusive */ + +int multipath_alg_register(struct ip_mp_alg_ops *ops, enum ip_mp_alg n) +{ + struct ip_mp_alg_ops **slot; + int err; + + if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX) + return -EINVAL; + + spin_lock(&alg_table_lock); + slot = &ip_mp_alg_table[n]; + if (*slot != NULL) { + err = -EBUSY; + } else { + *slot = ops; + err = 0; + } + spin_unlock(&alg_table_lock); + + return err; +} +EXPORT_SYMBOL(multipath_alg_register); + +void multipath_alg_unregister(struct ip_mp_alg_ops *ops, enum ip_mp_alg n) +{ + struct ip_mp_alg_ops **slot; + + if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX) + return; + + spin_lock(&alg_table_lock); + slot = &ip_mp_alg_table[n]; + if (*slot == ops) + *slot = NULL; + spin_unlock(&alg_table_lock); + + synchronize_net(); +} +EXPORT_SYMBOL(multipath_alg_unregister); diff -Nru a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c --- a/net/ipv4/multipath_drr.c 2005-03-18 14:08:05 -08:00 +++ b/net/ipv4/multipath_drr.c 2005-03-18 14:08:05 -08:00 @@ -56,12 +56,9 @@ #define MULTIPATH_MAX_DEVICECANDIDATES 10 static struct multipath_device state[MULTIPATH_MAX_DEVICECANDIDATES]; -static spinlock_t state_lock = SPIN_LOCK_UNLOCKED; -static int registered_dev_notifier = 0; +static DEFINE_SPINLOCK(state_lock); static struct rtable *last_selection = NULL; -#define RTprint(a...) 
// printk(KERN_DEBUG a) - static int inline __multipath_findslot(void) { int i; @@ -85,8 +82,8 @@ return -1; } -static int multipath_dev_event(struct notifier_block *this, - unsigned long event, void *ptr) +static int drr_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) { struct net_device *dev = ptr; int devidx; @@ -101,12 +98,6 @@ state[devidx].allocated = 0; state[devidx].ifi = 0; atomic_set(&state[devidx].usecount, 0); - RTprint(KERN_DEBUG"%s: successfully removed device " \ - "with index %d\n",__FUNCTION__, devidx); - } else { - RTprint(KERN_DEBUG"%s: Device not relevant for " \ - " multipath: %d\n", - __FUNCTION__, devidx); } spin_unlock_bh(&state_lock); @@ -116,17 +107,17 @@ return NOTIFY_DONE; } -struct notifier_block multipath_dev_notifier = { - .notifier_call = multipath_dev_event, +struct notifier_block drr_dev_notifier = { + .notifier_call = drr_dev_event, }; -void __multipath_remove(struct rtable *rt) +static void drr_remove(struct rtable *rt) { if (last_selection == rt) last_selection = NULL; } -void __multipath_safe_inc(atomic_t *usecount) +static void drr_safe_inc(atomic_t *usecount) { int n; @@ -136,9 +127,6 @@ if (n <= 0) { int i; - RTprint("%s: detected overflow, now ill will reset all "\ - "usecounts\n", __FUNCTION__); - spin_lock_bh(&state_lock); for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) @@ -148,7 +136,7 @@ } } -void __multipath_selectroute(const struct flowi *flp, +static void drr_select_route(const struct flowi *flp, struct rtable *first, struct rtable **rp) { struct rtable *nh, *result, *cur_min; @@ -156,16 +144,9 @@ int devidx = -1; int cur_min_devidx = -1; - /* register a notifier to stay informed about dying devices */ - if (!registered_dev_notifier) { - registered_dev_notifier = 1; - register_netdevice_notifier(&multipath_dev_notifier); - } - /* if necessary and possible utilize the old alternative */ if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 && last_selection != NULL) { - RTprint( 
KERN_CRIT"%s: holding route \n", __FUNCTION__ ); result = last_selection; *rp = result; return; @@ -206,9 +187,6 @@ devidx = __multipath_findslot(); if (devidx == -1) { /* unlikely but possible */ - RTprint(KERN_DEBUG"%s: " \ - "out of space\n", - __FUNCTION__); continue; } @@ -216,13 +194,6 @@ state[devidx].ifi = nh_ifidx; atomic_set(&state[devidx].usecount, 0); min_usecount = 0; - RTprint(KERN_DEBUG"%s: created " \ - " for " \ - "device %d and " \ - "min_usecount " \ - " == -1\n", - __FUNCTION__, - nh_ifidx); } spin_unlock_bh(&state_lock); @@ -232,11 +203,7 @@ /* if the device has not been used it is * the primary target */ - RTprint(KERN_DEBUG"%s: now setting " \ - "result to device %d\n", - __FUNCTION__, nh_ifidx ); - - __multipath_safe_inc(&state[devidx].usecount); + drr_safe_inc(&state[devidx].usecount); result = nh; } else { int count = @@ -247,13 +214,6 @@ cur_min = nh; cur_min_devidx = devidx; min_usecount = count; - - RTprint(KERN_DEBUG"%s: found " \ - "device " \ - "%d with usecount == %d\n", - __FUNCTION__, - nh_ifidx, - min_usecount); } } } @@ -261,24 +221,45 @@ if (!result) { if (cur_min) { - RTprint( KERN_DEBUG"%s: index of device in state "\ - "array: %d\n", - __FUNCTION__, cur_min_devidx ); - __multipath_safe_inc(&state[cur_min_devidx].usecount); + drr_safe_inc(&state[cur_min_devidx].usecount); result = cur_min; } else { - RTprint( KERN_DEBUG"%s: utilized first\n", - __FUNCTION__); result = first; } - } else { - RTprint(KERN_DEBUG"%s: utilize result: found device " \ - "%d with usecount == %d\n", - __FUNCTION__, result->u.dst.dev->ifindex, - min_usecount); - } *rp = result; last_selection = result; } + +static struct ip_mp_alg_ops drr_ops = { + .mp_alg_select_route = drr_select_route, + .mp_alg_remove = drr_remove, +}; + +static int __init drr_init(void) +{ + int err = register_netdevice_notifier(&drr_dev_notifier); + + if (err) + return err; + + err = multipath_alg_register(&drr_ops, IP_MP_ALG_DRR); /* own slot: must match the id drr_exit() unregisters */ + if (err) + goto fail; + + return 0; + +fail: + 
unregister_netdevice_notifier(&drr_dev_notifier); + return err; +} + +static void __exit drr_exit(void) +{ + unregister_netdevice_notifier(&drr_dev_notifier); + multipath_alg_unregister(&drr_ops, IP_MP_ALG_DRR); +} + +module_init(drr_init); +module_exit(drr_exit); diff -Nru a/net/ipv4/multipath_random.c b/net/ipv4/multipath_random.c --- a/net/ipv4/multipath_random.c 2005-03-18 14:08:05 -08:00 +++ b/net/ipv4/multipath_random.c 2005-03-18 14:08:05 -08:00 @@ -47,17 +47,26 @@ #include <net/checksum.h> #include <net/ip_mp_alg.h> -#define RTprint(a...) // printk(KERN_DEBUG a) - #define MULTIPATH_MAX_CANDIDATES 40 /* interface to random number generation */ static unsigned int RANDOM_SEED = 93186752; -static inline unsigned int random(unsigned int ubound); -void __multipath_selectroute(const struct flowi *flp, - struct rtable *first, - struct rtable **rp) +static inline unsigned int random(unsigned int ubound) +{ + static unsigned int a = 1588635695, + q = 2, + r = 1117695901; + + RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q); + + return RANDOM_SEED % ubound; +} + + +static void random_select_route(const struct flowi *flp, + struct rtable *first, + struct rtable **rp) { struct rtable *rt; struct rtable *decision; @@ -78,9 +87,6 @@ unsigned char candidate_no = (unsigned char) random(candidate_count); - RTprint( "%s: randomly chosen candidate: %d (count: %d)\n", - __FUNCTION__, candidate_no, candidate_count ); - /* find chosen candidate and adjust GC data for all candidates * to ensure they stay in cache */ @@ -104,13 +110,19 @@ *rp = decision; } -static inline unsigned int random(unsigned int ubound) -{ - static unsigned int a = 1588635695, - q = 2, - r = 1117695901; +static struct ip_mp_alg_ops random_ops = { + .mp_alg_select_route = random_select_route, +}; - RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q); +static int __init random_init(void) +{ + return multipath_alg_register(&random_ops, IP_MP_ALG_RANDOM); +} - return RANDOM_SEED % ubound; +static 
void __exit random_exit(void) +{ + multipath_alg_unregister(&random_ops, IP_MP_ALG_RANDOM); } + +module_init(random_init); +module_exit(random_exit); diff -Nru a/net/ipv4/multipath_rr.c b/net/ipv4/multipath_rr.c --- a/net/ipv4/multipath_rr.c 2005-03-18 14:08:05 -08:00 +++ b/net/ipv4/multipath_rr.c 2005-03-18 14:08:05 -08:00 @@ -47,29 +47,25 @@ #include <net/checksum.h> #include <net/ip_mp_alg.h> -#define RTprint(a...) // printk(KERN_DEBUG a) - #define MULTIPATH_MAX_CANDIDATES 40 static struct rtable* last_used = NULL; -void __multipath_remove(struct rtable *rt) +static void rr_remove(struct rtable *rt) { if (last_used == rt) last_used = NULL; } -void __multipath_selectroute(const struct flowi *flp, - struct rtable *first, struct rtable **rp) +static void rr_select_route(const struct flowi *flp, + struct rtable *first, struct rtable **rp) { struct rtable *nh, *result, *min_use_cand = NULL; int min_use = -1; /* if necessary and possible utilize the old alternative */ - if ( ( flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE ) != 0 && - last_used != NULL ) { - RTprint( KERN_CRIT"%s: holding route \n", - __FUNCTION__ ); + if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 && + last_used != NULL) { result = last_used; goto out; } @@ -88,8 +84,6 @@ min_use = nh->u.dst.__use; min_use_cand = nh; } - RTprint( KERN_CRIT"%s: found balanced entry\n", - __FUNCTION__ ); } } result = min_use_cand; @@ -101,3 +95,21 @@ result->u.dst.__use++; *rp = result; } + +static struct ip_mp_alg_ops rr_ops = { + .mp_alg_select_route = rr_select_route, + .mp_alg_remove = rr_remove, +}; + +static int __init rr_init(void) +{ + return multipath_alg_register(&rr_ops, IP_MP_ALG_RR); +} + +static void __exit rr_exit(void) +{ + multipath_alg_unregister(&rr_ops, IP_MP_ALG_RR); +} + +module_init(rr_init); +module_exit(rr_exit); diff -Nru a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c --- a/net/ipv4/multipath_wrandom.c 2005-03-18 14:08:05 -08:00 +++ b/net/ipv4/multipath_wrandom.c 2005-03-18 
14:08:05 -08:00 @@ -48,8 +48,6 @@ #include <net/ip_fib.h> #include <net/ip_mp_alg.h> -#define MPprint(a...) // printk(KERN_DEBUG a) - #define MULTIPATH_STATE_SIZE 15 struct multipath_candidate { @@ -85,13 +83,19 @@ }; /* state: primarily weight per route information */ -static int multipath_state_initialized = 0; -static spinlock_t state_big_lock = SPIN_LOCK_UNLOCKED; static struct multipath_bucket state[MULTIPATH_STATE_SIZE]; /* interface to random number generation */ static unsigned int RANDOM_SEED = 93186752; -static __inline__ unsigned int random(unsigned int ubound); + +static inline unsigned int random(unsigned int ubound) +{ + static unsigned int a = 1588635695, + q = 2, + r = 1117695901; + RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q); + return RANDOM_SEED % ubound; +} static unsigned char __multipath_lookup_weight(const struct flowi *fl, const struct rtable *rt) @@ -129,8 +133,6 @@ if ((targetnetwork & d->netmask) == d->network) { weight = d->nh_info->nh_weight; - MPprint("%s: found weight %d for gateway %u\n", - __FUNCTION__, weight, rt->rt_gateway); goto out; } } @@ -140,36 +142,19 @@ return weight; } -static void __multipath_init_state(void) +static void wrandom_init_state(void) { - spin_lock(&state_big_lock); + int i; - /* check again due to SMP and to prevent contention */ - if (!multipath_state_initialized) { - int i; - - for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) { - INIT_LIST_HEAD(&state[i].head); - state[i].lock = SPIN_LOCK_UNLOCKED; - } + for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) { + INIT_LIST_HEAD(&state[i].head); + spin_lock_init(&state[i].lock); } - - /* now mark initialized */ - multipath_state_initialized = 1; - - spin_unlock(&state_big_lock); } -static void inline __multipath_init(void) -{ - /* do not spinlock to reduce unnecessary contention */ - if (!multipath_state_initialized) - __multipath_init_state(); -} - -void __multipath_selectroute(const struct flowi *flp, - struct rtable *first, - struct rtable **rp) +static void 
wrandom_select_route(const struct flowi *flp, + struct rtable *first, + struct rtable **rp) { struct rtable *rt; struct rtable *decision; @@ -180,9 +165,6 @@ int selector; const size_t size_mpc = sizeof(struct multipath_candidate); - /* init state if necessary */ - __multipath_init(); - /* collect all candidates and identify their weights */ for (rt = rcu_dereference(first); rt; rt = rcu_dereference(rt->u.rt_next)) { @@ -192,6 +174,9 @@ (struct multipath_candidate*) kmalloc(size_mpc, GFP_KERNEL); + if (!mpc) + return; + power += __multipath_lookup_weight(flp, rt) * 10000; mpc->power = power; @@ -210,8 +195,6 @@ /* choose a weighted random candidate */ decision = first; selector = random(power); - MPprint("%s: random number %d in range %d\n", __FUNCTION__, selector, - power); last_power = 0; /* select candidate, adjust GC data and cleanup local state */ @@ -219,13 +202,9 @@ last_mpc = NULL; for (mpc = first_mpc; mpc; mpc = mpc->next) { mpc->rt->u.dst.lastuse = jiffies; - MPprint("%s: last_power = %d, selector: %d, mpc->power: %d\n", - __FUNCTION__, last_power, selector, mpc->power); - if (last_power <= selector && selector < mpc->power) { + if (last_power <= selector && selector < mpc->power) decision = mpc->rt; - MPprint("%s: selected %u\n", __FUNCTION__, - decision->rt_gateway); - } + last_power = mpc->power; if (last_mpc) kfree(last_mpc); @@ -242,18 +221,15 @@ *rp = decision; } -void __multipath_set_nhinfo(__u32 network, - __u32 netmask, - unsigned char prefixlen, - const struct fib_nh* nh) +static void wrandom_set_nhinfo(__u32 network, + __u32 netmask, + unsigned char prefixlen, + const struct fib_nh *nh) { const int state_idx = nh->nh_oif % MULTIPATH_STATE_SIZE; struct multipath_route *r, *target_route = NULL; struct multipath_dest *d, *target_dest = NULL; - /* init state if necessary */ - __multipath_init(); - /* store the weight information for a certain route */ spin_lock(&state[state_idx].lock); @@ -321,20 +297,15 @@ kfree(dst); } -void 
__multipath_flush(void) +static void wrandom_flush(void) { int i; - MPprint("%s: called\n", __FUNCTION__); - - /* init state if necessary */ - __multipath_init(); - /* defere delete to all entries */ for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) { struct multipath_route *r; - spin_lock(&state[i].lock); + spin_lock(&state[i].lock); list_for_each_entry_rcu(r, &state[i].head, list) { struct multipath_dest *d; list_for_each_entry_rcu(d, &r->dests, list) { @@ -349,15 +320,25 @@ spin_unlock(&state[i].lock); } +} + +static struct ip_mp_alg_ops wrandom_ops = { + .mp_alg_select_route = wrandom_select_route, + .mp_alg_flush = wrandom_flush, + .mp_alg_set_nhinfo = wrandom_set_nhinfo, +}; + +static int __init wrandom_init(void) +{ + wrandom_init_state(); - MPprint("%s: finished\n", __FUNCTION__); + return multipath_alg_register(&wrandom_ops, IP_MP_ALG_WRANDOM); } -static __inline__ unsigned int random(unsigned int ubound) +static void __exit wrandom_exit(void) { - static unsigned int a = 1588635695, - q = 2, - r = 1117695901; - RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q); - return RANDOM_SEED % ubound; + multipath_alg_unregister(&wrandom_ops, IP_MP_ALG_WRANDOM); } + +module_init(wrandom_init); +module_exit(wrandom_exit); diff -Nru a/net/ipv4/route.c b/net/ipv4/route.c --- a/net/ipv4/route.c 2005-03-18 14:08:05 -08:00 +++ b/net/ipv4/route.c 2005-03-18 14:08:05 -08:00 @@ -100,6 +100,7 @@ #include <net/tcp.h> #include <net/icmp.h> #include <net/xfrm.h> +#include <net/ip_mp_alg.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif @@ -129,7 +130,7 @@ static int ip_rt_secret_interval = 10 * 60 * HZ; static unsigned long rt_deadline; -#define RTprint(a...) // printk(KERN_DEBUG a) +#define RTprint(a...) 
printk(KERN_DEBUG a) static struct timer_list rt_flush_timer; static struct timer_list rt_periodic_timer; @@ -451,13 +452,13 @@ static __inline__ void rt_free(struct rtable *rt) { - multipath_remove( rt ); + multipath_remove(rt); call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); } static __inline__ void rt_drop(struct rtable *rt) { - multipath_remove( rt ); + multipath_remove(rt); ip_rt_put(rt); call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); } @@ -522,37 +523,36 @@ #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED static struct rtable **rt_remove_balanced_route(struct rtable **chain_head, struct rtable *expentry, - int* removed_count) + int *removed_count) { int passedexpired = 0; struct rtable **nextstep = NULL; struct rtable **rthp = chain_head; struct rtable *rth; + if (removed_count) *removed_count = 0; + while ((rth = *rthp) != NULL) { - if ( rth == expentry ) { + if (rth == expentry) passedexpired = 1; - } if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 && compare_keys(&(*rthp)->fl, &expentry->fl)) { if (*rthp == expentry) { *rthp = rth->u.rt_next; continue; - } - else { + } else { *rthp = rth->u.rt_next; rt_free(rth); if (removed_count) ++(*removed_count); } - } - else { - if ( !((*rthp)->u.dst.flags & DST_BALANCED) && - passedexpired && !nextstep ) { + } else { + if (!((*rthp)->u.dst.flags & DST_BALANCED) && + passedexpired && !nextstep) nextstep = &rth->u.rt_next; - } + rthp = &rth->u.rt_next; } } @@ -560,11 +560,10 @@ rt_free(expentry); if (removed_count) ++(*removed_count); - + return nextstep; } - -#endif +#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ /* This runs via a timer and thus is always in BH context. */ @@ -600,15 +599,13 @@ /* Cleanup aged off entries. 
*/ #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED /* remove all related balanced entries if necessary */ - if ( rth->u.dst.flags & DST_BALANCED ) { + if (rth->u.dst.flags & DST_BALANCED) { rthp = rt_remove_balanced_route( &rt_hash_table[i].chain, rth, NULL); - if (!rthp) { + if (!rthp) break; - } - } - else { + } else { *rthp = rth->u.rt_next; rt_free(rth); } @@ -785,19 +782,20 @@ continue; } #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED - /* remove all related balanced entries if necessary */ - if ( rth->u.dst.flags & DST_BALANCED ) { + /* remove all related balanced entries + * if necessary + */ + if (rth->u.dst.flags & DST_BALANCED) { int r; + rthp = rt_remove_balanced_route( &rt_hash_table[i].chain, rth, &r); goal -= r; - if (!rthp) { + if (!rthp) break; - } - } - else { + } else { *rthp = rth->u.rt_next; rt_free(rth); goal--; @@ -1724,7 +1722,7 @@ rth->u.dst.flags= DST_HOST; #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED - if ( res->fi->fib_nhs > 1 ) + if (res->fi->fib_nhs > 1) rth->u.dst.flags |= DST_BALANCED; #endif if (in_dev->cnf.no_policy) @@ -1795,65 +1793,57 @@ struct in_device *in_dev, u32 daddr, u32 saddr, u32 tos) { -#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED struct rtable* rth; unsigned char hop, hopcount, lasthop; int err = -EINVAL; - unsigned hash; - if (res->fi) { + unsigned int hash; + + if (res->fi) hopcount = res->fi->fib_nhs; - } - else { + else hopcount = 1; - } + lasthop = hopcount - 1; /* distinguish between multipath and singlepath */ - if ( hopcount < 2 ) - return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, + if (hopcount < 2) + return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos); - RTprint( KERN_DEBUG"%s: entered (hopcount: %d)\n", __FUNCTION__, - hopcount); - /* add all alternatives to the routing cache */ - for ( hop = 0; hop < hopcount; ++hop ) { + for (hop = 0; hop < hopcount; hop++) { res->nh_sel = hop; - RTprint( KERN_DEBUG"%s: entered (hopcount: %d)\n", - __FUNCTION__, hopcount); - /* 
create a routing cache entry */ - err = __mkroute_input( skb, res, in_dev, daddr, saddr, tos, - &rth ); - if ( err ) + err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, + &rth); + if (err) return err; - /* put it into the cache */ hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos); err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); - if ( err ) + if (err) return err; - + /* forward hop information to multipath impl. */ - multipath_set_nhinfo(FIB_RES_NETWORK(*res), + multipath_set_nhinfo(rth, + FIB_RES_NETWORK(*res), FIB_RES_NETMASK(*res), res->prefixlen, &FIB_RES_NH(*res)); - - /* only for the last hop the reference count is handled - outside */ - RTprint( KERN_DEBUG"%s: balanced entry created: %d\n", - __FUNCTION__, rth ); - if ( hop == lasthop ) + /* only for the last hop the reference count is handled + * outside + */ + if (hop == lasthop) atomic_set(&(skb->dst->__refcnt), 1); } return err; -#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ +#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos); -#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ +#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ } @@ -2175,8 +2165,11 @@ rth->u.dst.flags= DST_HOST; #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED - if (res->fi && res->fi->fib_nhs > 1) - rth->u.dst.flags |= DST_BALANCED; + if (res->fi) { + rth->rt_multipath_alg = res->fi->fib_mp_alg; + if (res->fi->fib_nhs > 1) + rth->u.dst.flags |= DST_BALANCED; + } #endif if (in_dev->cnf.no_xfrm) rth->u.dst.flags |= DST_NOXFRM; @@ -2274,17 +2267,13 @@ unsigned char hop; unsigned hash; int err = -EINVAL; - struct rtable* rth; + struct rtable *rth; - if (res->fi && res->fi->fib_nhs > 1) { + if (res->fi && res->fi->fib_nhs > 1) { unsigned char hopcount = res->fi->fib_nhs; - RTprint( KERN_DEBUG"%s: entered (hopcount: %d, fl->oif: %d)\n", - __FUNCTION__, hopcount, fl->oif); - for ( hop = 0; hop < hopcount; ++hop ) { + for (hop = 0; hop < hopcount; hop++) { struct 
net_device *dev2nexthop; - RTprint( KERN_DEBUG"%s: hop %d of %d\n", __FUNCTION__, - hop, hopcount ); res->nh_sel = hop; @@ -2292,49 +2281,34 @@ dev2nexthop = FIB_RES_DEV(*res); dev_hold(dev2nexthop); - err = __mkroute_output(&rth, res, fl, oldflp, + err = __mkroute_output(&rth, res, fl, oldflp, dev2nexthop, flags); - /** FIXME remove debug code */ - RTprint( "%s: balanced entry created: %d " \ - " (GW: %u)\n", - __FUNCTION__, - &rth->u.dst, - FIB_RES_GW(*res) ); - - if ( err != 0 ) { + if (err != 0) goto cleanup; - } - RTprint( KERN_DEBUG"%s: created successfully %d\n", - __FUNCTION__, hop ); - hash = rt_hash_code(oldflp->fl4_dst, - oldflp->fl4_src ^ + oldflp->fl4_src ^ (oldflp->oif << 5), tos); err = rt_intern_hash(hash, rth, rp); - RTprint( KERN_DEBUG"%s: hashed %d\n", - __FUNCTION__, hop ); /* forward hop information to multipath impl. */ - multipath_set_nhinfo(FIB_RES_NETWORK(*res), + multipath_set_nhinfo(rth, + FIB_RES_NETWORK(*res), FIB_RES_NETMASK(*res), res->prefixlen, &FIB_RES_NH(*res)); cleanup: /* release work reference to output device */ dev_put(dev2nexthop); - - if ( err != 0 ) { + + if (err != 0) return err; - } } - RTprint( "%s: exited loop\n", __FUNCTION__ ); atomic_set(&(*rp)->u.dst.__refcnt, 1); return err; - } - else { - return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, + } else { + return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags); } #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ @@ -2557,9 +2531,11 @@ #endif !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK))) { + /* check for multipath routes and choose one if - necessary */ - if (multipath_selectroute(flp, rth, rp)) { + * necessary + */ + if (multipath_select_route(flp, rth, rp)) { dst_hold(&(*rp)->u.dst); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); @@ -2639,6 +2615,13 @@ #ifdef CONFIG_NET_CLS_ROUTE if (rt->u.dst.tclassid) RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid); +#endif +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + if (rt->rt_multipath_alg != 
IP_MP_ALG_NONE) { + __u32 alg = rt->rt_multipath_alg; + + RTA_PUT(skb, RTA_MP_ALGO, 4, &alg); + } #endif if (rt->fl.iif) RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); - To unsubscribe from this list: send the line "unsubscribe bk-commits-head" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html