Hello,

I've discovered a bug in the bonding module of the Linux kernel which
appears only in bonding mode balance-alb.

Description:

    To reproduce, set up a box with at least two NICs and a bonding device
    enslaving them, assign at least two IPs to the bond, and generate some
    traffic from a different machine to one of those IPs.

    If you then delete that IP, the box will nevertheless keep sending ARP
    replies to the machine which communicated with that IP before it was
    removed.

    This comes from the rx_hashtbl and the receive load balancing algorithm.

    The bug is very serious if bonding is used in a cluster environment with
    two nodes which are connected to the same subnet. If an IP-bound service
    has to fail over to the other node, the old node would keep announcing
    its MAC address for the IP, which it no longer owns. Client traffic in
    the same net would therefore still hit the old node.
    
    A possible workaround is to use balance-tlb instead of balance-alb.

I've made a little patch which removes every entry from the rx_hashtbl when
the corresponding IP is removed from the bond. The patch was made against
Linux kernel version 2.6.19.

---8<---
diff -ur linux-2.6.19/drivers/net/bonding/bond_alb.c
linux/drivers/net/bonding/bond_alb.c
--- linux-2.6.19/drivers/net/bonding/bond_alb.c 2006-11-29
22:57:37.000000000 +0100
+++ linux/drivers/net/bonding/bond_alb.c        2007-01-16
17:23:53.000000000 +0100
@@ -1677,3 +1677,38 @@
        }
 }
 
+void bond_alb_remove_ip_from_rlb(struct bonding *bond, u32 ip) {
+       struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
+       u32 curr_index;
+
+       dprintk("%s: removing entries from rx_hashtbl for IP %lx\n",
bond->dev->name, ip);
+       _lock_rx_hashtbl(bond);
+
+       curr_index = bond_info->rx_hashtbl_head;
+       while (curr_index != RLB_NULL_INDEX) {
+               struct rlb_client_info *curr =
&(bond_info->rx_hashtbl[curr_index]);
+               u32 next_index = bond_info->rx_hashtbl[curr_index].next;
+               u32 prev_index = bond_info->rx_hashtbl[curr_index].prev;
+
+               if (curr->ip_src == ip) {
+                       dprintk("%s: entry %u matched\n", bond->dev->name,
curr_index);
+
+                       if (curr_index == bond_info->rx_hashtbl_head) {
+                               bond_info->rx_hashtbl_head = next_index;
+                       }
+                       if (prev_index != RLB_NULL_INDEX) {
+                               bond_info->rx_hashtbl[prev_index].next =
next_index;
+                       }
+                       if (next_index != RLB_NULL_INDEX) {
+                               bond_info->rx_hashtbl[next_index].prev =
prev_index;
+                       }
+
+                       rlb_init_table_entry(curr);
+               }
+
+               curr_index = next_index;
+       }
+
+       _unlock_rx_hashtbl(bond);
+}
+
diff -ur linux-2.6.19/drivers/net/bonding/bond_alb.h
linux/drivers/net/bonding/bond_alb.h
--- linux-2.6.19/drivers/net/bonding/bond_alb.h 2006-11-29
22:57:37.000000000 +0100
+++ linux/drivers/net/bonding/bond_alb.h        2007-01-16
17:23:53.000000000 +0100
@@ -128,5 +128,6 @@
 void bond_alb_monitor(struct bonding *bond);
 int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
+void bond_alb_remove_ip_from_rlb(struct bonding *bond, u32 ip);
 #endif /* __BOND_ALB_H__ */
 
diff -ur linux-2.6.19/drivers/net/bonding/bond_main.c
linux/drivers/net/bonding/bond_main.c
--- linux-2.6.19/drivers/net/bonding/bond_main.c        2006-11-29
22:57:37.000000000 +0100
+++ linux/drivers/net/bonding/bond_main.c       2007-01-16
17:30:49.000000000 +0100
@@ -3356,6 +3356,12 @@
                                return NOTIFY_OK;
                        case NETDEV_DOWN:
                                bond->master_ip =
bond_glean_dev_ip(bond->dev);
+
+                               /* remove IP from RLB hashtable if using
balance-alb mode: */
+                               if (bond->params.mode == BOND_MODE_ALB) {
+                                       bond_alb_remove_ip_from_rlb(bond,
ifa->ifa_local);
+                               }
+
                                return NOTIFY_OK;
                        default:
                                return NOTIFY_DONE;
---8<---

The function bond_alb_remove_ip_from_rlb is heavily based on the function
rlb_clear_vlan.

And here's a useful patch for debugging purposes (it prints the rx_hashtbl
in the bond's proc file):

---8<---
diff -ur linux-2.6.19/drivers/net/bonding/bond_alb.c
linux/drivers/net/bonding/bond_alb.c
--- linux-2.6.19/drivers/net/bonding/bond_alb.c 2007-01-16
18:59:32.000000000 +0100
+++ linux/drivers/net/bonding/bond_alb.c        2007-01-16
18:48:15.000000000 +0100
@@ -26,6 +26,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/pkt_sched.h>
+#include <linux/seq_file.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/timer.h>
@@ -1677,6 +1678,45 @@
        }
 }
 
+void bond_alb_info_show(struct seq_file *seq) {
+       struct bonding *bond = seq->private;
+       struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
+       struct rlb_client_info *rx_hash_table;
+       u32 index;
+       u32 src, dst;
+
+       seq_puts(seq, "\nALB info\n\n");
+       seq_puts(seq, "    Receive Load Balancing table:\n\n");
+       seq_puts(seq, "    Index Slave    Server          Client
Client-MAC        Asgnd\n");
+
+       _lock_rx_hashtbl(bond);
+
+       rx_hash_table = bond_info->rx_hashtbl;
+
+       if (rx_hash_table != NULL) {
+               for (index = bond_info->rx_hashtbl_head; 
+                               index != RLB_NULL_INDEX; 
+                               index = rx_hash_table[index].next) {
+                       src = ntohl(rx_hash_table[index].ip_src);
+                       dst = ntohl(rx_hash_table[index].ip_dst);
+
+                       seq_printf(seq,        
+                                       "    %03u   %8s %03u.%03u.%03u.%03u
%03u.%03u.%03u.%03u %02x:%02x:%02x:%02x:%02x:%02x %3s\n",
+                                       index,
+                                       (rx_hash_table[index].slave != NULL
? rx_hash_table[index].slave->dev->name : "none"),
+                                       ((src >> 24) & 0xff), ((src >> 16) &
0xff), ((src >> 8) & 0xff), (src & 0xff), 
+                                       ((dst >> 24) & 0xff), ((dst >> 16) &
0xff), ((dst >> 8) & 0xff), (dst & 0xff), 
+                                       rx_hash_table[index].mac_dst[0],
rx_hash_table[index].mac_dst[1],
+                                       rx_hash_table[index].mac_dst[2],
rx_hash_table[index].mac_dst[3],
+                                       rx_hash_table[index].mac_dst[4],
rx_hash_table[index].mac_dst[5],
+                                       (rx_hash_table[index].assigned ?
"yes" : "no")
+                       );
+               }
+       }       
+
+       _unlock_rx_hashtbl(bond);
+}
+
 void bond_alb_remove_ip_from_rlb(struct bonding *bond, u32 ip) {
        struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
        u32 curr_index;
diff -ur linux-2.6.19/drivers/net/bonding/bond_alb.h
linux/drivers/net/bonding/bond_alb.h
--- linux-2.6.19/drivers/net/bonding/bond_alb.h 2007-01-16
18:59:32.000000000 +0100
+++ linux/drivers/net/bonding/bond_alb.h        2007-01-16
19:01:46.000000000 +0100
@@ -128,6 +128,7 @@
 void bond_alb_monitor(struct bonding *bond);
 int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
 void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
+void bond_alb_info_show(struct seq_file *seq);
 void bond_alb_remove_ip_from_rlb(struct bonding *bond, u32 ip);
 #endif /* __BOND_ALB_H__ */
 
diff -ur linux-2.6.19/drivers/net/bonding/bond_main.c
linux/drivers/net/bonding/bond_main.c
--- linux-2.6.19/drivers/net/bonding/bond_main.c        2007-01-16
18:59:32.000000000 +0100
+++ linux/drivers/net/bonding/bond_main.c       2007-01-16
18:48:15.000000000 +0100
@@ -3048,6 +3048,10 @@
                                   ad_info.partner_system[5]);
                }
        }
+       else
+       if (bond->params.mode == BOND_MODE_ALB) {
+               bond_alb_info_show(seq);
+       }
 }
 
 static void bond_info_show_slave(struct seq_file *seq, const struct slave
*slave)
---8<---

The following example illustrates the bug. The box is named 'linux' (it has
the two IPs 10.0.91.128 and 10.0.91.129) and the other machine (which
generates some traffic) is called 'dave'. Their clocks are synchronized via
NTP.

---8<---
linux:~ # modprobe bonding miimon=100 updelay=200 mode=balance-alb
use_carrier=0
linux:~ # ifconfig bond0 10.0.91.128 netmask 255.255.255.0 up
linux:~ # ifenslave bond0 eth1
linux:~ # ifenslave bond0 eth2
linux:~ # ip addr add 10.0.91.129 dev bond0
linux:~ # ip addr sh bond0
18: bond0: <BROADCAST,MULTICAST,MASTER,UP> mtu 1500 qdisc noqueue 
    link/ether 00:02:b3:55:2e:b1 brd ff:ff:ff:ff:ff:ff
    inet 10.0.91.128/24 brd 10.255.255.255 scope global bond0
    inet 10.0.91.129/32 scope global bond0
    inet6 fe80::200:ff:fe00:0/64 scope link 
       valid_lft forever preferred_lft forever
---

dave:~ # ping 10.0.91.129
PING 10.0.91.129 (10.0.91.129) 56(84) bytes of data.
64 bytes from 10.0.91.129: icmp_seq=1 ttl=64 time=3.83 ms
64 bytes from 10.0.91.129: icmp_seq=2 ttl=64 time=0.205 ms
[...]
dave:~ # tcpdump -i bond0 arp host 10.0.91.129
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on bond0, link-type EN10MB (Ethernet), capture size 96 bytes
11:55:41.829735 arp reply 10.0.91.129 is-at 00:02:b3:55:2e:b1 (oui Unknown)
11:55:41.830993 arp reply 10.0.91.129 is-at 00:02:b3:55:2e:b1 (oui Unknown)
11:55:44.047261 arp reply 10.0.91.129 is-at 00:02:b3:55:2e:b1 (oui Unknown)
11:55:44.047276 arp reply 10.0.91.129 is-at 00:02:b3:55:2e:b1 (oui Unknown)
[...]

---

linux:~ # ip addr del 10.0.91.129 dev bond0
linux:~ # ip addr sh bond0
18: bond0: <BROADCAST,MULTICAST,MASTER,UP> mtu 1500 qdisc noqueue 
    link/ether 00:02:b3:55:2e:b1 brd ff:ff:ff:ff:ff:ff
    inet 10.0.91.128/24 brd 10.255.255.255 scope global bond0
    inet6 fe80::200:ff:fe00:0/64 scope link 
       valid_lft forever preferred_lft forever
linux:~ # date
Tue Jan 16 11:55:57 CET 2007

---

dave:~ # date
Tue Jan 16 11:56:59 CET 2007
dave:~ # tcpdump -i bond0 arp host 10.0.91.129
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on bond0, link-type EN10MB (Ethernet), capture size 96 bytes
11:57:04.305078 arp reply 10.0.91.129 is-at 00:02:b3:55:2e:b1 (oui Unknown)
11:57:04.306248 arp reply 10.0.91.129 is-at 00:02:b3:55:2e:b1 (oui Unknown)
11:57:06.704552 arp reply 10.0.91.129 is-at 00:02:b3:55:2e:b1 (oui Unknown)
11:57:06.704569 arp reply 10.0.91.129 is-at 00:02:b3:55:2e:b1 (oui Unknown)
[...]
---8<---


Bye
Christian Jung

PS: I'm sorry, but I have to use a mailer which has some handicaps. If the
whitespace of the patches is munged in any way, I can send you the patches
as attachments.

Another thing: when a bond is shut down (e.g. ifconfig bond0 0.0.0.0 down),
the slaves keep the master IP address of the bond. Is there a special reason
for this behaviour?

phone: +49 6898/10-4987
fax: +49 6898/10-54987
http://www.saarstahl.de
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to