[patch 06/20] sky2: check drop truncated packets
-stable review patch. If anyone has any objections, please let us know. -- From: Stephen Hemminger <[EMAIL PROTECTED]> Backport of commit 71749531f2d1954137a1a77422ef4ff29eb102dd If packet larger than MTU is received, the driver uses hardware to truncate the packet. Use the status registers to catch/drop them. Signed-off-by: Stephen Hemminger <[EMAIL PROTECTED]> Signed-off-by: Greg Kroah-Hartman <[EMAIL PROTECTED]> --- drivers/net/sky2.c |8 1 file changed, 8 insertions(+) --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -2065,6 +2065,9 @@ static struct sk_buff *sky2_receive(stru if (!(status & GMR_FS_RX_OK)) goto resubmit; + if (status >> 16 != length) + goto len_mismatch; + if (length < copybreak) skb = receive_copy(sky2, re, length); else @@ -2074,6 +2077,11 @@ resubmit: return skb; +len_mismatch: + /* Truncation of overlength packets + causes PHY length to not match MAC length */ + ++sky2->net_stats.rx_length_errors; + error: ++sky2->net_stats.rx_errors; if (status & GMR_FS_RX_FF_OV) { -- - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 05/20] sky2: check for more work before leaving NAPI
-stable review patch. If anyone has any objections, please let us know. -- From: Stephen Hemminger <[EMAIL PROTECTED]> Backport of commit 5c11ce700f77fada15b6264417d72462da4bbb1c This patch avoids generating another IRQ if more packets arrive while in the NAPI poll routine. Before marking device as finished, it rechecks that the status ring is empty. Signed-off-by: Stephen Hemminger <[EMAIL PROTECTED]> Signed-off-by: Greg Kroah-Hartman <[EMAIL PROTECTED]> --- drivers/net/sky2.c | 35 +-- 1 file changed, 17 insertions(+), 18 deletions(-) --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -2428,8 +2428,7 @@ static void sky2_err_intr(struct sky2_hw static int sky2_poll(struct net_device *dev0, int *budget) { struct sky2_hw *hw = ((struct sky2_port *) netdev_priv(dev0))->hw; - int work_limit = min(dev0->quota, *budget); - int work_done = 0; + int work_done; u32 status = sky2_read32(hw, B0_Y2_SP_EISR); if (unlikely(status & Y2_IS_ERROR)) @@ -2441,25 +2440,25 @@ static int sky2_poll(struct net_device * if (status & Y2_IS_IRQ_PHY2) sky2_phy_intr(hw, 1); - work_done = sky2_status_intr(hw, work_limit); - if (work_done < work_limit) { - /* Bug/Errata workaround? -* Need to kick the TX irq moderation timer. -*/ - if (sky2_read8(hw, STAT_TX_TIMER_CTRL) == TIM_START) { - sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP); - sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START); - } - netif_rx_complete(dev0); + work_done = sky2_status_intr(hw, min(dev0->quota, *budget)); + *budget -= work_done; + dev0->quota -= work_done; - /* end of interrupt, re-enables also acts as I/O synchronization */ - sky2_read32(hw, B0_Y2_SP_LISR); - return 0; - } else { - *budget -= work_done; - dev0->quota -= work_done; + /* More work? */ + if (hw->st_idx != sky2_read16(hw, STAT_PUT_IDX)) return 1; + + /* Bug/Errata workaround? +* Need to kick the TX irq moderation timer. 
+*/ + if (sky2_read8(hw, STAT_TX_TIMER_CTRL) == TIM_START) { + sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP); + sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START); } + netif_rx_complete(dev0); + + sky2_read32(hw, B0_Y2_SP_LISR); + return 0; } static irqreturn_t sky2_intr(int irq, void *dev_id) -- - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 04/20] sky2: carrier management
-stable review patch. If anyone has any objections, please let us know. -- From: Stephen Hemminger <[EMAIL PROTECTED]> backport of commit 55d7b4e6ed6ad3ec5e5e30b3b4515a0a6a53e344 Make sky2 handle carrier similar to other drivers, eliminate some possible races in carrier state transistions. Signed-off-by: Stephen Hemminger <[EMAIL PROTECTED]> Signed-off-by: Greg Kroah-Hartman <[EMAIL PROTECTED]> --- drivers/net/sky2.c | 11 --- 1 file changed, 4 insertions(+), 7 deletions(-) --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -1234,6 +1234,8 @@ static int sky2_up(struct net_device *de if (netif_msg_ifup(sky2)) printk(KERN_INFO PFX "%s: enabling interface\n", dev->name); + netif_carrier_off(dev); + /* must be power of 2 */ sky2->tx_le = pci_alloc_consistent(hw->pdev, TX_RING_SIZE * @@ -1573,7 +1575,6 @@ static int sky2_down(struct net_device * /* Stop more packets from being queued */ netif_stop_queue(dev); - netif_carrier_off(dev); /* Disable port IRQ */ imask = sky2_read32(hw, B0_IMSK); @@ -1625,6 +1626,8 @@ static int sky2_down(struct net_device * sky2_phy_power(hw, port, 0); + netif_carrier_off(dev); + /* turn off LED's */ sky2_write16(hw, B0_Y2LED, LED_STAT_OFF); @@ -1689,7 +1692,6 @@ static void sky2_link_up(struct sky2_por gm_phy_write(hw, port, PHY_MARV_INT_MASK, PHY_M_DEF_MSK); netif_carrier_on(sky2->netdev); - netif_wake_queue(sky2->netdev); /* Turn on link LED */ sky2_write8(hw, SK_REG(port, LNK_LED_REG), @@ -1741,7 +1743,6 @@ static void sky2_link_down(struct sky2_p gma_write16(hw, port, GM_GP_CTRL, reg); netif_carrier_off(sky2->netdev); - netif_stop_queue(sky2->netdev); /* Turn on link LED */ sky2_write8(hw, SK_REG(port, LNK_LED_REG), LINKLED_OFF); @@ -3493,10 +3494,6 @@ static __devinit struct net_device *sky2 memcpy_fromio(dev->dev_addr, hw->regs + B2_MAC_1 + port * 8, ETH_ALEN); memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); - /* device is off until link detection */ - netif_carrier_off(dev); - netif_stop_queue(dev); - return dev; } -- - To 
unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 03/20] sky2: restore workarounds for lost interrupts
-stable review patch. If anyone has any objections, please let us know. -- From: Stephen Hemminger <[EMAIL PROTECTED]> Backport of commit c59697e06058fc2361da8cefcfa3de85ac107582 This patch restores a couple of workarounds from 2.6.16: * restart transmit moderation timer in case it expires during IRQ routine * default to having 10 HZ watchdog timer. At this point it more important not to hang than to worry about the power cost. Signed-off-by: Stephen Hemminger <[EMAIL PROTECTED]> Signed-off-by: Greg Kroah-Hartman <[EMAIL PROTECTED]> --- drivers/net/sky2.c |9 - 1 file changed, 8 insertions(+), 1 deletion(-) --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -96,7 +96,7 @@ static int disable_msi = 0; module_param(disable_msi, int, 0); MODULE_PARM_DESC(disable_msi, "Disable Message Signaled Interrupt (MSI)"); -static int idle_timeout = 0; +static int idle_timeout = 100; module_param(idle_timeout, int, 0); MODULE_PARM_DESC(idle_timeout, "Watchdog timer for lost interrupts (ms)"); @@ -2442,6 +2442,13 @@ static int sky2_poll(struct net_device * work_done = sky2_status_intr(hw, work_limit); if (work_done < work_limit) { + /* Bug/Errata workaround? +* Need to kick the TX irq moderation timer. +*/ + if (sky2_read8(hw, STAT_TX_TIMER_CTRL) == TIM_START) { + sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP); + sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START); + } netif_rx_complete(dev0); /* end of interrupt, re-enables also acts as I/O synchronization */ -- - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [NET]: Share correct feature code between bridging and bonding
Hi: Here's the back-port for 2.6.22. [NET]: Share correct feature code between bridging and bonding http://bugzilla.kernel.org/show_bug.cgi?id=8797 shows that the bonding driver may produce bogus combinations of the checksum flags and SG/TSO. For example, if you bond devices with NETIF_F_HW_CSUM and NETIF_F_IP_CSUM you'll end up with a bonding device that has neither flag set. If both have TSO then this produces an illegal combination. The bridge device on the other hand has the correct code to deal with this. In fact, the same code can be used for both. So this patch moves that logic into net/core/dev.c and uses it for both bonding and bridging. In the process I've made small adjustments such as only setting GSO_ROBUST if at least one constituent device supports it. Signed-off-by: Herbert Xu <[EMAIL PROTECTED]> Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 6287ffb..0af7bc8 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1233,43 +1233,31 @@ int bond_sethwaddr(struct net_device *bond_dev, struct net_device *slave_dev) return 0; } -#define BOND_INTERSECT_FEATURES \ - (NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_TSO | NETIF_F_UFO) +#define BOND_VLAN_FEATURES \ + (NETIF_F_VLAN_CHALLENGED | NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_TX | \ +NETIF_F_HW_VLAN_FILTER) /* * Compute the common dev->feature set available to all slaves. Some - * feature bits are managed elsewhere, so preserve feature bits set on - * master device that are not part of the examined set. + * feature bits are managed elsewhere, so preserve those feature bits + * on the master device. 
*/ static int bond_compute_features(struct bonding *bond) { - unsigned long features = BOND_INTERSECT_FEATURES; struct slave *slave; struct net_device *bond_dev = bond->dev; + unsigned long features = bond_dev->features & ~BOND_VLAN_FEATURES; unsigned short max_hard_header_len = ETH_HLEN; int i; bond_for_each_slave(bond, slave, i) { - features &= (slave->dev->features & BOND_INTERSECT_FEATURES); + features = netdev_compute_features(features, + slave->dev->features); if (slave->dev->hard_header_len > max_hard_header_len) max_hard_header_len = slave->dev->hard_header_len; } - if ((features & NETIF_F_SG) && - !(features & NETIF_F_ALL_CSUM)) - features &= ~NETIF_F_SG; - - /* -* features will include NETIF_F_TSO (NETIF_F_UFO) iff all -* slave devices support NETIF_F_TSO (NETIF_F_UFO), which -* implies that all slaves also support scatter-gather -* (NETIF_F_SG), which implies that features also includes -* NETIF_F_SG. So no need to check whether we have an -* illegal combination of NETIF_F_{TSO,UFO} and -* !NETIF_F_SG -*/ - - features |= (bond_dev->features & ~BOND_INTERSECT_FEATURES); + features |= (bond_dev->features & BOND_VLAN_FEATURES); bond_dev->features = features; bond_dev->hard_header_len = max_hard_header_len; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3a70f55..ab210be 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1032,6 +1032,8 @@ extern void dev_seq_stop(struct seq_file *seq, void *v); extern void linkwatch_run_queue(void); +extern int netdev_compute_features(unsigned long all, unsigned long one); + static inline int net_gso_ok(int features, int gso_type) { int feature = gso_type << NETIF_F_GSO_SHIFT; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 5e1892d..c326602 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -179,5 +179,6 @@ void br_dev_setup(struct net_device *dev) dev->priv_flags = IFF_EBRIDGE; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA 
| - NETIF_F_TSO | NETIF_F_NO_CSUM | NETIF_F_GSO_ROBUST; + NETIF_F_GSO_SOFTWARE | NETIF_F_NO_CSUM | + NETIF_F_GSO_ROBUST | NETIF_F_LLTX; } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 849deaf..fefd7c1 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -360,35 +360,15 @@ int br_min_mtu(const struct net_bridge *br) void br_features_recompute(struct net_bridge *br) { struct net_bridge_port *p; - unsigned long features, checksum; + unsigned long features; - checksum = br->feature_mask & NETIF_F_ALL_CSUM ? NETIF_F_NO_CSUM : 0; - features = br->feature_mask & ~NETIF_F_ALL_CSUM; + features = br->feature_
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Mon, 20 Aug 2007, Chris Snook wrote: > > What about barrier removal? With consistent semantics we could optimize a > fair amount of code. Whether or not that constitutes "premature" optimization > is open to debate, but there's no question we could reduce our register wiping > in some places. Why do people think that barriers are expensive? They really aren't. Especially the regular compiler barrier is basically zero cost. Any reasonable compiler will just flush the stuff it holds in registers that isn't already automatic local variables, and for regular kernel code, that tends to basically be nothing at all. Ie a "barrier()" is likely _cheaper_ than the code generation downside from using "volatile". Linus - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCPportsfrom the host TCP port space.
> -Original Message- > From: Patrick Geoffray [mailto:[EMAIL PROTECTED] > Sent: Monday, August 20, 2007 1:34 PM > To: Felix Marti > Cc: Evgeniy Polyakov; David Miller; [EMAIL PROTECTED]; > netdev@vger.kernel.org; [EMAIL PROTECTED]; > [EMAIL PROTECTED]; [EMAIL PROTECTED]; > [EMAIL PROTECTED] > Subject: Re: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate > PS_TCPportsfrom the host TCP port space. > > Felix Marti wrote: > > Yes, the app will take the cache hits when accessing the data. > However, > > the fact remains that if there is a copy in the receive path, you > > require and additional 3x memory BW (which is very significant at > these > > high rates and most likely the bottleneck for most current > systems)... > > and somebody always has to take the cache miss be it the copy_to_user > or > > the app. > > The cache miss is going to cost you half the memory bandwidth of a full > copy. If the data is already in cache, then the copy is cheaper. > > However, removing the copy removes the kernel from the picture on the > receive side, so you lose demultiplexing, asynchronism, security, > accounting, flow-control, swapping, etc. If it's ok with you to not use > the kernel stack, then why expect to fit in the existing infrastructure > anyway ? Many of the things you're referring to are moved to the offload adapter but from an ease of use point of view, it would be great if the user could still collect stats the same way, i.e. netstat reports the 4-tuple in use and other network stats. In addition, security features and packet scheduling could be integrated so that the user configures them the same way as the network stack. > > > Yes, RDMA support is there... but we could make it better and easier > to > > What do you need from the kernel for RDMA support beyond HW drivers ? A > fast way to pin and translate user memory (ie registration). That is > pretty much the sandbox that David referred to. 
> > Eventually, it would be useful to be able to track the VM space to > implement a registration cache instead of using ugly hacks in user- > space > to hijack malloc, but this is completely independent from the net > stack. > > > use. We have a problem today with port sharing and there was a > proposal > > The port spaces are either totally separate and there is no issue, or > completely identical and you should then run your connection manager in > user-space or fix your middlewares. When running on an iWarp device (and hence on top of TCP) I believe that the port space should shared and i.e. netstat reports the 4-tuple in use. > > > and not for technical reasons. I believe this email threads shows in > > detail how RDMA (a network technology) is treated as bastard child by > > the network folks, well at least by one of them. > > I don't think it's fair. This thread actually show how pushy some RDMA > folks are about not acknowledging that the current infrastructure is > here for a reason, and about mistaking zero-copy and RDMA. Zero-copy and RDMA are not the same but in the context of this discussion I referred to RDMA as a superset (zero-copy is implied). > > This is a similar argument than the TOE discussion, and it was > definitively a good decision to not mess up the Linux stack with TOEs. > > Patrick - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/4 - rev2] Add new timeval_to_sec function
Stephen Hemminger wrote: > On Mon, 20 Aug 2007 13:45:36 +0530 > Varun Chandramohan <[EMAIL PROTECTED]> wrote: > > >> A new function for converting timeval to time_t is added in time.h. Its a >> common function used in different >> places. >> >> Signed-off-by: Varun Chandramohan <[EMAIL PROTECTED]> >> --- >> include/linux/time.h | 12 >> 1 files changed, 12 insertions(+), 0 deletions(-) >> >> diff --git a/include/linux/time.h b/include/linux/time.h >> index 6a5f503..1faf65c 100644 >> --- a/include/linux/time.h >> +++ b/include/linux/time.h >> @@ -149,6 +149,18 @@ static inline s64 timeval_to_ns(const st >> } >> >> /** >> + * timeval_to_sec - Convert timeval to seconds >> + * @tv: pointer to the timeval variable to be converted >> + * >> + * Returns the seconds representation of timeval parameter. >> + * Note : Here we round up the value. We dont need accuracy. >> + */ >> +static inline time_t timeval_to_sec(const struct timeval *tv) >> +{ >> +return (tv->tv_sec + (tv->tv_usec ? 1 : 0)); >> +} >> + >> > > Why roundup? Unless there is a requirement in the standard, please just > use the timeval seconds. In which case the inline is unneeded. > > > Thanks for the reply stephen. As you might be aware, this discussion took place sometime ago when i posted my first patch set. Initially it was like this: return (tv->tv_sec + (tv->tv_usec + 500000)/1000000); Then i got some comments from patrick and oliver. They wanted me to round it up. So what about rounding up with return (tv->tv_sec + (tv->tv_usec + 999999)/1000000); Then on second revision the above was changed to return tv->tv_sec + (tv->tv_usec ? 1 : 0); as it would be much faster. Since the timeval is meant for stats purpose we decided not to really bother about accuracy. My initial patch actually took only sec value into account, but i was advised to round up usec to give a better o/p. Is that ok??? Or you still think we should consider only secs? 
Regards, Varun - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCP ports from the host TCP port space.
[TSO / LRO discussion snipped -- it's not the main point so no sense spending energy arguing about it] > Just be realistic and accept that RDMA is a point in time solution, > and like any other such technology takes flexibility away from users. > > Horizontal scaling of cpus up to huge arity cores, network devices > using large numbers of transmit and receive queues and classification > based queue selection, are all going to work to make things like RDMA > even more irrelevant than they already are. To me there is a real fundamental difference between RDMA and traditional SOCK_STREAM / SOCK_DATAGRAM networking, namely that messages can carry the address where they're supposed to be delivered (what the IETF calls "direct data placement"). And on top of that you can build one-sided operations aka put/get aka RDMA. And direct data placement really does give you a factor of two at least, because otherwise you're stuck receiving the data in one buffer, looking at some of the data at least, and then figuring out where to copy it. And memory bandwidth is if anything becoming more valuable; maybe LRO + header splitting + page remapping tricks can get you somewhere but as NCPUS grows then it seems the TLB shootdown cost of page flipping is only going to get worse. Don't get too hung up on the fact that current iWARP (RDMA over IP) implementations are using TCP offload -- to me that is just a side effect of doing enough processing on the NIC side of the PCI bus to be able to do direct data placement. InfiniBand with completely different transport, link and physical layers is one way to implement RDMA without TCP offload and I'm sure there will be others -- eg Intel's IOAT stuff could probably evolve to the point where you could implement iWARP with software TCP and the data placement offloaded to some DMA engine. - R. 
- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Tue, Aug 21, 2007 at 01:02:01AM +0200, Segher Boessenkool wrote: > >>And no, RMW on MMIO isn't "problematic" at all, either. > >> > >>An RMW op is a read op, a modify op, and a write op, all rolled > >>into one opcode. But three actual operations. > > > >Maybe for some CPUs, but not all. ARM for instance can't use the > >load exclusive and store exclusive instructions to MMIO space. > > Sure, your CPU doesn't have RMW instructions -- how to emulate > those if you don't have them is a totally different thing. I thought that ARM's load exclusive and store exclusive instructions were its equivalent of LL and SC, which RISC machines typically use to build atomic sequences of instructions -- and which normally cannot be applied to MMIO space. Thanx, Paul - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] DM9000: fix interface hang under load
When transferring data at full speed, the DM9000 network interface sometimes stops sending/receiving data. Worse, ksoftirqd consumes 100% cpu and the net tx watchdog never triggers. Fix by spin_lock_irqsave() in dm9000_start_xmit() to prevent the interrupt handler from interfering. Signed-off-by: Florian Westphal <[EMAIL PROTECTED]> --- Actually the comments ('Disable all interrupts, iow(db, DM9000_IMR, IMR_PAR) etc) give the impression that the interrupt handler cannot run during dm9000_start_xmit(), however this isn't correct (perhaps the chipset has some weird timing issues?). The interface lockup usually occurs between 30 and 360 seconds after starting transmitting data (netcat /dev/zero) at full speed; with this patch applied I haven't been able to reproduce hangs yet (ran for > 2h). FTR: This is a dm9000 on XScale-PXA255 rev 6 (ARMv5TE)/Compulab CM-x255, i.e. a module not supported by the vanilla kernel. Tested on (patched) 2.6.18. dm9000.c | 25 +++-- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/drivers/net/dm9000.c b/drivers/net/dm9000.c index c3de81b..738aa59 100644 --- a/drivers/net/dm9000.c +++ b/drivers/net/dm9000.c @@ -700,6 +700,7 @@ dm9000_init_dm9000(struct net_device *dev) static int dm9000_start_xmit(struct sk_buff *skb, struct net_device *dev) { + unsigned long flags; board_info_t *db = (board_info_t *) dev->priv; PRINTK3("dm9000_start_xmit\n"); @@ -707,10 +708,7 @@ dm9000_start_xmit(struct sk_buff *skb, struct net_device *dev) if (db->tx_pkt_cnt > 1) return 1; - netif_stop_queue(dev); - - /* Disable all interrupts */ - iow(db, DM9000_IMR, IMR_PAR); + spin_lock_irqsave(&db->lock, flags); /* Move data to DM9000 TX RAM */ writeb(DM9000_MWCMD, db->io_addr); @@ -718,12 +716,9 @@ dm9000_start_xmit(struct sk_buff *skb, struct net_device *dev) (db->outblk)(db->io_data, skb->data, skb->len); db->stats.tx_bytes += skb->len; + db->tx_pkt_cnt++; /* TX control: First packet immediately send, second packet queue */ - if (db->tx_pkt_cnt == 0) { 
- - /* First Packet */ - db->tx_pkt_cnt++; - + if (db->tx_pkt_cnt == 1) { /* Set TX length to DM9000 */ iow(db, DM9000_TXPLL, skb->len & 0xff); iow(db, DM9000_TXPLH, (skb->len >> 8) & 0xff); @@ -732,23 +727,17 @@ dm9000_start_xmit(struct sk_buff *skb, struct net_device *dev) iow(db, DM9000_TCR, TCR_TXREQ); /* Cleared after TX complete */ dev->trans_start = jiffies; /* save the time stamp */ - } else { /* Second packet */ - db->tx_pkt_cnt++; db->queue_pkt_len = skb->len; + netif_stop_queue(dev); } + spin_unlock_irqrestore(&db->lock, flags); + /* free this SKB */ dev_kfree_skb(skb); - /* Re-enable resource check */ - if (db->tx_pkt_cnt == 1) - netif_wake_queue(dev); - - /* Re-enable interrupt */ - iow(db, DM9000_IMR, IMR_PAR | IMR_PTM | IMR_PRM); - return 0; } - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] spidernet: enable poll() before registering interrupts
On Tuesday 21 August 2007, Linas Vepstas wrote: > > An intervening patch changed the init so that the > hardware interrupts aren't enabled until after the > request_irq, and after the poll_enable(). Thus, > it seems this patch is no longer needed, right? Right, the other patch that you already applied is a better fix. Arnd <>< - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
And no, RMW on MMIO isn't "problematic" at all, either. An RMW op is a read op, a modify op, and a write op, all rolled into one opcode. But three actual operations. Maybe for some CPUs, but not all. ARM for instance can't use the load exclusive and store exclusive instructions to MMIO space. Sure, your CPU doesn't have RMW instructions -- how to emulate those if you don't have them is a totally different thing. Segher - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Tue, Aug 21, 2007 at 12:04:17AM +0200, Segher Boessenkool wrote: > And no, RMW on MMIO isn't "problematic" at all, either. > > An RMW op is a read op, a modify op, and a write op, all rolled > into one opcode. But three actual operations. Maybe for some CPUs, but not all. ARM for instance can't use the load exclusive and store exclusive instructions to MMIO space. This means placing atomic_t or bitops into MMIO space is a definite no-go on ARM. It breaks. -- Russell King Linux kernel2.6 ARM Linux - http://www.arm.linux.org.uk/ maintainer of: - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] spidernet: fix interrupt reason recognition
On Mon, Aug 20, 2007 at 10:13:27PM +0900, Ishizaki Kou wrote: > Please apply this to 2.6.23. I'll review and forward shortly. Kick me if you don't see a formal reply in a few days. > And also, please apply the following Arnd-san's patch to fix a problem > that spidernet driver sometimes causes a BUG_ON at open. > > http://patchwork.ozlabs.org/cbe-oss-dev/patch?id=12211 Are you sure? This patch no longer applies cleanly, in part because your patch "[PATCH] spidernet: improve interrupt handling" from Mon, 09 Jul 2007 added a spider_net_enable_interrupts(card); at the end of spider_net_open(). Because of this, it seems like Arnd's patch is no longer needed, right? --linas - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] spidernet: enable poll() before registering interrupts
On Thu, Jul 12, 2007 at 01:19:11AM +0200, Arnd Bergmann wrote: > We must not call netif_poll_enable after enabling interrupts, > because an interrupt might come in and set the __LINK_STATE_RX_SCHED > bit before we get to clear that bit again. If that happens, > the next call to the ->poll() function will oops. > > Signed-off-by: Arnd Bergmann <[EMAIL PROTECTED]> > --- > This was found during testing with the fedora kernel, > with all patches from netdev-2.6.git applied. > > It may not be the right fix, but this is currently the > only way I can get that kernel to boot. > > One part I don't understand at the moment is that Christian > Krafft reported the same problem with tg3, but that driver > has all interrupts disabled at the device while calling > the request_irq() function, which seems to be the best > solution for avoiding the bug in the first place. It appears that this patch does not apply cleanly any more, and I think that's a good thing! An intervening patch changed the init so that the hardware interrupts aren't enabled until after the request_irq, and after the poll_enable(). Thus, it seems this patch is no longer needed, right? I'll pursue with Kou Ishizaki, who pointed out that I'd missed your email. --linas - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] spidernet: enable poll() before registering interrupts
On Thu, Jul 12, 2007 at 01:19:11AM +0200, Arnd Bergmann wrote: > Index: linux-2.6/drivers/net/spider_net.c Sorry, this one got lost in my mailbox. Will attend to it shortly. --linas - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Such code generally doesn't care precisely when it gets the update, just that the update is atomic, and it doesn't loop forever. Yes, it _does_ care that it gets the update _at all_, and preferably as early as possible. Regardless, I'm convinced we just need to do it all in assembly. So do you want "volatile asm" or "plain asm", for atomic_read()? The asm version has two ways to go about it too... Segher - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Right. ROTFL... volatile actually breaks atomic_t instead of making it safe. x++ becomes a register load, increment and a register store. Without volatile we can increment the memory directly. It seems that volatile requires that the variable is loaded into a register first and then operated upon. Understandable when you think about volatile being used to access memory mapped I/O registers where a RMW operation could be problematic. So, if we want consistent behavior, we're pretty much screwed unless we use inline assembler everywhere? Nah, this whole argument is flawed -- "without volatile" we still *cannot* "increment the memory directly". On x86, you need a lock prefix; on other archs, some other mechanism to make the memory increment an *atomic* memory increment. And no, RMW on MMIO isn't "problematic" at all, either. An RMW op is a read op, a modify op, and a write op, all rolled into one opcode. But three actual operations. The advantages of asm code for atomic_{read,set} are: 1) all the other atomic ops are implemented that way already; 2) you have full control over the asm insns selected, in particular, you can guarantee you *do* get an atomic op; 3) you don't need to use "volatile " which generates not-all-that-good code on archs like x86, and we want to get rid of it anyway since it is problematic in many ways; 4) you don't need to use *(volatile *)&, which a) doesn't exist in C; b) isn't documented or supported in GCC; c) has a recent history of bugginess; d) _still uses volatile objects_; e) _still_ is problematic in almost all those same ways as in 3); 5) you can mix atomic and non-atomic accesses to the atomic_t, which you cannot with the other alternatives. The only disadvantage I know of is potentially slightly worse instruction scheduling. This is a generic asm() problem: GCC cannot see what actual insns are inside the asm() block. 
Segher - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCPportsfrom the host TCP port space.
Felix Marti wrote: Yes, the app will take the cache hits when accessing the data. However, the fact remains that if there is a copy in the receive path, you require an additional 3x memory BW (which is very significant at these high rates and most likely the bottleneck for most current systems)... and somebody always has to take the cache miss be it the copy_to_user or the app. The cache miss is going to cost you half the memory bandwidth of a full copy. If the data is already in cache, then the copy is cheaper. However, removing the copy removes the kernel from the picture on the receive side, so you lose demultiplexing, asynchronism, security, accounting, flow-control, swapping, etc. If it's ok with you to not use the kernel stack, then why expect to fit in the existing infrastructure anyway ? Yes, RDMA support is there... but we could make it better and easier to What do you need from the kernel for RDMA support beyond HW drivers ? A fast way to pin and translate user memory (ie registration). That is pretty much the sandbox that David referred to. Eventually, it would be useful to be able to track the VM space to implement a registration cache instead of using ugly hacks in user-space to hijack malloc, but this is completely independent from the net stack. use. We have a problem today with port sharing and there was a proposal The port spaces are either totally separate and there is no issue, or completely identical and you should then run your connection manager in user-space or fix your middlewares. and not for technical reasons. I believe this email thread shows in detail how RDMA (a network technology) is treated as bastard child by the network folks, well at least by one of them. I don't think it's fair. This thread actually shows how pushy some RDMA folks are about not acknowledging that the current infrastructure is here for a reason, and about mistaking zero-copy and RDMA. 
This is a similar argument to the TOE discussion, and it was definitely a good decision to not mess up the Linux stack with TOEs. Patrick - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCPportsfrom the host TCP port space.
> GPUs have almost no influence on system security, Unless you use direct rendering from user space. -Andi - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/2] qla3xxx: receive path bugfixes.
The following two patches fix: An undocumented "feature" where the 4032 chip sets bit-7 of the opcode for an inbound completion if it's for a VLAN. The access of stale data on a completion entry. These patches were built and tested on 2.6.23-rc1. Signed-off-by: Ron Mercer <[EMAIL PROTECTED]> - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCPportsfrom the host TCP port space.
* Felix Marti <[EMAIL PROTECTED]> 2007-08-20 12:02 > These graphic adapters provide a wealth of features that you can take > advantage of to bring these amazing graphics to life. General purpose > CPUs cannot keep up. Chelsio offload devices do the same thing in the > realm of networking. - Will there be things you can't do, probably yes, > but as I said, there are lots of knobs to turn (and the latest and > greatest feature that gets hyped up might not always be the best thing > since sliced bread anyway; what happened to BIC love? ;) GPUs have almost no influence on system security, the network stack OTOH is probably the most vulnerable part of an operating system. Even if all vendors would implement all the features collected over the last years properly which seems unlikely. Having such an essential and critical part depend on the vendor of my network card without being able to even verify it properly is truly frightening. - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] phy layer: fix genphy_setup_forced (don't reset)
On Aug 17, 2007, at 01:54, Domen Puncer wrote: Writing BMCR_RESET bit will reset MII_BMCR to default values. This is clearly not what we want. Signed-off-by: Domen Puncer <[EMAIL PROTECTED]> Acked-by: Andy Fleming <[EMAIL PROTECTED]> I could have sworn there was a patch that did this, already, but it must have lost steam somewhere. Andy - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCPportsfrom the host TCP port space.
Andi Kleen wrote: TSO is beneficial for the software again. The linux code currently takes several locks and does quite a few function calls for each packet and using larger packets lowers this overhead. At least with 10GbE saving CPU cycles is still quite important. Some quick netperf TCP_RR tests between a pair of dual-core rx6600's running 2.6.23-rc3. the NICs are dual-core e1000's connected back-to-back with the interrupt throttle disabled. I like using TCP_RR to tickle path-length questions because it rarely runs into bandwidth limitations regardless of the link-type. First, with TSO enabled on both sides, then with it disabled, netperf/netserver bound to the same CPU as takes interrupts, which is the "best" place to be for a TCP_RR test (although not always for a TCP_STREAM test...): :~# netperf -T 1 -t TCP_RR -H 192.168.2.105 -I 99,1 -c -C TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.2.105 (192.168.2.105) port 0 AF_INET : +/-0.5% @ 99% conf. : first burst 0 : cpu bind !!! WARNING !!! Desired confidence was not achieved within the specified iterations. !!! This implies that there was variability in the test environment that !!! must be investigated before going further. !!! Confidence intervals: Throughput : 0.3% !!! Local CPU util : 39.3% !!! Remote CPU util : 40.6% Local /Remote Socket Size Request Resp. Elapsed Trans. CPUCPUS.dem S.dem Send Recv SizeSize TimeRate local remote local remote bytes bytes bytes bytes secs. per sec % S% Sus/Tr us/Tr 16384 87380 1 1 10.01 18611.32 20.96 22.35 22.522 24.017 16384 87380 :~# ethtool -K eth2 tso off e1000: eth2: e1000_set_tso: TSO is Disabled :~# netperf -T 1 -t TCP_RR -H 192.168.2.105 -I 99,1 -c -C TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.2.105 (192.168.2.105) port 0 AF_INET : +/-0.5% @ 99% conf. : first burst 0 : cpu bind !!! WARNING !!! Desired confidence was not achieved within the specified iterations. !!! 
This implies that there was variability in the test environment that !!! must be investigated before going further. !!! Confidence intervals: Throughput : 0.4% !!! Local CPU util : 21.0% !!! Remote CPU util : 25.2% Local /Remote Socket Size Request Resp. Elapsed Trans. CPUCPUS.dem S.dem Send Recv SizeSize TimeRate local remote local remote bytes bytes bytes bytes secs. per sec % S% Sus/Tr us/Tr 16384 87380 1 1 10.01 19812.51 17.81 17.19 17.983 17.358 16384 87380 While the confidence intervals for CPU util weren't hit, I suspect the differences in service demand were still real. On throughput we are talking about +/- 0.2%, for CPU util we are talking about +/- 20% (percent not percentage points) in the first test and 12.5% in the second. So, in broad handwaving terms, TSO increased the per-transaction service demand by something along the lines of (23.27 - 17.67)/17.67 or ~30% and the transaction rate decreased by ~6%. rick jones bitrate blindless is a constant concern - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCPportsfrom the host TCP port space.
> -Original Message- > From: [EMAIL PROTECTED] [mailto:[EMAIL PROTECTED] On Behalf Of Andi Kleen > Sent: Monday, August 20, 2007 11:11 AM > To: Felix Marti > Cc: Evgeniy Polyakov; [EMAIL PROTECTED]; netdev@vger.kernel.org; > [EMAIL PROTECTED]; [EMAIL PROTECTED]; > [EMAIL PROTECTED]; David Miller > Subject: Re: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate > PS_TCPportsfrom the host TCP port space. > > "Felix Marti" <[EMAIL PROTECTED]> writes: > > > What I was referring to is that TSO(/LRO) have their own > > issues, some eluded to by Roland and me. In fact, customers working > on > > the LSR couldn't use TSO due to the burstiness it introduces > > That was in old kernels where TSO didn't honor the initial cwnd > correctly, > right? I assume it's long fixed. > > If not please clarify what the problem was. The problem is that is that Ethernet is about the only technology that discloses 'useable' throughput while everybody else talks about signaling rates ;) - OC-192 can carry about 9.128Gbps (or close to that number) and hence 10Gbps Ethernet was overwhelming the OC-192 network. The customer needed to schedule packets at about 98% of OC-192 throughput in order to avoid packet drop. The scheduling needed to be done on a per packet basis and not per 'burst of packets' basis in order to avoid packet drop. > > > have a look at graphics. > > Graphics used to be done by the host CPU and now we have dedicated > > graphics adapters that do a much better job... > > Is your off load device as programable as a modern GPU? It has a lot of knobs to turn. > > > farfetched that offload devices can do a better job at a data-flow > > problem? > > One big difference is that there is no potentially adverse and > always varying internet between the graphics card and your monitor. These graphic adapters provide a wealth of features that you can take advantage of to bring these amazing graphics to life. General purpose CPUs cannot keep up. 
Chelsio offload devices do the same thing in the realm of networking. - Will there be things you can't do, probably yes, but as I said, there are lots of knobs to turn (and the latest and greatest feature that gets hyped up might not always be the best thing since sliced bread anyway; what happened to BIC love? ;) > > -Andi - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFT] r8169 changes against 2.6.23-rc3
Francois Romieu wrote: The latest series of r8169 changes is available against 2.6.23-rc3 as: http://www.fr.zoreil.com/people/francois/misc/20070818-2.6.23-rc3-r8169-test.patch or (tarball sits one level higher): http://www.fr.zoreil.com/linux/kernel/2.6.x/2.6.23-rc3/r8169-20070818/ or (rebase prone branch) git://electric-eye.fr.zoreil.com/home/romieu/linux-2.6.git#r8169 Please do not clone your whole git kernel tree from here, thanks. Changes (most recent first): - eeprom read support - phy init cleanup - PHY init for the 8168 - make room for more PHY init changes - remove dead wood - add MAC identifiers - MSI support - correct phy parameters for the 8110SC The first patch of the series ("correct phy parameters for the 8110SC") has been elaborated with Edward Hsu from Realtek and it should help some owners of 8169 chipsets. If there is no report of regression for it on any chipset and it is reported to fix someone's problems, I will send it to Jeff Garzik for inclusion in 2.6.23 as a bugfix. Anything else in this series has not been tested on a wide scale nor acked by the manufacturer: I consider it post 2.6.23 material. That being said, the MSI changes seem fine and the "PHY init for the 8168" patch could make a difference for the users of the 8168 whose link is not properly negotiated. Success and failure reports or patches will be welcome. Please Cc: netdev and include "r8169" in the Subject. Tested 2.6.23-rc3 plus your patch on my dual-R8169 mini-ITX Jetway J7F4K1G2E mainboard. No problems to report. begin:vcard fn:Chuck Lever n:Lever;Chuck org:Oracle Corporation;Corporate Architecture: Linux Projects Group adr:;;1015 Granger Avenue;Ann Arbor;MI;48104;USA title:Principal Member of Staff tel;work:+1 248 614 5091 x-mozilla-html:FALSE url:http://oss.oracle.com/~cel version:2.1 end:vcard
Re: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCPportsfrom the host TCP port space.
"Felix Marti" <[EMAIL PROTECTED]> writes: > What I was referring to is that TSO(/LRO) have their own > issues, some eluded to by Roland and me. In fact, customers working on > the LSR couldn't use TSO due to the burstiness it introduces That was in old kernels where TSO didn't honor the initial cwnd correctly, right? I assume it's long fixed. If not please clarify what the problem was. > have a look at graphics. > Graphics used to be done by the host CPU and now we have dedicated > graphics adapters that do a much better job... Is your off load device as programable as a modern GPU? > farfetched that offload devices can do a better job at a data-flow > problem? One big difference is that there is no potentially adverse and always varying internet between the graphics card and your monitor. -Andi - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.23-rc3 and SKY2 driver issue
--- Stephen Hemminger <[EMAIL PROTECTED]> wrote: > On Mon, 20 Aug 2007 09:23:46 -0700 (PDT) > James Corey <[EMAIL PROTECTED]> wrote: > > > > > --- Stephen Hemminger > > <[EMAIL PROTECTED]> wrote: > > > > > On Mon, 20 Aug 2007 08:42:21 -0700 (PDT) > > > James Corey <[EMAIL PROTECTED]> wrote: > > > > > > > > > > > --- Stephen Hemminger > > > > <[EMAIL PROTECTED]> wrote: > > > > > > > > > On Thu, 16 Aug 2007 10:25:45 +0200 > > > > > "Michal Piotrowski" > > > > > > > > > Please reproduce with a more recent kernel? > > > > > > > > Um, I thought 2.6.23rc WAS pretty recent. :-) > > > > > > > > I'll check if there is something newer in the > > > > repository now. > > > > > > > > > > What is the chip version? Please send console > log: > > > "dmesg | grep sky2" > > > > > > > > > -- > > > Stephen Hemminger > <[EMAIL PROTECTED]> > > > > > > > > > Ah ... details. > > > > Machine: > > > > Dell Optiplex 745 > > > > Kernel: > > > > 2.6.23-rc3 #1 SMP Tue Aug 14 19:44:07 EDT 2007 > x86_64 > > x86_64 x86_64 GNU/Linux > > > > Card: > > > > D-Link DGE-550SX > > > > # dmesg | grep sky2 > > sky2 :04:00.0: v1.16 addr 0xdf9fc000 irq 16 > > Yukon-XL (0xb3) rev 3 > > sky2 eth1: addr 00:17:9a:73:87:60 > > sky2 eth1: enabling interface > > sky2 eth1: ram buffer 96K > > sky2 eth1: Link is up at 1000 Mbps, full duplex, > flow > > control none > > Okay, this is a fiber based card. Does the error > happen right away (ie all packets > have bad sum), or is it sporadic (ie some magic > packet or race in hardware). > Also are you using regular (1500) or jumbo (9000) > mtu? > Regular MTU - 1500. It runs fine for normal stuff, ssh etc, but a big sftp will break it every time. My usual test is to sftp a linux distro ISO from another machine to it. Need a vacation? Get great deals to amazing places on Yahoo! Travel. 
http://travel.yahoo.com/ - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: skb_pull_rcsum - Fatal exception in interrupt
On Mon, 20 Aug 2007 09:21:54 -0700, "Brandeburg, Jesse" <[EMAIL PROTECTED]> said: > Hi Alan, I work on the team that supports e1000, I'd be interested > in seeing the dmesg output from the machine before it crashes, maybe > you can add that to your web collection of data below? Don't worry - it's definitely not an e1000 problem. I'm in contact with the netdev guys, who have produced a patch. Thanks anyway Alan. -- Alan J. Wylie http://www.wylie.me.uk/ "Perfection [in design] is achieved not when there is nothing left to add, but rather when there is nothing left to take away." -- Antoine de Saint-Exupery - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCPportsfrom the host TCP port space.
> -Original Message- > From: Evgeniy Polyakov [mailto:[EMAIL PROTECTED] > Sent: Monday, August 20, 2007 2:43 AM > To: Felix Marti > Cc: David Miller; [EMAIL PROTECTED]; netdev@vger.kernel.org; > [EMAIL PROTECTED]; [EMAIL PROTECTED]; linux- > [EMAIL PROTECTED]; [EMAIL PROTECTED] > Subject: Re: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate > PS_TCPportsfrom the host TCP port space. > > On Sun, Aug 19, 2007 at 05:47:59PM -0700, Felix Marti > ([EMAIL PROTECTED]) wrote: > > [Felix Marti] David and Herbert, so you agree that the user<>kernel > > space memory copy overhead is a significant overhead and we want to > > enable zero-copy in both the receive and transmit path? - Yes, copy > > It depends. If you need to access that data after received, you will > get > cache miss and performance will not be much better (if any) that with > copy. Yes, the app will take the cache hits when accessing the data. However, the fact remains that if there is a copy in the receive path, you require and additional 3x memory BW (which is very significant at these high rates and most likely the bottleneck for most current systems)... and somebody always has to take the cache miss be it the copy_to_user or the app. > > > avoidance is mainly an API issue and unfortunately the so widely used > > (synchronous) sockets API doesn't make copy avoidance easy, which is > one > > area where protocol offload can help. Yes, some apps can resort to > > sendfile() but there are many apps which seem to have trouble > switching > > to that API... and what about the receive path? > > There is number of implementations, and all they are suitable for is > to have recvfile(), since this is likely the only case, which can work > without cache. > > And actually RDMA stack exist and no one said it should be thrown away > _until_ it messes with main stack. It started to speal ports. 
What will > happen when it gest all port space and no new legal network conection > can be opened, although there is no way to show to user who got it? > What will happen if hardware RDMA connection got terminated and > software > could not free the port? Will RDMA request to export connection reset > functions out of stack to drop network connections which are on the > ports > which are supposed to be used by new RDMA connections? Yes, RDMA support is there... but we could make it better and easier to use. We have a problem today with port sharing and there was a proposal to address the issue by tighter integration (see the beginning of the thread) but the proposal got shot down immediately... because it is RDMA and not for technical reasons. I believe this email threads shows in detail how RDMA (a network technology) is treated as bastard child by the network folks, well at least by one of them. > > RDMA is not a problem, but how it influence to the network stack is. > Let's better think about how to work correctly with network stack > (since > we already have that cr^Wdifferent hardware) instead of saying that > others do bad work and do not allow shiny new feature to exist. By no means did I want to imply that others do bad work; are you referring to me using TSO implementation issues as an example? - If so, let me clarify: I understand that the TSO implementation took some time to get right. What I was referring to is that TSO(/LRO) have their own issues, some eluded to by Roland and me. In fact, customers working on the LSR couldn't use TSO due to the burstiness it introduces and had to fall-back to our fine grained packet scheduling done in the offload device. I am for variety, let us support new technologies that solve real problems (lots of folks are buying this stuff for a reason) instead of the 'ah, its brain-dead and has no future' attitude... there is precedence for offloading the host CPUs: have a look at graphics. 
Graphics used to be done by the host CPU and now we have dedicated graphics adapters that do a much better job... so, why is it so farfetched that offload devices can do a better job at a data-flow problem? > > -- > Evgeniy Polyakov - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.23-rc3 and SKY2 driver issue
On Mon, 20 Aug 2007 09:23:46 -0700 (PDT) James Corey <[EMAIL PROTECTED]> wrote: > > --- Stephen Hemminger > <[EMAIL PROTECTED]> wrote: > > > On Mon, 20 Aug 2007 08:42:21 -0700 (PDT) > > James Corey <[EMAIL PROTECTED]> wrote: > > > > > > > > --- Stephen Hemminger > > > <[EMAIL PROTECTED]> wrote: > > > > > > > On Thu, 16 Aug 2007 10:25:45 +0200 > > > > "Michal Piotrowski" > > > > > > > Please reproduce with a more recent kernel? > > > > > > Um, I thought 2.6.23rc WAS pretty recent. :-) > > > > > > I'll check if there is something newer in the > > > repository now. > > > > > > > What is the chip version? Please send console log: > > "dmesg | grep sky2" > > > > > > -- > > Stephen Hemminger <[EMAIL PROTECTED]> > > > > > Ah ... details. > > Machine: > > Dell Optiplex 745 > > Kernel: > > 2.6.23-rc3 #1 SMP Tue Aug 14 19:44:07 EDT 2007 x86_64 > x86_64 x86_64 GNU/Linux > > Card: > > D-Link DGE-550SX > > # dmesg | grep sky2 > sky2 :04:00.0: v1.16 addr 0xdf9fc000 irq 16 > Yukon-XL (0xb3) rev 3 > sky2 eth1: addr 00:17:9a:73:87:60 > sky2 eth1: enabling interface > sky2 eth1: ram buffer 96K > sky2 eth1: Link is up at 1000 Mbps, full duplex, flow > control none Okay, this is a fiber based card. Does the error happen right away (ie all packets have bad sum), or is it sporadic (ie some magic packet or race in hardware). Also are you using regular (1500) or jumbo (9000) mtu? -- Stephen Hemminger <[EMAIL PROTECTED]> - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCPportsfrom the host TCP port space.
> -Original Message- > From: [EMAIL PROTECTED] [mailto:[EMAIL PROTECTED] On Behalf Of Andi Kleen > Sent: Monday, August 20, 2007 4:07 AM > To: Felix Marti > Cc: David Miller; [EMAIL PROTECTED]; netdev@vger.kernel.org; > [EMAIL PROTECTED]; [EMAIL PROTECTED]; linux- > [EMAIL PROTECTED]; [EMAIL PROTECTED] > Subject: Re: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate > PS_TCPportsfrom the host TCP port space. > > "Felix Marti" <[EMAIL PROTECTED]> writes: > > > avoidance gains of TSO and LRO are still a very worthwhile savings. > > So, i.e. with TSO, your saving about 16 headers (let us say 14 + 20 + > > 20), 864B, when moving ~64KB of payload - looks like very much in the > > noise to me. > > TSO is beneficial for the software again. The linux code currently > takes several locks and does quite a few function calls for each > packet and using larger packets lowers this overhead. At least with > 10GbE saving CPU cycles is still quite important. > > > an option to get 'high performance' > > Shouldn't you qualify that? > > It is unlikely you really duplicated all the tuning for corner cases > that went over many years into good software TCP stacks in your > hardware. So e.g. for wide area networks with occasional packet loss > the software might well perform better. Yes, it used to be sufficient to submit performance data to show that a technology make 'sense'. In fact, I believe it was Alan Cox who once said that linux will have a look at offload once an offload device holds the land speed record (probably assuming that the day never comes ;). For the last few years it has been Chelsio offload devices that have been improving their own LSRs (as IO bus speeds have been increasing). It is worthwhile to point out that OC-192 doesn't offer full 10Gbps BW and the fine-grained (per packet and not per TSO-burst) packet scheduler in the offload device played a crucial part in pushing performance to the limits of what OC-192 can do. 
Most other customers use our offload products in low-latency cluster environments. - The problem with offload devices is that they are not all born equal and there have been a lot of poor implementations giving the technology a bad name. I can only speak for Chelsio and do claim that we have a solid implementation that scales from low-latency cluster environments to LFNs. Andi, I could present performance numbers, i.e. throughput and CPU utilization as a function of IO size, number of connections, ... in a back-to-back environment and/or in a cluster environment... but what will it get me? I'd still get hit by the 'not integrated' hammer :( > > -Andi - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Linksys Gigabit USB2.0 adapter (asix) regression
David Hollis wrote: > It's a bit of a longshot, but I notice that EEPROM index 0x17 returns > 0x580 for you, 0x180 for my devices. Based on that, my devices go > through the "gpio phymode == 1 path" GPIO init sequence, and yours goes > through the other path ( if ((eeprom >> 8) != 1) { ). Comment out the > if() else portion so that you go through the "phymode == 1" path and see > if that makes a difference. That segment should look something like > this: > > /* > if ((eeprom >> 8) != 1) { > asix_write_gpio(dev, 0x003c, 30); > asix_write_gpio(dev, 0x001c, 300); > asix_write_gpio(dev, 0x003c, 30); > } else { > */ > dbg("gpio phymode == 1 path"); > asix_write_gpio(dev, AX_GPIO_GPO1EN, 30); > asix_write_gpio(dev, AX_GPIO_GPO1EN | AX_GPIO_GPO_1, > 30); > // } Tried, but now it doesn't work at all, no LEDs and no traffic. - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.23-rc3 and SKY2 driver issue
--- Stephen Hemminger <[EMAIL PROTECTED]> wrote: > On Mon, 20 Aug 2007 08:42:21 -0700 (PDT) > James Corey <[EMAIL PROTECTED]> wrote: > > > > > --- Stephen Hemminger > > <[EMAIL PROTECTED]> wrote: > > > > > On Thu, 16 Aug 2007 10:25:45 +0200 > > > "Michal Piotrowski" > > > > > Please reproduce with a more recent kernel? > > > > Um, I thought 2.6.23rc WAS pretty recent. :-) > > > > I'll check if there is something newer in the > > repository now. > > > > What is the chip version? Please send console log: > "dmesg | grep sky2" > > > -- > Stephen Hemminger <[EMAIL PROTECTED]> > Ah ... details. Machine: Dell Optiplex 745 Kernel: 2.6.23-rc3 #1 SMP Tue Aug 14 19:44:07 EDT 2007 x86_64 x86_64 x86_64 GNU/Linux Card: D-Link DGE-550SX # dmesg | grep sky2 sky2 :04:00.0: v1.16 addr 0xdf9fc000 irq 16 Yukon-XL (0xb3) rev 3 sky2 eth1: addr 00:17:9a:73:87:60 sky2 eth1: enabling interface sky2 eth1: ram buffer 96K sky2 eth1: Link is up at 1000 Mbps, full duplex, flow control none Be a better Globetrotter. Get better travel answers from someone who knows. Yahoo! Answers - Check it out. http://answers.yahoo.com/dir/?link=list&sid=396545469 - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: skb_pull_rcsum - Fatal exception in interrupt
Alan J. Wylie wrote: > We have been shipping Linux based servers to customers for several > years now, with few problems. Recently, however, a single customer has > been seeing kernel panics. Unfortunately, the customer is about 200 > miles away, so physical access is limited. There are two ethernet > interfaces, one should be plugged into a local RFC1918 network, the > other is connected to the internet. If eth0 is plugged into the local > network, a short time later the system panics. > > Hardware: Intel S5000VSA server > > Network cards: Intel e1000 >Intel Corporation 80003ES2LAN Gigabit Ethernet Controller (Copper) Hi Alan, I work on the team that supports e1000, I'd be interested in seeing the dmesg output from the machine before it crashes, maybe you can add that to your web collection of data below? many of the 5000 series machines have BMC's its possible that you could set up the remote management so you could reboot it remotely, but that may not be worth the extra effort. It could however give you the ability to have a serial console over ethernet, which would get us the full panic message, but see below. > # CONFIG_E1000_DISABLE_PACKET_SPLIT is not set can you try setting the CONFIG_E1000_DISABLE_PACKET_SPLIT=y this will prevent the driver from splitting the header from the packet data which could be exacerbating this problem. Its not immediately obvious whether this is a kernel or driver problem, I hope you don't mind I cc'd e1000-devel since this is possibly relevant to other e1000 users and developers. > We shipped a second system, and this displayed identical symptoms. We > have tested with several recent 2.6 kernels, including > > 2.6.22 > 2.6.17.14 > 2.6.20.15 > > all of which crash. > > We have a couple of photographs showing the tail end of the messages > on the screen. 
> > The last two lines are: > > EIP: [] skb_pull_rcsum+0x6d/0x71 SS:ESP 09068:c03e1ea4 > Kernel panic - not syncing: Fatal exception in interrupt can you boot with vga=0x318 appended to kernel options? this might help you get more on the screen. you could also look into netconsole, but because this is a networking crash I don't know if you'll get data out of netconsole or not, and I don't know if you can use netconsole over the 'net' as I've only used it for local logging. Jesse - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.23-rc3 and SKY2 driver issue
On Mon, 20 Aug 2007 08:42:21 -0700 (PDT) James Corey <[EMAIL PROTECTED]> wrote: > > --- Stephen Hemminger > <[EMAIL PROTECTED]> wrote: > > > On Thu, 16 Aug 2007 10:25:45 +0200 > > "Michal Piotrowski" > > > Please reproduce with a more recent kernel? > > Um, I thought 2.6.23rc WAS pretty recent. :-) > > I'll check if there is something newer in the > repository now. > What is the chip version? Please send console log: "dmesg | grep sky2" -- Stephen Hemminger <[EMAIL PROTECTED]> - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [3/4] 2.6.23-rc3: known regressions v2
Hi all, Here is a list of some known regressions in 2.6.23-rc3. Feel free to add new regressions/remove fixed etc. http://kernelnewbies.org/known_regressions List of Aces NameRegressions fixed since 21-Jun-2007 Adrian Bunk9 Andi Kleen 5 Linus Torvalds 5 Andrew Morton 4 Al Viro3 Cornelia Huck 3 Jens Axboe 3 Tejun Heo 3 Networking Subject : NETDEV WATCHDOG: eth0: transmit timed out References : http://lkml.org/lkml/2007/8/13/737 Last known good : ? Submitter : Karl Meyer <[EMAIL PROTECTED]> Caused-By : ? Handled-By : Francois Romieu <[EMAIL PROTECTED]> Status : problem is being debugged Subject : Weird network problems with 2.6.23-rc2 References : http://lkml.org/lkml/2007/8/11/40 Last known good : ? Submitter : Shish <[EMAIL PROTECTED]> Caused-By : ? Handled-By : ? Status : unknown Subject : IP v4 routing is broken References : http://www.stardust.webpages.pl/files/tbf/bugs/bug_report01.txt Last known good : 2.6.22-git2 Submitter : Uwe Bugla <[EMAIL PROTECTED]> Caused-By : ? Handled-By : ? Status : unknown Subject : New wake ups from sky2 References : http://lkml.org/lkml/2007/7/20/386 Last known good : ? Submitter : Thomas Meyer <[EMAIL PROTECTED]> Caused-By : Stephen Hemminger <[EMAIL PROTECTED]> commit eb35cf60e462491249166182e3e755d3d5d91a28 Handled-By : Stephen Hemminger <[EMAIL PROTECTED]> Status : unknown Virtualization Subject : CONFIG_VMI broken References : http://lkml.org/lkml/2007/8/14/203 Last known good : ? Submitter : Parag Warudkar <[EMAIL PROTECTED]> Caused-By : ? Handled-By : Zachary Amsden <[EMAIL PROTECTED]> Status : problem is being debugged Regards, Michal -- LOG http://www.stardust.webpages.pl/log/ - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[1/2] 2.6.23-rc3: known regressions with patches v2
Hi all, Here is a list of some known regressions in 2.6.23-rc3 with patches available. Feel free to add new regressions/remove fixed etc. http://kernelnewbies.org/known_regressions List of Aces NameRegressions fixed since 21-Jun-2007 Adrian Bunk9 Andi Kleen 5 Linus Torvalds 5 Andrew Morton 4 Al Viro3 Cornelia Huck 3 Jens Axboe 3 Tejun Heo 3 Unclassified Subject : Oops while modprobing phy fixed module References : http://lkml.org/lkml/2007/7/14/63 Last known good : ? Submitter : Gabriel C <[EMAIL PROTECTED]> Caused-By : ? Handled-By : Satyam Sharma <[EMAIL PROTECTED]> Vitaly Bordug <[EMAIL PROTECTED]> Patch1 : http://lkml.org/lkml/2007/7/18/506 Status : patch available MMC Subject : Unable to access memory card reader anymore References : http://bugzilla.kernel.org/show_bug.cgi?id=8885 Last known good : ? Submitter : Christian Casteyde <[EMAIL PROTECTED]> Caused-By : ? Handled-By : Alan Stern <[EMAIL PROTECTED]> Patch : http://bugzilla.kernel.org/attachment.cgi?id=12438 Status : patch available MTD Subject : error: implicit declaration of function 'cfi_interleave' References : http://lkml.org/lkml/2007/8/6/272 Last known good : ? Submitter : Ingo Molnar <[EMAIL PROTECTED]> Caused-By : ? Handled-By : David Woodhouse <[EMAIL PROTECTED]> Patch : http://lkml.org/lkml/2007/8/9/586 Status : patch available Networking Subject : BUG: when using 'brctl stp' References : http://lkml.org/lkml/2007/8/10/441 Last known good : 2.6.23-rc1 Submitter : Daniel K. <[EMAIL PROTECTED]> Caused-By : ? Handled-By : Stephen Hemminger <[EMAIL PROTECTED]> Status : fix applied by David Miller Subject : sky2 boot crash in sky2_mac_intr References : http://lkml.org/lkml/2007/7/24/91 Last known good : ? 
Submitter : Florian Lohoff <[EMAIL PROTECTED]> Caused-By : Handled-By : Stephen Hemminger <[EMAIL PROTECTED]> Patch : http://marc.info/?l=linux-netdev&m=118651402523966&w=2 Status : patch available Regards, Michal -- LOG http://www.stardust.webpages.pl/log/ - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 10/10] net/bonding: Destroy bonding master when last slave is gone
When bonding enslaves non Ethernet devices it takes pointers to functions in the module that owns the slaves. In this case it becomes unsafe to keep the bonding master registered after last slave was unenslaved because we don't know if the pointers are still valid. Destroying the bond when slave_cnt is zero ensures that these functions cannot be used anymore. Signed-off-by: Moni Shoua <[EMAIL PROTECTED]> --- drivers/net/bonding/bond_main.c | 45 +++- drivers/net/bonding/bonding.h |3 ++ 2 files changed, 47 insertions(+), 1 deletion(-) Index: net-2.6/drivers/net/bonding/bond_main.c === --- net-2.6.orig/drivers/net/bonding/bond_main.c2007-08-20 14:43:17.123702132 +0300 +++ net-2.6/drivers/net/bonding/bond_main.c 2007-08-20 14:43:17.850571535 +0300 @@ -1256,6 +1256,7 @@ static int bond_compute_features(struct static void bond_setup_by_slave(struct net_device *bond_dev, struct net_device *slave_dev) { + struct bonding *bond = bond_dev->priv; bond_dev->hard_header = slave_dev->hard_header; bond_dev->rebuild_header= slave_dev->rebuild_header; bond_dev->hard_header_cache = slave_dev->hard_header_cache; @@ -1270,6 +1271,7 @@ static void bond_setup_by_slave(struct n memcpy(bond_dev->broadcast, slave_dev->broadcast, slave_dev->addr_len); + bond->setup_by_slave = 1; } /* enslave device to bond device */ @@ -1838,6 +1840,35 @@ int bond_release(struct net_device *bond } /* +* Destroy a bonding device. +* Must be under rtnl_lock when this function is called. +*/ +void bond_destroy(struct bonding *bond) +{ + bond_deinit(bond->dev); + bond_destroy_sysfs_entry(bond); + unregister_netdevice(bond->dev); +} + +/* +* First release a slave and than destroy the bond if no more slaves iare left. +* Must be under rtnl_lock when this function is called. 
+*/ +int bond_release_and_destroy(struct net_device *bond_dev, struct net_device *slave_dev) +{ + struct bonding *bond = bond_dev->priv; + int ret; + + ret = bond_release(bond_dev, slave_dev); + if ((ret == 0) && (bond->slave_cnt == 0)) { + printk(KERN_INFO DRV_NAME " %s: destroying bond for.\n", + bond_dev->name); + bond_destroy(bond); + } + return ret; +} + +/* * This function releases all slaves. */ static int bond_release_all(struct net_device *bond_dev) @@ -3322,7 +3353,11 @@ static int bond_slave_netdev_event(unsig switch (event) { case NETDEV_UNREGISTER: if (bond_dev) { - bond_release(bond_dev, slave_dev); + dprintk("slave %s unregisters\n", slave_dev->name); + if (bond->setup_by_slave) + bond_release_and_destroy(bond_dev, slave_dev); + else + bond_release(bond_dev, slave_dev); } break; case NETDEV_CHANGE: @@ -3331,6 +3366,13 @@ static int bond_slave_netdev_event(unsig * sets up a hierarchical bond, then rmmod's * one of the slave bonding devices? */ + if (slave_dev->priv_flags & IFF_SLAVE_DETACH) { + dprintk("slave %s detaching\n", slave_dev->name); + if (bond->setup_by_slave) + bond_release_and_destroy(bond_dev, slave_dev); + else + bond_release(bond_dev, slave_dev); + } break; case NETDEV_DOWN: /* @@ -4311,6 +4353,7 @@ static int bond_init(struct net_device * bond->primary_slave = NULL; bond->dev = bond_dev; bond->send_grat_arp = 0; + bond->setup_by_slave = 0; INIT_LIST_HEAD(&bond->vlan_list); /* Initialize the device entry points */ Index: net-2.6/drivers/net/bonding/bonding.h === --- net-2.6.orig/drivers/net/bonding/bonding.h 2007-08-20 14:43:17.123702132 +0300 +++ net-2.6/drivers/net/bonding/bonding.h 2007-08-20 14:47:52.845180870 +0300 @@ -188,6 +188,7 @@ struct bonding { s8 kill_timers; s8 do_set_mac_addr; s8 send_grat_arp; + s8 setup_by_slave; struct net_device_stats stats; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_entry; @@ -295,6 +296,8 @@ static inline void bond_unset_master_alb struct vlan_entry *bond_next_vlan(struct bonding *bond, 
struct vlan_entry *curr); int bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev); int bond_create(char *name, struct bond_params *params, struct bonding **newb
[PATCH V4 9/10] net/bonding: Delay sending of gratuitous ARP to avoid failure
Delay sending a gratuitous_arp when LINK_STATE_LINKWATCH_PENDING bit in dev->state field is on. This improves the chances for the arp packet to be transmitted. Signed-off-by: Moni Shoua <[EMAIL PROTECTED]> --- drivers/net/bonding/bond_main.c | 24 +--- drivers/net/bonding/bonding.h |1 + 2 files changed, 22 insertions(+), 3 deletions(-) Index: net-2.6/drivers/net/bonding/bond_main.c === --- net-2.6.orig/drivers/net/bonding/bond_main.c2007-08-15 10:56:33.0 +0300 +++ net-2.6/drivers/net/bonding/bond_main.c 2007-08-15 11:04:37.221123652 +0300 @@ -1102,8 +1102,14 @@ void bond_change_active_slave(struct bon if (new_active && !bond->do_set_mac_addr) memcpy(bond->dev->dev_addr, new_active->dev->dev_addr, new_active->dev->addr_len); - - bond_send_gratuitous_arp(bond); + if (bond->curr_active_slave && + test_bit(__LINK_STATE_LINKWATCH_PENDING, + &bond->curr_active_slave->dev->state)) { + dprintk("delaying gratuitous arp on %s\n", + bond->curr_active_slave->dev->name); + bond->send_grat_arp = 1; + } else + bond_send_gratuitous_arp(bond); } } @@ -2083,6 +2089,17 @@ void bond_mii_monitor(struct net_device * program could monitor the link itself if needed. 
*/ + if (bond->send_grat_arp) { + if (bond->curr_active_slave && test_bit(__LINK_STATE_LINKWATCH_PENDING, + &bond->curr_active_slave->dev->state)) + dprintk("Needs to send gratuitous arp but not yet\n"); + else { + dprintk("sending delayed gratuitous arp on on %s\n", + bond->curr_active_slave->dev->name); + bond_send_gratuitous_arp(bond); + bond->send_grat_arp = 0; + } + } read_lock(&bond->curr_slave_lock); oldcurrent = bond->curr_active_slave; read_unlock(&bond->curr_slave_lock); @@ -2484,7 +2501,7 @@ static void bond_send_gratuitous_arp(str if (bond->master_ip) { bond_arp_send(slave->dev, ARPOP_REPLY, bond->master_ip, - bond->master_ip, 0); + bond->master_ip, 0); } list_for_each_entry(vlan, &bond->vlan_list, vlan_list) { @@ -4293,6 +4310,7 @@ static int bond_init(struct net_device * bond->current_arp_slave = NULL; bond->primary_slave = NULL; bond->dev = bond_dev; + bond->send_grat_arp = 0; INIT_LIST_HEAD(&bond->vlan_list); /* Initialize the device entry points */ Index: net-2.6/drivers/net/bonding/bonding.h === --- net-2.6.orig/drivers/net/bonding/bonding.h 2007-08-15 10:56:33.0 +0300 +++ net-2.6/drivers/net/bonding/bonding.h 2007-08-15 11:05:41.516451497 +0300 @@ -187,6 +187,7 @@ struct bonding { struct timer_list arp_timer; s8 kill_timers; s8 do_set_mac_addr; + s8 send_grat_arp; struct net_device_stats stats; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_entry; - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 7/10] net/bonding: Enable IP multicast for bonding IPoIB devices
Allow to enslave devices when the bonding device is not up. Over the discussion held at the previous post this seemed to be the most clean way to go, where it is not expected to cause instabilities. Normally, the bonding driver is UP before any enslavement takes place. Once a netdevice is UP, the network stack acts to have it join some multicast groups (eg the all-hosts 224.0.0.1). Now, since ether_setup() have set the bonding device type to be ARPHRD_ETHER and address len to be ETHER_ALEN, the net core code computes a wrong multicast link address. This is b/c ip_eth_mc_map() is called where for multicast joins taking place after the enslavement another ip_xxx_mc_map() is called (eg ip_ib_mc_map() when the bond type is ARPHRD_INFINIBAND) Signed-off-by: Moni Shoua <[EMAIL PROTECTED]> Signed-off-by: Or Gerlitz <[EMAIL PROTECTED]> --- drivers/net/bonding/bond_main.c |5 +++-- drivers/net/bonding/bond_sysfs.c |6 ++ 2 files changed, 5 insertions(+), 6 deletions(-) Index: net-2.6/drivers/net/bonding/bond_main.c === --- net-2.6.orig/drivers/net/bonding/bond_main.c2007-08-15 10:54:41.0 +0300 +++ net-2.6/drivers/net/bonding/bond_main.c 2007-08-15 10:55:48.431862446 +0300 @@ -1285,8 +1285,9 @@ int bond_enslave(struct net_device *bond /* bond must be initialized by bond_open() before enslaving */ if (!(bond_dev->flags & IFF_UP)) { - dprintk("Error, master_dev is not up\n"); - return -EPERM; + printk(KERN_WARNING DRV_NAME + " %s: master_dev is not up in bond_enslave\n", + bond_dev->name); } /* already enslaved */ Index: net-2.6/drivers/net/bonding/bond_sysfs.c === --- net-2.6.orig/drivers/net/bonding/bond_sysfs.c 2007-08-15 10:08:58.0 +0300 +++ net-2.6/drivers/net/bonding/bond_sysfs.c2007-08-15 10:55:48.432862269 +0300 @@ -266,11 +266,9 @@ static ssize_t bonding_store_slaves(stru /* Quick sanity check -- is the bond interface up? 
*/ if (!(bond->dev->flags & IFF_UP)) { - printk(KERN_ERR DRV_NAME - ": %s: Unable to update slaves because interface is down.\n", + printk(KERN_WARNING DRV_NAME + ": %s: doing slave updates when interface is down.\n", bond->dev->name); - ret = -EPERM; - goto out; } /* Note: We can't hold bond->lock here, as bond_create grabs it. */ - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 8/10] net/bonding: Handle wrong assumptions that slave is always an Ethernet device
bonding sometimes uses Ethernet constants (such as MTU and address length) which are not good when it enslaves non Ethernet devices (such as InfiniBand). Signed-off-by: Moni Shoua <[EMAIL PROTECTED]> --- drivers/net/bonding/bond_main.c |3 ++- drivers/net/bonding/bond_sysfs.c | 19 +-- drivers/net/bonding/bonding.h|1 + 3 files changed, 16 insertions(+), 7 deletions(-) Index: net-2.6/drivers/net/bonding/bond_main.c === --- net-2.6.orig/drivers/net/bonding/bond_main.c2007-08-15 10:55:48.0 +0300 +++ net-2.6/drivers/net/bonding/bond_main.c 2007-08-20 14:29:11.911298577 +0300 @@ -1224,7 +1224,8 @@ static int bond_compute_features(struct struct slave *slave; struct net_device *bond_dev = bond->dev; unsigned long features = bond_dev->features; - unsigned short max_hard_header_len = ETH_HLEN; + unsigned short max_hard_header_len = max((u16)ETH_HLEN, + bond_dev->hard_header_len); int i; features &= ~(NETIF_F_ALL_CSUM | BOND_VLAN_FEATURES); Index: net-2.6/drivers/net/bonding/bond_sysfs.c === --- net-2.6.orig/drivers/net/bonding/bond_sysfs.c 2007-08-15 10:55:48.0 +0300 +++ net-2.6/drivers/net/bonding/bond_sysfs.c2007-08-15 12:14:41.152469089 +0300 @@ -164,9 +164,7 @@ static ssize_t bonding_store_bonds(struc printk(KERN_INFO DRV_NAME ": %s is being deleted...\n", bond->dev->name); - bond_deinit(bond->dev); - bond_destroy_sysfs_entry(bond); - unregister_netdevice(bond->dev); + bond_destroy(bond); rtnl_unlock(); goto out; } @@ -260,6 +258,7 @@ static ssize_t bonding_store_slaves(stru char command[IFNAMSIZ + 1] = { 0, }; char *ifname; int i, res, found, ret = count; + u32 original_mtu; struct slave *slave; struct net_device *dev = NULL; struct bonding *bond = to_bond(d); @@ -325,6 +324,7 @@ static ssize_t bonding_store_slaves(stru } /* Set the slave's MTU to match the bond */ + original_mtu = dev->mtu; if (dev->mtu != bond->dev->mtu) { if (dev->change_mtu) { res = dev->change_mtu(dev, @@ -339,6 +339,9 @@ static ssize_t bonding_store_slaves(stru } rtnl_lock(); res = 
bond_enslave(bond->dev, dev); + bond_for_each_slave(bond, slave, i) + if (strnicmp(slave->dev->name, ifname, IFNAMSIZ) == 0) + slave->original_mtu = original_mtu; rtnl_unlock(); if (res) { ret = res; @@ -351,13 +354,17 @@ static ssize_t bonding_store_slaves(stru bond_for_each_slave(bond, slave, i) if (strnicmp(slave->dev->name, ifname, IFNAMSIZ) == 0) { dev = slave->dev; + original_mtu = slave->original_mtu; break; } if (dev) { printk(KERN_INFO DRV_NAME ": %s: Removing slave %s\n", bond->dev->name, dev->name); rtnl_lock(); - res = bond_release(bond->dev, dev); + if (bond->setup_by_slave) + res = bond_release_and_destroy(bond->dev, dev); + else + res = bond_release(bond->dev, dev); rtnl_unlock(); if (res) { ret = res; @@ -365,9 +372,9 @@ static ssize_t bonding_store_slaves(stru } /* set the slave MTU to the default */ if (dev->change_mtu) { - dev->change_mtu(dev, 1500); + dev->change_mtu(dev, original_mtu); } else { - dev->mtu = 1500; + dev->mtu = original_mtu; } } else { Index: net-2.6/drivers/net/bonding/bonding.h === --- net-2.6.orig/drivers/net/bonding/bonding.h 2007-08-15 10:55:34.0 +0300 +++ net-2.6/drivers/net/bonding/bonding.h 2007-08-20 14:29:11.912298402 +0300 @@ -156,6 +156,7 @@ struct slave {
Re: 2.6.23-rc3 and SKY2 driver issue
--- Stephen Hemminger <[EMAIL PROTECTED]> wrote: > On Thu, 16 Aug 2007 10:25:45 +0200 > "Michal Piotrowski" > <[EMAIL PROTECTED]> wrote: > > > [Adding Stephen and netdev to CC] > > > > On 15/08/07, James Corey <[EMAIL PROTECTED]> > wrote: > > > > > > I tried running a D-link gig card on kernel > 2.6.21.1 > > > and it came up fine, but when I did a sftp of > > > an linux dvd ISO to it, the interface would lock > > > up hard with the error > > > > > > eth1: hw csum failure. > > > > > > Call Trace: > > >[] > > > __skb_checksum_complete_head+0x46/0x5f > > > [] > __skb_checksum_complete+0xc/0x11 > > > [] tcp_v4_rcv+0x157/0x810 > > > [] dev_queue_xmit+0x237/0x260 > > > [] > find_busiest_group+0x252/0x684 > > > [] > ip_local_deliver+0xca/0x14c > > > [] ip_rcv+0x478/0x4ba > > > [] sky2_poll+0x6f9/0x9b9 > > > [] > > > run_rebalance_domains+0x13e/0x408 > > > [] net_rx_action+0xa8/0x166 > > > [] __do_softirq+0x55/0xc3 > > > [] call_softirq+0x1c/0x28 > > > [] do_softirq+0x2c/0x7d > > > [] do_IRQ+0x13e/0x15f > > > [] mwait_idle+0x0/0x46 > > > [] ret_from_intr+0x0/0xa > > >[] > mwait_idle+0x42/0x46 > > > [] cpu_idle+0x8c/0xaf > > > [] start_kernel+0x2ac/0x2b8 > > > [] _sinittext+0x140/0x144 > > > > > > > > > So I tried running the latest snapshot > 2.6.23-rc3 > > > and the almost the same thing happens. Only > > > difference is that now the entire box locks up. > > > The error is almost the same > > > > > > eth1: hw csum failure. 
> > > > > > Call Trace: > > >[] > > > __skb_checksum_complete_head+0x43/0x56 > > > [] > __skb_checksum_complete+0xc/0x11 > > > [] tcp_v4_rcv+0x14e/0x801 > > > [] > ip_local_deliver+0xca/0x14c > > > [] ip_rcv+0x46c/0x4ae > > > [] > :sky2:sky2_poll+0x72b/0x9c7 > > > [] net_rx_action+0xa8/0x166 > > > [] __do_softirq+0x55/0xc4 > > > [] call_softirq+0x1c/0x28 > > > [] do_softirq+0x2c/0x7d > > > [] do_IRQ+0x13e/0x15f > > > [] mwait_idle+0x0/0x48 > > > [] ret_from_intr+0x0/0xa > > >[] > > > :sky2:sky2_xmit_frame+0x0/0x41d > > > [] mwait_idle+0x42/0x48 > > > [] cpu_idle+0xbd/0xe0 > > > [] start_kernel+0x2ac/0x2b8 > > > [] _sinittext+0x140/0x144 > > > > > > > > > I see that the new kernel includes some sort of > > > SKY2 DEBUG stuff. I would be happy to rerun > > > the test with that turned on, given some minor > > > direction. > > > > > > > Regards, > > Michal > > Please reproduce with a more recent kernel? Um, I thought 2.6.23rc WAS pretty recent. :-) I'll check if there is something newer in the repository now. -J Take the Internet to Go: Yahoo!Go puts the Internet in your pocket: mail, news, photos & more. http://mobile.yahoo.com/go?refer=1GNXIC - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 6/10] net/bonding: Enable bonding to enslave netdevices not supporting set_mac_address()
This patch allows for enslaving netdevices which do not support the set_mac_address() function. In that case the bond mac address is the one of the active slave, where remote peers are notified on the mac address (neighbour) change by Gratuitous ARP sent by bonding when fail-over occurs (this is already done by the bonding code). Signed-off-by: Moni Shoua <[EMAIL PROTECTED]> Signed-off-by: Or Gerlitz <[EMAIL PROTECTED]> --- drivers/net/bonding/bond_main.c | 87 +++- drivers/net/bonding/bonding.h |1 2 files changed, 60 insertions(+), 28 deletions(-) Index: net-2.6/drivers/net/bonding/bond_main.c === --- net-2.6.orig/drivers/net/bonding/bond_main.c2007-08-15 10:54:13.0 +0300 +++ net-2.6/drivers/net/bonding/bond_main.c 2007-08-15 10:54:41.971632881 +0300 @@ -1095,6 +1095,14 @@ void bond_change_active_slave(struct bon if (new_active) { bond_set_slave_active_flags(new_active); } + + /* when bonding does not set the slave MAC address, the bond MAC +* address is the one of the active slave. +*/ + if (new_active && !bond->do_set_mac_addr) + memcpy(bond->dev->dev_addr, new_active->dev->dev_addr, + new_active->dev->addr_len); + bond_send_gratuitous_arp(bond); } } @@ -1351,13 +1359,22 @@ int bond_enslave(struct net_device *bond } if (slave_dev->set_mac_address == NULL) { - printk(KERN_ERR DRV_NAME - ": %s: Error: The slave device you specified does " - "not support setting the MAC address. " - "Your kernel likely does not support slave " - "devices.\n", bond_dev->name); - res = -EOPNOTSUPP; - goto err_undo_flags; + if (bond->slave_cnt == 0) { + printk(KERN_WARNING DRV_NAME + ": %s: Warning: The first slave device you " + "specified does not support setting the MAC " + "address. This bond MAC address would be that " + "of the active slave.\n", bond_dev->name); + bond->do_set_mac_addr = 0; + } else if (bond->do_set_mac_addr) { + printk(KERN_ERR DRV_NAME + ": %s: Error: The slave device you specified " + "does not support setting the MAC addres,." 
+ "but this bond uses this practice. \n" + , bond_dev->name); + res = -EOPNOTSUPP; + goto err_undo_flags; + } } new_slave = kzalloc(sizeof(struct slave), GFP_KERNEL); @@ -1378,16 +1395,18 @@ int bond_enslave(struct net_device *bond */ memcpy(new_slave->perm_hwaddr, slave_dev->dev_addr, ETH_ALEN); - /* -* Set slave to master's mac address. The application already -* set the master's mac address to that of the first slave -*/ - memcpy(addr.sa_data, bond_dev->dev_addr, bond_dev->addr_len); - addr.sa_family = slave_dev->type; - res = dev_set_mac_address(slave_dev, &addr); - if (res) { - dprintk("Error %d calling set_mac_address\n", res); - goto err_free; + if (bond->do_set_mac_addr) { + /* +* Set slave to master's mac address. The application already +* set the master's mac address to that of the first slave +*/ + memcpy(addr.sa_data, bond_dev->dev_addr, bond_dev->addr_len); + addr.sa_family = slave_dev->type; + res = dev_set_mac_address(slave_dev, &addr); + if (res) { + dprintk("Error %d calling set_mac_address\n", res); + goto err_free; + } } res = netdev_set_master(slave_dev, bond_dev); @@ -1612,9 +1631,11 @@ err_close: dev_close(slave_dev); err_restore_mac: - memcpy(addr.sa_data, new_slave->perm_hwaddr, ETH_ALEN); - addr.sa_family = slave_dev->type; - dev_set_mac_address(slave_dev, &addr); + if (bond->do_set_mac_addr) { + memcpy(addr.sa_data, new_slave->perm_hwaddr, ETH_ALEN); + addr.sa_family = slave_dev->type; + dev_set_mac_address(slave_dev, &addr); + } err_free: kfree(new_slave); @@ -1792,10 +1813,12 @@ int bond_release(struct net_device *bond /* close slave before restoring its mac address */ dev_close(slave_dev); - /* restore original ("permanent") mac address */ - memcpy(addr.sa_da
[PATCH V4 5/10] net/bonding: Enable bonding to enslave non ARPHRD_ETHER
This patch changes some of the bond netdevice attributes and functions to be that of the active slave for the case of the enslaved device not being of ARPHRD_ETHER type. Basically it overrides those setting done by ether_setup(), which are netdevice **type** dependent and hence might be not appropriate for devices of other types. It also enforces mutual exclusion on bonding slaves from dissimilar ether types, as was concluded over the v1 discussion. IPoIB (see Documentation/infiniband/ipoib.txt) MAC address is made of a 3 bytes IB QP (Queue Pair) number and 16 bytes IB port GID (Global ID) of the port this IPoIB device is bounded to. The QP is a resource created by the IB HW and the GID is an identifier burned into the HCA (i have omitted here some details which are not important for the bonding RFC). Signed-off-by: Moni Shoua <[EMAIL PROTECTED]> Signed-off-by: Or Gerlitz <[EMAIL PROTECTED]> --- drivers/net/bonding/bond_main.c | 39 +++ 1 files changed, 39 insertions(+) Index: net-2.6/drivers/net/bonding/bond_main.c === --- net-2.6.orig/drivers/net/bonding/bond_main.c2007-08-15 10:08:59.0 +0300 +++ net-2.6/drivers/net/bonding/bond_main.c 2007-08-15 10:54:13.424688411 +0300 @@ -1237,6 +1237,26 @@ static int bond_compute_features(struct return 0; } + +static void bond_setup_by_slave(struct net_device *bond_dev, + struct net_device *slave_dev) +{ + bond_dev->hard_header = slave_dev->hard_header; + bond_dev->rebuild_header= slave_dev->rebuild_header; + bond_dev->hard_header_cache = slave_dev->hard_header_cache; + bond_dev->header_cache_update = slave_dev->header_cache_update; + bond_dev->hard_header_parse = slave_dev->hard_header_parse; + + bond_dev->neigh_setup = slave_dev->neigh_setup; + + bond_dev->type = slave_dev->type; + bond_dev->hard_header_len = slave_dev->hard_header_len; + bond_dev->addr_len = slave_dev->addr_len; + + memcpy(bond_dev->broadcast, slave_dev->broadcast, + slave_dev->addr_len); +} + /* enslave device to bond device */ int bond_enslave(struct 
net_device *bond_dev, struct net_device *slave_dev) { @@ -1311,6 +1331,25 @@ int bond_enslave(struct net_device *bond goto err_undo_flags; } + /* set bonding device ether type by slave - bonding netdevices are +* created with ether_setup, so when the slave type is not ARPHRD_ETHER +* there is a need to override some of the type dependent attribs/funcs. +* +* bond ether type mutual exclusion - don't allow slaves of dissimilar +* ether type (eg ARPHRD_ETHER and ARPHRD_INFINIBAND) share the same bond +*/ + if (bond->slave_cnt == 0) { + if (slave_dev->type != ARPHRD_ETHER) + bond_setup_by_slave(bond_dev, slave_dev); + } else if (bond_dev->type != slave_dev->type) { + printk(KERN_ERR DRV_NAME ": %s ether type (%d) is different " + "from other slaves (%d), can not enslave it.\n", + slave_dev->name, + slave_dev->type, bond_dev->type); + res = -EINVAL; + goto err_undo_flags; + } + if (slave_dev->set_mac_address == NULL) { printk(KERN_ERR DRV_NAME ": %s: Error: The slave device you specified does " - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 3/10] IB/ipoib: Bound the net device to the ipoib_neigh structure
IPoIB uses a two layer neighboring scheme, such that for each struct neighbour whose device is an ipoib one, there is a struct ipoib_neigh buddy which is created on demand at the tx flow by an ipoib_neigh_alloc(skb->dst->neighbour) call. When using the bonding driver, neighbours are created by the net stack on behalf of the bonding (master) device. On the tx flow the bonding code gets an skb such that skb->dev points to the master device, it changes this skb to point on the slave device and calls the slave hard_start_xmit function. Under this scheme, ipoib_neigh_destructor assumption that for each struct neighbour it gets, n->dev is an ipoib device and hence netdev_priv(n->dev) can be casted to struct ipoib_dev_priv is buggy. To fix it, this patch adds a dev field to struct ipoib_neigh which is used instead of the struct neighbour dev one, when n->dev->flags has the IFF_MASTER bit set. Signed-off-by: Moni Shoua <[EMAIL PROTECTED]> Signed-off-by: Or Gerlitz <[EMAIL PROTECTED]> --- drivers/infiniband/ulp/ipoib/ipoib.h |4 +++- drivers/infiniband/ulp/ipoib/ipoib_main.c | 17 +++-- drivers/infiniband/ulp/ipoib/ipoib_multicast.c |3 ++- 3 files changed, 20 insertions(+), 4 deletions(-) Index: net-2.6/drivers/infiniband/ulp/ipoib/ipoib.h === --- net-2.6.orig/drivers/infiniband/ulp/ipoib/ipoib.h 2007-08-15 10:09:00.0 +0300 +++ net-2.6/drivers/infiniband/ulp/ipoib/ipoib.h2007-08-15 10:53:52.756348574 +0300 @@ -328,6 +328,7 @@ struct ipoib_neigh { struct sk_buff_head queue; struct neighbour *neighbour; + struct net_device *dev; struct list_headlist; }; @@ -344,7 +345,8 @@ static inline struct ipoib_neigh **to_ip INFINIBAND_ALEN, sizeof(void *)); } -struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh); +struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh, + struct net_device *dev); void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh); extern struct workqueue_struct *ipoib_workqueue; Index: net-2.6/drivers/infiniband/ulp/ipoib/ipoib_main.c 
=== --- net-2.6.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c 2007-08-15 10:53:28.0 +0300 +++ net-2.6/drivers/infiniband/ulp/ipoib/ipoib_main.c 2007-08-15 10:53:52.757348397 +0300 @@ -511,7 +511,7 @@ static void neigh_add_path(struct sk_buf struct ipoib_path *path; struct ipoib_neigh *neigh; - neigh = ipoib_neigh_alloc(skb->dst->neighbour); + neigh = ipoib_neigh_alloc(skb->dst->neighbour, skb->dev); if (!neigh) { ++priv->stats.tx_dropped; dev_kfree_skb_any(skb); @@ -830,6 +830,17 @@ static void ipoib_neigh_cleanup(struct n unsigned long flags; struct ipoib_ah *ah = NULL; + if (n->dev->flags & IFF_MASTER) { + /* n->dev is not an IPoIB device and we have + to take priv from elsewhere */ + neigh = *to_ipoib_neigh(n); + if (neigh) { + priv = netdev_priv(neigh->dev); + ipoib_dbg(priv, "neigh_destructor for bonding device: %s\n", + n->dev->name); + } else + return; + } ipoib_dbg(priv, "neigh_cleanup for %06x " IPOIB_GID_FMT "\n", IPOIB_QPN(n->ha), @@ -851,7 +862,8 @@ static void ipoib_neigh_cleanup(struct n ipoib_put_ah(ah); } -struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour) +struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour, + struct net_device *dev) { struct ipoib_neigh *neigh; @@ -860,6 +872,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(st return NULL; neigh->neighbour = neighbour; + neigh->dev = dev; *to_ipoib_neigh(neighbour) = neigh; skb_queue_head_init(&neigh->queue); ipoib_cm_set(neigh, NULL); Index: net-2.6/drivers/infiniband/ulp/ipoib/ipoib_multicast.c === --- net-2.6.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2007-08-15 10:09:00.0 +0300 +++ net-2.6/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2007-08-15 10:53:52.758348220 +0300 @@ -727,7 +727,8 @@ out: if (skb->dst&& skb->dst->neighbour && !*to_ipoib_neigh(skb->dst->neighbour)) { - struct ipoib_neigh *neigh = ipoib_neigh_alloc(skb->dst->neighbour); + struct ipoib_neigh *neigh = ipoib_neigh_alloc(skb->dst->neighbour, + skb->dev);
[PATCH V4 4/10] IB/ipoib: Verify address handle validity on send
When the bonding device senses a carrier loss of its active slave it replaces that slave with a new one. In between the times when the carrier of an IPoIB device goes down and ipoib_neigh is destroyed, it is possible that the bonding driver will send a packet on a new slave that uses an old ipoib_neigh. This patch detects and prevents this from happening. Signed-off-by: Moni Shoua <[EMAIL PROTECTED]> Signed-off-by: Or Gerlitz <[EMAIL PROTECTED]> --- drivers/infiniband/ulp/ipoib/ipoib_main.c |5 +++-- 1 files changed, 3 insertions(+), 2 deletions(-) Index: net-2.6/drivers/infiniband/ulp/ipoib/ipoib_main.c === --- net-2.6.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c 2007-08-15 10:53:52.0 +0300 +++ net-2.6/drivers/infiniband/ulp/ipoib/ipoib_main.c 2007-08-15 10:54:03.959364640 +0300 @@ -686,9 +686,10 @@ static int ipoib_start_xmit(struct sk_bu goto out; } } else if (neigh->ah) { - if (unlikely(memcmp(&neigh->dgid.raw, + if (unlikely((memcmp(&neigh->dgid.raw, skb->dst->neighbour->ha + 4, - sizeof(union ib_gid { + sizeof(union ib_gid))) || +(neigh->dev != dev))) { spin_lock(&priv->lock); /* * It's safe to call ipoib_put_ah() inside - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 2/10] IB/ipoib: Notify the world before doing unregister
When the bonding device enslaves IPoIB devices it takes pointers to functions in the ib_ipoib module. This is fine as long as the ib_ipoib module remains loaded while the references to its functions exist. So, to help bonding do a cleanup on time, when the IPoIB net device is a slave of a bonding master, let the master know that the IPoIB device is about to unregister (but before calling unregister). Signed-off-by: Moni Shoua <[EMAIL PROTECTED]> --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 15 +++ 1 files changed, 15 insertions(+) Index: net-2.6/drivers/infiniband/ulp/ipoib/ipoib_main.c === --- net-2.6.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c 2007-08-20 14:29:29.522209580 +0300 +++ net-2.6/drivers/infiniband/ulp/ipoib/ipoib_main.c 2007-08-20 14:43:03.432162133 +0300 @@ -48,6 +48,7 @@ #include #include +#include MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); @@ -772,6 +773,18 @@ static void ipoib_timeout(struct net_dev /* XXX reset QP, etc. */ } +static int ipoib_slave_detach(struct net_device *dev) +{ + int ret = 0; + if (dev->flags & IFF_SLAVE) { + dev->priv_flags |= IFF_SLAVE_DETACH; + rtnl_lock(); + ret = call_netdevice_notifiers(NETDEV_CHANGE, dev); + rtnl_unlock(); + } + return ret; +} + static int ipoib_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, @@ -921,6 +934,7 @@ void ipoib_dev_cleanup(struct net_device /* Delete any child interfaces first */ list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { + ipoib_slave_detach(cpriv->dev); unregister_netdev(cpriv->dev); ipoib_dev_cleanup(cpriv->dev); free_netdev(cpriv->dev); @@ -1208,6 +1222,7 @@ static void ipoib_remove_one(struct ib_d ib_unregister_event_handler(&priv->event_handler); flush_scheduled_work(); + ipoib_slave_detach(priv->dev); unregister_netdev(priv->dev); ipoib_dev_cleanup(priv->dev); free_netdev(priv->dev); - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to 
[EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 1/10] IB/ipoib: Export call to call_netdevice_notifiers and add new private flag
Export the call to raw_notifier_call_chain so modules can send notifications on netdev events to the netdev_chain. Add IFF_SLAVE_DETACH to the list of priv_flags for net_device. This flag is set by a slave that is about to unregister from the kernel. Both changes are used in bonding slaves that wish to inform the bonding master about coming detachment. Signed-off-by: Moni Shoua <[EMAIL PROTECTED]> --- include/linux/if.h |1 + net/core/dev.c |1 + 2 files changed, 2 insertions(+) Index: net-2.6/net/core/dev.c === --- net-2.6.orig/net/core/dev.c 2007-08-15 10:09:02.0 +0300 +++ net-2.6/net/core/dev.c 2007-08-15 10:53:00.832543390 +0300 @@ -1148,6 +1148,7 @@ int call_netdevice_notifiers(unsigned lo { return raw_notifier_call_chain(&netdev_chain, val, v); } +EXPORT_SYMBOL(call_netdevice_notifiers); /* When > 0 there are consumers of rx skb time stamps */ static atomic_t netstamp_needed = ATOMIC_INIT(0); Index: net-2.6/include/linux/if.h === --- net-2.6.orig/include/linux/if.h 2007-08-20 14:30:39.0 +0300 +++ net-2.6/include/linux/if.h 2007-08-20 14:31:06.625174369 +0300 @@ -61,6 +61,7 @@ #define IFF_MASTER_ALB 0x10/* bonding master, balance-alb. */ #define IFF_BONDING0x20/* bonding master or slave */ #define IFF_SLAVE_NEEDARP 0x40 /* need ARPs for validation */ +#define IFF_SLAVE_DETACH 0x80 /* slave is about to unregister */ #define IF_GET_IFACE 0x0001 /* for querying only */ #define IF_GET_PROTO 0x0002 - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 0/10] net/bonding: ADD IPoIB support for the bonding driver
This patch series is the fourth version (see below link to V3) of the suggested changes to the bonding driver so it would be able to support non ARPHRD_ETHER netdevices for its High-Availability (active-backup) mode. The motivation is to enable the bonding driver on its HA mode to work with the IP over Infiniband (IPoIB) driver. With these patches I was able to enslave IPoIB netdevices and run TCP, UDP, IP (UDP) Multicast and ICMP traffic with fail-over and fail-back working fine. The working environment was the net-2.6 git. Moreover, as IPoIB is also the IB ARP provider for the RDMA CM driver which is used by native IB ULPs whose addressing scheme is based on IP (e.g. iSER, SDP, Lustre, NFSoRDMA, RDS), bonding support for IPoIB devices **enables** HA for these ULPs. This holds as when the ULP is informed by the IB HW on the failure of the current IB connection, it just needs to reconnect, where the bonding device will now issue the IB ARP over the active IPoIB slave. This series also includes patches to the IPoIB driver that fix some neighboring related issues. Major changes from the previous version: 1) Addressing the issue of safety when unloading the IPoIB module before the bonding module 2) style changes Links to earlier discussion: 1. A discussion in netdev about bonding support for IPoIB. http://lists.openwall.net/netdev/2006/11/30/46 2. A discussion in openfabrics regarding changes in the IPoIB that enable using it as a slave for bonding. http://lists.openfabrics.org/pipermail/general/2007-July/038914.html - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 17/18] 3c59x: fix duplex configuration
On Fri, Aug 10, 2007 at 02:05:26PM -0700, [EMAIL PROTECTED] wrote: [...] > diff -puN drivers/net/3c59x.c~3c59x-fix-duplex-configuration > drivers/net/3c59x.c > --- a/drivers/net/3c59x.c~3c59x-fix-duplex-configuration > +++ a/drivers/net/3c59x.c > @@ -1559,6 +1559,7 @@ vortex_up(struct net_device *dev) > mii_reg1 = mdio_read(dev, vp->phys[0], MII_BMSR); > mii_reg5 = mdio_read(dev, vp->phys[0], MII_LPA); > vp->partner_flow_ctrl = ((mii_reg5 & 0x0400) != 0); > + vp->mii.full_duplex = vp->full_duplex; > > vortex_check_media(dev, 1); > } > _ Sorry for the late reply. I finally managed to get my notebook fixed so that I could actually test this patch. I can confirm that it fixes my duplex configuration problem. The steps described in http://bugzilla.kernel.org/show_bug.cgi?id=8575 now result in an Ethernet chip properly configured for full duplex. Thanks for the fix! The only remaining issue I have with the 3c59x driver is the time required until it detects link loss when unplugging the Ethernet cable. At the moment, this needs up to 60 seconds which makes this feature pretty useless. Other drivers need 2-5 seconds for this which is roughly what I would have expected. I've been using the patch below successfully for a few weeks now which brings down this time to 5 seconds. Would be nice if somebody could apply it. Thanks, Martin --- drivers/net/3c59x.c.orig2007-08-20 17:01:06.0 +0200 +++ drivers/net/3c59x.c 2007-08-20 17:02:38.0 +0200 @@ -1726,7 +1726,7 @@ struct net_device *dev = (struct net_device *)data; struct vortex_private *vp = netdev_priv(dev); void __iomem *ioaddr = vp->ioaddr; - int next_tick = 60*HZ; + int next_tick = 5*HZ; int ok = 0; int media_status, old_window; @@ -1771,9 +1771,6 @@ ok = 1; } - if (!netif_carrier_ok(dev)) - next_tick = 5*HZ; - if (vp->medialock) goto leave_media_alone; - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Marvell 88E8056 gigabit ethernet controller
On Sun, 19 Aug 2007 18:15:47 -0700 (PDT) Kevin E <[EMAIL PROTECTED]> wrote: > Someone wrote me with a solution to try and so far > it's working. They suggested I try the driver up on > Marvell's website but to make sure I powered off the > machine completely and when it rebooted to not have > any of the regular kernel drivers for the Marvell > chipset to load. They had found that letting the sky2 > load and then unloading the module would mean the > vendor's driver wouldn't work. > > So I got down the latest driver package they have > (10.0.5.3). At first I couldn't get it compiled > against kernel 2.6.22.3 that I'm running, but I have > it compiled with the 2.6.21.5 kernel, which is what > the machine is running now. And I'm happy to say that > it's working fine so far. I've transfered about 4G > over the link and it's still working fine. > > Since Marvell's driver seems to be working for the > 88E8056 chipset and from what I've looked at the code > it's marked as GPL, could it be rolled into the kernel > for those of us that have 88E8056 chipsets that are > working to use? > Submit for code review, and cleanup the resulting comments. Good luck. -- Stephen Hemminger <[EMAIL PROTECTED]> - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/4 - rev2] Add new timeval_to_sec function
On Mon, 20 Aug 2007 13:45:36 +0530 Varun Chandramohan <[EMAIL PROTECTED]> wrote: > A new function for converting timeval to time_t is added in time.h. Its a > common function used in different > places. > > Signed-off-by: Varun Chandramohan <[EMAIL PROTECTED]> > --- > include/linux/time.h | 12 > 1 files changed, 12 insertions(+), 0 deletions(-) > > diff --git a/include/linux/time.h b/include/linux/time.h > index 6a5f503..1faf65c 100644 > --- a/include/linux/time.h > +++ b/include/linux/time.h > @@ -149,6 +149,18 @@ static inline s64 timeval_to_ns(const st > } > > /** > + * timeval_to_sec - Convert timeval to seconds > + * @tv: pointer to the timeval variable to be converted > + * > + * Returns the seconds representation of timeval parameter. > + * Note : Here we round up the value. We dont need accuracy. > + */ > +static inline time_t timeval_to_sec(const struct timeval *tv) > +{ > + return (tv->tv_sec + (tv->tv_usec ? 1 : 0)); > +} > + Why roundup? Unless there is a requirement in the standard, please just use the timeval seconds. In which case the inline is unneeded. -- Stephen Hemminger <[EMAIL PROTECTED]> - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: skb_pull_rcsum - Fatal exception in interrupt
Evgeniy Polyakov <[EMAIL PROTECTED]> wrote: > > Actually if dmesg will show that there is something in fragments, it > should use pskb_may_pull(). The same bug exist in bridge and vlan, btw, > so it might be a solution to remove bug_on from skb_pull_rcsum() and > instead call may_pull? That would be too easy :) As was the case here, the data pulled has already been accessed so calling pskb_may_pull in the pulling function is too late. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Herbert Xu wrote: On Mon, Aug 20, 2007 at 09:15:11AM -0400, Chris Snook wrote: Linus Torvalds wrote: So the only reason to add back "volatile" to the atomic_read() sequence is not to fix bugs, but to _hide_ the bugs better. They're still there, they are just a lot harder to trigger, and tend to be a lot subtler. What about barrier removal? With consistent semantics we could optimize a fair amount of code. Whether or not that constitutes "premature" optimization is open to debate, but there's no question we could reduce our register wiping in some places. If you've been reading all of Linus's emails you should be thinking about adding memory barriers, and not removing compiler barriers. He's just told you that code of the kind while (!atomic_read(cond)) ; do_something() probably needs a memory barrier (not just compiler) so that do_something() doesn't see stale cache content that occured before cond flipped. Such code generally doesn't care precisely when it gets the update, just that the update is atomic, and it doesn't loop forever. Regardless, I'm convinced we just need to do it all in assembly. -- Chris - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Christoph Lameter wrote: On Fri, 17 Aug 2007, Paul E. McKenney wrote: On Sat, Aug 18, 2007 at 08:09:13AM +0800, Herbert Xu wrote: On Fri, Aug 17, 2007 at 04:59:12PM -0700, Paul E. McKenney wrote: gcc bugzilla bug #33102, for whatever that ends up being worth. ;-) I had totally forgotten that I'd already filed that bug more than six years ago until they just closed yours as a duplicate of mine :) Good luck in getting it fixed! Well, just got done re-opening it for the third time. And a local gcc community member advised me not to give up too easily. But I must admit that I am impressed with the speed that it was identified as duplicate. Should be entertaining! ;-) Right. ROTFL... volatile actually breaks atomic_t instead of making it safe. x++ becomes a register load, increment and a register store. Without volatile we can increment the memory directly. It seems that volatile requires that the variable is loaded into a register first and then operated upon. Understandable when you think about volatile being used to access memory mapped I/O registers where a RMW operation could be problematic. So, if we want consistent behavior, we're pretty much screwed unless we use inline assembler everywhere? -- Chris - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Mon, Aug 20, 2007 at 09:15:11AM -0400, Chris Snook wrote: > Linus Torvalds wrote: > >So the only reason to add back "volatile" to the atomic_read() sequence is > >not to fix bugs, but to _hide_ the bugs better. They're still there, they > >are just a lot harder to trigger, and tend to be a lot subtler. > > What about barrier removal? With consistent semantics we could optimize a > fair amount of code. Whether or not that constitutes "premature" > optimization is open to debate, but there's no question we could reduce our > register wiping in some places. If you've been reading all of Linus's emails you should be thinking about adding memory barriers, and not removing compiler barriers. He's just told you that code of the kind while (!atomic_read(cond)) ; do_something() probably needs a memory barrier (not just compiler) so that do_something() doesn't see stale cache content that occurred before cond flipped. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: LDD3 pitfalls (was Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures)
Stefan Richter wrote: Nick Piggin wrote: Stefan Richter wrote: Nick Piggin wrote: I don't know why people would assume volatile of atomics. AFAIK, most of the documentation is pretty clear that all the atomic stuff can be reordered etc. except for those that modify and return a value. Which documentation is there? Documentation/atomic_ops.txt For driver authors, there is LDD3. It doesn't specifically cover effects of optimization on accesses to atomic_t. For architecture port authors, there is Documentation/atomic_ops.txt. Driver authors also can learn something from that document, as it indirectly documents the atomic_t and bitops APIs. "Semantics and Behavior of Atomic and Bitmask Operations" is pretty direct :) Sure, it says that it's for arch maintainers, but there is no reason why users can't make use of it. Note, LDD3 page 238 says: "It is worth noting that most of the other kernel primitives dealing with synchronization, such as spinlock and atomic_t operations, also function as memory barriers." I don't know about Linux 2.6.10 against which LDD3 was written, but currently only _some_ atomic_t operations function as memory barriers. Besides, judging from some posts in this thread, saying that atomic_t operations dealt with synchronization may not be entirely precise. atomic_t is often used as the basis for implementing more sophisticated synchronization mechanisms, such as rwlocks. Whether or not they are designed for that purpose, the atomic_* operations are de facto synchronization primitives. -- Chris - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] spidernet: fix interrupt reason recognition
This patch solves a problem that the spidernet driver sometimes fails to handle IRQ. The problem happens because, - In Cell architecture, interrupts may arrive at an interrupt controller, even if they are masked by the setting on registers of devices. It happens when interrupt packets are sent just before the interrupts are masked. - spidernet interrupt handler compares interrupt reasons with interrupt masks, so when such interrupts occur, spidernet interrupt handler returns IRQ_NONE. - When all of the interrupt handlers return IRQ_NONE, linux kernel disables the IRQ and it no longer delivers interrupts to the interrupt handlers. spidernet doesn't work after the above sequence, because it can't receive interrupts. This patch changes spidernet interrupt handler so that it compares interrupt reason with SPIDER_NET_INTX_MASK_VALUE. Signed-off-by: Kou Ishizaki <[EMAIL PROTECTED]> --- Linas-san, Please apply this to 2.6.23, because this problem sometimes happens and then we cannot use the ethernet port any more. And also, please apply the following Arnd-san's patch to fix a problem that spidernet driver sometimes causes a BUG_ON at open.
http://patchwork.ozlabs.org/cbe-oss-dev/patch?id=12211 Index: linux-powerpc-git/drivers/net/spider_net.c === --- linux-powerpc-git.orig/drivers/net/spider_net.c 2007-07-19 18:42:02.0 +0900 +++ linux-powerpc-git/drivers/net/spider_net.c 2007-08-20 20:52:23.0 +0900 @@ -1441,17 +1441,14 @@ static void spider_net_handle_error_irq(struct spider_net_card *card, u32 status_reg) { u32 error_reg1, error_reg2; - u32 mask_reg1, mask_reg2; u32 i; int show_error = 1; error_reg1 = spider_net_read_reg(card, SPIDER_NET_GHIINT1STS); error_reg2 = spider_net_read_reg(card, SPIDER_NET_GHIINT2STS); - mask_reg1 = spider_net_read_reg(card, SPIDER_NET_GHIINT1MSK); - mask_reg2 = spider_net_read_reg(card,SPIDER_NET_GHIINT2MSK); - error_reg1 &= mask_reg1; - error_reg2 &= mask_reg2; + error_reg1 &= SPIDER_NET_INT1_MASK_VALUE; + error_reg2 &= SPIDER_NET_INT2_MASK_VALUE; /* check GHIINT0STS / if (status_reg) @@ -1679,11 +1676,10 @@ spider_net_interrupt(int irq, void *ptr) { struct net_device *netdev = ptr; struct spider_net_card *card = netdev_priv(netdev); - u32 status_reg, mask_reg; + u32 status_reg; status_reg = spider_net_read_reg(card, SPIDER_NET_GHIINT0STS); - mask_reg = spider_net_read_reg(card, SPIDER_NET_GHIINT0MSK); - status_reg &= mask_reg; + status_reg &= SPIDER_NET_INT0_MASK_VALUE; if (!status_reg) return IRQ_NONE; - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/5] [TCP]: Rename tcp_ack_packets_out -> tcp_rearm_rto
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= <[EMAIL PROTECTED]> Only thing that tiny function does is rearming the RTO (if necessary), name it accordingly. Signed-off-by: Ilpo Järvinen <[EMAIL PROTECTED]> --- net/ipv4/tcp_input.c |5 ++--- 1 files changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 45ad32c..2bf3d57 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2406,8 +2406,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ - -static void tcp_ack_packets_out(struct sock *sk) +static void tcp_rearm_rto(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2560,7 +2559,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) = inet_csk(sk)->icsk_ca_ops; tcp_ack_update_rtt(sk, acked, seq_rtt); - tcp_ack_packets_out(sk); + tcp_rearm_rto(sk); if (tcp_is_reno(tp)) tcp_remove_reno_sacks(sk, pkts_acked); -- 1.5.0.6 - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/5] [TCP] MIB: Add counters for discarded SACK blocks
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= <[EMAIL PROTECTED]> In DSACK case, some events are not extraordinary, such as packet duplication generated DSACK. They can arrive easily below snd_una when undo_marker is not set (TCP being in CA_Open), counting such DSACKs amoung SACK discards will likely just mislead if they occur in some scenario when there are other problems as well. Similarly, excessively delayed packets could cause "normal" DSACKs. Therefore, separate counters are allocated for DSACK events. Signed-off-by: Ilpo Järvinen <[EMAIL PROTECTED]> --- include/linux/snmp.h |3 +++ net/ipv4/proc.c |3 +++ net/ipv4/tcp_input.c | 10 +- 3 files changed, 15 insertions(+), 1 deletions(-) diff --git a/include/linux/snmp.h b/include/linux/snmp.h index 802b3a3..d24c554 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -231,6 +231,9 @@ enum LINUX_MIB_TCPABORTONLINGER, /* TCPAbortOnLinger */ LINUX_MIB_TCPABORTFAILED, /* TCPAbortFailed */ LINUX_MIB_TCPMEMORYPRESSURES, /* TCPMemoryPressures */ + LINUX_MIB_TCPSACKDISCARD, /* TCPSACKDiscard */ + LINUX_MIB_TCPDSACKIGNOREDOLD, /* TCPSACKIgnoredOld */ + LINUX_MIB_TCPDSACKIGNOREDNOUNDO,/* TCPSACKIgnoredNoUndo */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3b690cf..986d1c8 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -244,6 +244,9 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER), SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED), SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES), + SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD), + SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD), + SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 102aefa..8692d0b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1222,8 +1222,16 @@ tcp_sacktag_write_queue(struct 
sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int fack_count; int dup_sack = (found_dup_sack && (i == first_sack_index)); - if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) + if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) { + if (dup_sack) { + if (!tp->undo_marker) + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO); + else + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDOLD); + } else + NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); continue; + } skb = cached_skb; fack_count = cached_fack_count; -- 1.5.0.6 - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Linus Torvalds wrote: So the only reason to add back "volatile" to the atomic_read() sequence is not to fix bugs, but to _hide_ the bugs better. They're still there, they are just a lot harder to trigger, and tend to be a lot subtler. What about barrier removal? With consistent semantics we could optimize a fair amount of code. Whether or not that constitutes "premature" optimization is open to debate, but there's no question we could reduce our register wiping in some places. -- Chris - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/5] [TCP]: tcp_packets_out_inc to tcp_output.c (no callers elsewhere)
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= <[EMAIL PROTECTED]> Signed-off-by: Ilpo Järvinen <[EMAIL PROTECTED]> --- include/net/tcp.h | 12 net/ipv4/tcp_output.c | 12 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 6d586ca..f28f382 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -614,18 +614,6 @@ static inline void tcp_dec_pcount_approx(__u32 *count, tcp_dec_pcount_approx_int(count, tcp_skb_pcount(skb)); } -static inline void tcp_packets_out_inc(struct sock *sk, - const struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - int orig = tp->packets_out; - - tp->packets_out += tcp_skb_pcount(skb); - if (!orig) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, TCP_RTO_MAX); -} - /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1d65ce1..a61a3e3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -61,6 +61,18 @@ int sysctl_tcp_base_mss __read_mostly = 512; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +static inline void tcp_packets_out_inc(struct sock *sk, + const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + int orig = tp->packets_out; + + tp->packets_out += tcp_skb_pcount(skb); + if (!orig) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); +} + static void update_send_head(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); -- 1.5.0.6 - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/5] [TCP]: Remove unnecessary wrapper tcp_packets_out_dec
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= <[EMAIL PROTECTED]> Makes caller side more obvious, there's no need to have a wrapper for this oneliner! Signed-off-by: Ilpo Järvinen <[EMAIL PROTECTED]> --- include/net/tcp.h |6 -- net/ipv4/tcp_input.c |2 +- net/ipv4/tcp_output.c |2 +- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 7c65989..6d586ca 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -626,12 +626,6 @@ static inline void tcp_packets_out_inc(struct sock *sk, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } -static inline void tcp_packets_out_dec(struct tcp_sock *tp, - const struct sk_buff *skb) -{ - tp->packets_out -= tcp_skb_pcount(skb); -} - /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 96ced89..45ad32c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2548,7 +2548,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) last_ackt = skb->tstamp; } tcp_dec_pcount_approx(&tp->fackets_out, skb); - tcp_packets_out_dec(tp, skb); + tp->packets_out -= tcp_skb_pcount(skb); tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); clear_all_retrans_hints(tp); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a367917..1d65ce1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1735,7 +1735,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m * it is better to underestimate fackets. */ tcp_dec_pcount_approx(&tp->fackets_out, next_skb); - tcp_packets_out_dec(tp, next_skb); + tp->packets_out -= tcp_skb_pcount(next_skb); sk_stream_free_skb(sk, next_skb); } } -- 1.5.0.6 - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-2.6.24 0/5] TCP: Cleanups & SACK block validation
Hi Dave, Here are couple of patches to net-2.6.24. The first three are trivial cleanups. The idea to the last two comes from tcp-2.6 but the validator has been heavily modified (and hopefully improved in the process :-)). I'm not sure though if checking for the undo_marker boundary crossing case is a bit over-engineering (inherited from the original version which already checked for that case). In addition, better names could be invented for MIBs, suggestions? -- i. - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/5] [TCP]: Discard fuzzy SACK blocks
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= <[EMAIL PROTECTED]> SACK processing code has been a sort of russian roulette as no validation of SACK blocks is previously attempted. Besides, it is not very clear what all kinds of broken SACK blocks really mean (e.g., one that has start and end sequence numbers reversed). So now close the roulette once and for all. Signed-off-by: Ilpo Järvinen <[EMAIL PROTECTED]> --- net/ipv4/tcp_input.c | 82 ++ 1 files changed, 82 insertions(+), 0 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2bf3d57..102aefa 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1001,7 +1001,86 @@ static void tcp_update_reordering(struct sock *sk, const int metric, *for retransmitted and already SACKed segment -> reordering.. * Both of these heuristics are not used in Loss state, when we cannot * account for retransmits accurately. + * + * SACK block validation. + * -- + * + * SACK block range validation checks that the received SACK block fits to + * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. + * Note that SND.UNA is not included to the range though being valid because + * it means that the receiver is rather inconsistent with itself (reports + * SACK reneging when it should advance SND.UNA). + * + * Implements also blockage to start_seq wrap-around. Problem lies in the + * fact that though start_seq (s) is before end_seq (i.e., not reversed), + * there's no guarantee that it will be before snd_nxt (n). The problem + * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt + * wrap (s_w): + * + * <- outs wnd -> <- wrapzone -> + * u e n u_w e_w s n_w + * | | | | | | | + * |<+--+- TCP seqno space --+-->| + * ...-- <2^31 ->| |<... + * ... >2^31 -->||<... + * + * Current code wouldn't be vulnerable but it's better still to discard such + * crazy SACK blocks. 
Doing this check for start_seq alone closes somewhat + * similar case (end_seq after snd_nxt wrap) as earlier reversed check in + * snd_nxt wrap -> snd_una region will then become "well defined", i.e., + * equal to the ideal case (infinite seqno space without wrap caused issues). + * + * With D-SACK the lower bound is extended to cover sequence space below + * SND.UNA down to undo_marker, which is the last point of interest. Yet + * again, DSACK block must not to go across snd_una (for the same reason as + * for the normal SACK blocks, explained above). But there all simplicity + * ends, TCP might receive valid D-SACKs below that. As long as they reside + * fully below undo_marker they do not affect behavior in anyway and can + * therefore be safely ignored. In rare cases (which are more or less + * theoretical ones), the D-SACK will nicely cross that boundary due to skb + * fragmentation and packet reordering past skb's retransmission. To consider + * them correctly, the acceptable range must be extended even more though + * the exact amount is rather hard to quantify. However, tp->max_window can + * be used as an exaggerated estimate. */ +static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, + u32 start_seq, u32 end_seq) +{ + /* Too far in future, or reversed (interpretation is ambiguous) */ + if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) + return 0; + + /* Nasty start_seq wrap-around check (see comments above) */ + if (!before(start_seq, tp->snd_nxt)) + return 0; + + /* In outstanding window? ...This is valid exit for DSACKs too. 
+* start_seq == snd_una is non-sensical (see comments above) +*/ + if (after(start_seq, tp->snd_una)) + return 1; + + if (!is_dsack || !tp->undo_marker) + return 0; + + /* ...Then it's D-SACK, and must reside below snd_una completely */ + if (!after(end_seq, tp->snd_una)) + return 0; + + if (!before(start_seq, tp->undo_marker)) + return 1; + + /* Too old */ + if (!after(end_seq, tp->undo_marker)) + return 0; + + /* Undo_marker boundary crossing (overestimates a lot). Known already: +* start_seq < undo_marker and end_seq >= undo_marker. +*/ + return !before(start_seq, end_seq - tp->max_window); +} + + static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una) @@ -1143,6 +1222,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int
Re: [RFC] restore netdev_priv optimization
Hi, David Miller wrote: From: Stephen Hemminger <[EMAIL PROTECTED]> Date: Fri, 17 Aug 2007 15:40:22 -0700 Compile tested only!!! Obviously. The first loopback transmit is guaranteed to crash. [...] And this also breaks loopback again, which uses a static struct netdev in the kernel image, it doesn't use alloc_netdev(), so egress_subqueue of loopback will be NULL. Talking about loopback, don't you think it could be the right time to make it behave like any other kind of net devices, and allocate it dynamically? Having a dynamically allocated loopback could make maintenance easier (removing special cases). Also this is something we'll need to support multiple loopbacks for example for network namespaces. Eric Biederman has written a nice patch that does this. I'm using it on 2.6.23-rc2. Benjamin -- B e n j a m i n T h e r y - BULL/DT/Open Software R&D http://www.bull.com - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCPportsfrom the host TCP port space.
"Felix Marti" <[EMAIL PROTECTED]> writes: > > avoidance gains of TSO and LRO are still a very worthwhile savings. > So, i.e. with TSO, your saving about 16 headers (let us say 14 + 20 + > 20), 864B, when moving ~64KB of payload - looks like very much in the > noise to me. TSO is beneficial for the software again. The linux code currently takes several locks and does quite a few function calls for each packet and using larger packets lowers this overhead. At least with 10GbE saving CPU cycles is still quite important. > an option to get 'high performance' Shouldn't you qualify that? It is unlikely you really duplicated all the tuning for corner cases that went over many years into good software TCP stacks in your hardware. So e.g. for wide area networks with occasional packet loss the software might well perform better. -Andi - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCPportsfrom the host TCP port space.
On Sun, Aug 19, 2007 at 05:47:59PM -0700, Felix Marti ([EMAIL PROTECTED]) wrote: > [Felix Marti] David and Herbert, so you agree that the user<>kernel > space memory copy overhead is a significant overhead and we want to > enable zero-copy in both the receive and transmit path? - Yes, copy It depends. If you need to access that data after it is received, you will get a cache miss and performance will not be much better (if any) than with copy. > avoidance is mainly an API issue and unfortunately the so widely used > (synchronous) sockets API doesn't make copy avoidance easy, which is one > area where protocol offload can help. Yes, some apps can resort to > sendfile() but there are many apps which seem to have trouble switching > to that API... and what about the receive path? There are a number of implementations, and all they are suitable for is to have recvfile(), since this is likely the only case which can work without cache. And actually the RDMA stack exists and no one said it should be thrown away _until_ it messes with the main stack. It started to steal ports. What will happen when it gets all port space and no new legal network connection can be opened, although there is no way to show to the user who got it? What will happen if a hardware RDMA connection got terminated and software could not free the port? Will RDMA request to export connection reset functions out of stack to drop network connections which are on the ports which are supposed to be used by new RDMA connections? RDMA is not a problem, but how it influences the network stack is. Let's better think about how to work correctly with network stack (since we already have that cr^Wdifferent hardware) instead of saying that others do bad work and do not allow shiny new feature to exist. -- Evgeniy Polyakov - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: PROBLEM: 2.6.23-rc "NETDEV WATCHDOG: eth0: transmit timed out"
The error exists from patch 2 on. I did some network testing with patch 1 and currently use it and have no errors so far. From my experience so far, patch 1 should be error free. 2007/8/16, Francois Romieu <[EMAIL PROTECTED]>: > (please do not remove the netdev Cc:) > > Francois Romieu <[EMAIL PROTECTED]> : > [...] > > If it does not work I'll dissect 0e4851502f846b13b29b7f88f1250c980d57e944 > > tomorrow. > > You will find a tgz archive in attachment which contains a series of patches > (0001-... to 0005-...) to walk from 6dccd16b7c2703e8bbf8bca62b5cf248332afbe2 > to 0e4851502f846b13b29b7f88f1250c980d57e944 in smaller steps. > > Please apply 0001 on top of 6dccd16b7c2703e8bbf8bca62b5cf248332afbe2. If it > still works, apply 0002 on top of 0001, etc. > > -- > Ueimor > > - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFT] r8169 changes against 2.6.23-rc3
On 8/19/07, Bruce Cole <[EMAIL PROTECTED]> wrote: > So it seems that when the driver tries to queue a packet while the > controller is busy processing the queue, the newly queued packet does > not get noticed by the controller (until further packet activity occurs). > Perhaps there is a problem with the memory barriers when adding to the > TX queue, but I'm a newbie on linux kernel memory barriers. One thing I noticed a while ago (march) is that floodpinging (ping -f) the r8169 host from an external system also increases performance without changing code. My original post about the problem: http://marc.info/?l=linux-netdev&m=117207362010321&w=2 I ended up (until now perhaps :-) with disabling the onboard nic and adding an e1000 card. Kind regards, Dirk - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/4 - rev2] Initialize and fill IPv6 route age
The age field of the ipv6 route structures are initilized with the current timeval at the time of route creation. When the route dump is called the route age value stored in the structure is subtracted from the present timeval and the difference is passed on as the route age. Signed-off-by: Varun Chandramohan <[EMAIL PROTECTED]> --- include/net/ip6_fib.h |1 + include/net/ip6_route.h |3 +++ net/ipv6/addrconf.c |5 + net/ipv6/route.c| 24 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c48ea87..e30a1cf 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -98,6 +98,7 @@ struct rt6_info u32 rt6i_flags; u32 rt6i_metric; + time_t rt6i_age; atomic_trt6i_ref; struct fib6_table *rt6i_table; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 5456fdd..fc9716c 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -36,6 +36,9 @@ struct route_info { #define RT6_LOOKUP_F_REACHABLE 0x2 #define RT6_LOOKUP_F_HAS_SADDR 0x4 +#define RT6_SET_ROUTE_INFO 0x0 +#define RT6_GET_ROUTE_INFO 0x1 + extern struct rt6_info ip6_null_entry; #ifdef CONFIG_IPV6_MULTIPLE_TABLES diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 91ef3be..666ec28 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4182,6 +4182,7 @@ EXPORT_SYMBOL(unregister_inet6addr_notif int __init addrconf_init(void) { + struct timeval tv; int err = 0; /* The addrconf netdev notifier requires that loopback_dev @@ -4209,10 +4210,14 @@ int __init addrconf_init(void) if (err) return err; + do_gettimeofday(&tv); ip6_null_entry.rt6i_idev = in6_dev_get(&loopback_dev); + ip6_null_entry.rt6i_age = timeval_to_sec(&tv); #ifdef CONFIG_IPV6_MULTIPLE_TABLES ip6_prohibit_entry.rt6i_idev = in6_dev_get(&loopback_dev); + ip6_prohibit_entry.rt6i_age = timeval_to_sec(&tv); ip6_blk_hole_entry.rt6i_idev = in6_dev_get(&loopback_dev); + ip6_blk_hole_entry.rt6i_age = timeval_to_sec(&tv); #endif 
register_netdevice_notifier(&ipv6_dev_notf); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 55ea80f..9df756c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -600,7 +600,14 @@ static int __ip6_ins_rt(struct rt6_info { int err; struct fib6_table *table; + struct timeval tv; + do_gettimeofday(&tv); + /* Update the timeval for new routes +* We add it here to make it common irrespective +* of how the new route is added. +*/ + rt->rt6i_age = timeval_to_sec(&tv); table = rt->rt6i_table; write_lock_bh(&table->tb6_lock); err = fib6_add(&table->tb6_root, rt, info); @@ -2112,6 +2119,7 @@ static inline size_t rt6_nlmsg_size(void + nla_total_size(4) /* RTA_IIF */ + nla_total_size(4) /* RTA_OIF */ + nla_total_size(4) /* RTA_PRIORITY */ + + nla_total_size(4) /*RTA_AGE*/ + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ + nla_total_size(sizeof(struct rta_cacheinfo)); } @@ -2119,10 +2127,11 @@ static inline size_t rt6_nlmsg_size(void static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, struct in6_addr *dst, struct in6_addr *src, int iif, int type, u32 pid, u32 seq, -int prefix, unsigned int flags) +int prefix, unsigned int flags, int dumpflg) { struct rtmsg *rtm; struct nlmsghdr *nlh; + struct timeval tv; long expires; u32 table; @@ -2186,6 +2195,13 @@ static int rt6_fill_node(struct sk_buff if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0) NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf); } + + if (dumpflg) { + do_gettimeofday(&tv); + NLA_PUT_U32(skb, RTA_AGE, timeval_to_sec(&tv) - rt->rt6i_age); + } else { + NLA_PUT_U32(skb, RTA_AGE, rt->rt6i_age); + } if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) goto nla_put_failure; @@ -2223,7 +2239,7 @@ int rt6_dump_route(struct rt6_info *rt, return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq, -prefix, NLM_F_MULTI); +prefix, NLM_F_MULTI, RT6_GET_ROUTE_INFO); } static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void 
*arg) @@ -2288,7 +2304,7 @@ static int inet6_rtm_getroute(struct sk_ err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
[PATCH 2/4 - rev2] Add new timeval_to_sec function
A new function for converting timeval to time_t is added in time.h. It's a common function used in different places. Signed-off-by: Varun Chandramohan <[EMAIL PROTECTED]> --- include/linux/time.h | 12 1 files changed, 12 insertions(+), 0 deletions(-) diff --git a/include/linux/time.h b/include/linux/time.h index 6a5f503..1faf65c 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -149,6 +149,18 @@ static inline s64 timeval_to_ns(const st } /** + * timeval_to_sec - Convert timeval to seconds + * @tv: pointer to the timeval variable to be converted + * + * Returns the seconds representation of timeval parameter. + * Note : Here we round up the value. We dont need accuracy. + */ +static inline time_t timeval_to_sec(const struct timeval *tv) +{ + return (tv->tv_sec + (tv->tv_usec ? 1 : 0)); +} + +/** * ns_to_timespec - Convert nanoseconds to timespec * @nsec: the nanoseconds value to be converted * -- 1.4.3.4 - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/4 - rev 2] Initialize and populate age field
The age field is filled with the current time at the time of creation of the route. When the routes are dumped then the age value stored in the route structure is subtracted from the current time value and the difference is the age expressed in secs. Signed-off-by: Varun Chandramohan <[EMAIL PROTECTED]> --- net/ipv4/fib_hash.c |3 +++ net/ipv4/fib_lookup.h|3 ++- net/ipv4/fib_semantics.c | 16 +--- net/ipv4/fib_trie.c |1 + 4 files changed, 19 insertions(+), 4 deletions(-) diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 9ad1d9f..228ab27 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -448,6 +448,7 @@ static int fn_hash_insert(struct fib_tab fa->fa_info = fi; fa->fa_type = cfg->fc_type; fa->fa_scope = cfg->fc_scope; + fa->fa_age = 0; state = fa->fa_state; fa->fa_state &= ~FA_S_ACCESSED; fib_hash_genid++; @@ -507,6 +508,7 @@ static int fn_hash_insert(struct fib_tab new_fa->fa_type = cfg->fc_type; new_fa->fa_scope = cfg->fc_scope; new_fa->fa_state = 0; + new_fa->fa_age = 0; /* * Insert new entry to the list. 
@@ -697,6 +699,7 @@ fn_hash_dump_bucket(struct sk_buff *skb, f->fn_key, fz->fz_order, fa->fa_tos, + &fa->fa_age, fa->fa_info, NLM_F_MULTI) < 0) { cb->args[4] = i; diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index eef9eec..c9145b5 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -13,6 +13,7 @@ struct fib_alias { u8 fa_type; u8 fa_scope; u8 fa_state; + time_t fa_age; }; #define FA_S_ACCESSED 0x01 @@ -27,7 +28,7 @@ extern struct fib_info *fib_create_info( extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, u8 type, u8 scope, __be32 dst, -int dst_len, u8 tos, struct fib_info *fi, +int dst_len, u8 tos, time_t *age, struct fib_info *fi, unsigned int); extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, struct nl_info *info, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c434119..1822d92 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -278,7 +278,8 @@ static inline size_t fib_nlmsg_size(stru + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(4) /* RTA_DST */ + nla_total_size(4) /* RTA_PRIORITY */ -+ nla_total_size(4); /* RTA_PREFSRC */ ++ nla_total_size(4) /* RTA_PREFSRC */ ++ nla_total_size(4); /*RTA_AGE*/ /* space for nested metrics */ payload += nla_total_size((RTAX_MAX * nla_total_size(4))); @@ -313,7 +314,7 @@ void rtmsg_fib(int event, __be32 key, st err = fib_dump_info(skb, info->pid, seq, event, tb_id, fa->fa_type, fa->fa_scope, key, dst_len, - fa->fa_tos, fa->fa_info, nlm_flags); + fa->fa_tos, &fa->fa_age, fa->fa_info, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -940,11 +941,12 @@ __be32 __fib_res_prefsrc(struct fib_resu } int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, - u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos, + u32 tb_id, u8 type, u8 
scope, __be32 dst, int dst_len, u8 tos, time_t *age, struct fib_info *fi, unsigned int flags) { struct nlmsghdr *nlh; struct rtmsg *rtm; + struct timeval tv; nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags); if (nlh == NULL) @@ -985,6 +987,14 @@ int fib_dump_info(struct sk_buff *skb, u NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); #endif } + + do_gettimeofday(&tv); + if (!*age) { + *age = timeval_to_sec(&tv); + NLA_PUT_U32(skb, RTA_AGE, *age); + } else { + NLA_PUT_U32(skb, RTA_AGE, timeval_to_sec(&tv) - *age); + } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fi->fib_nhs > 1) { struct rtnexthop *rtnh; diff --git a/net/ipv4/fib_trie.c b/net/ipv4
[PATCH 1/4 - rev2] New attribute RTA_AGE
A new attribute RTA_AGE is added for the age value to be exported to userlevel using netlink Signed-off-by: Varun Chandramohan <[EMAIL PROTECTED]> --- include/linux/rtnetlink.h |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index c91476c..68046a4 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -263,6 +263,7 @@ enum rtattr_type_t RTA_SESSION, RTA_MP_ALGO, /* no longer used */ RTA_TABLE, + RTA_AGE, __RTA_MAX }; -- 1.4.3.4 - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/4 - rev 2] Age Entry For IPv4 & IPv6 Route Table
Hi Dave, This is rev2 of the patch set I sent out some time ago. I have made it against net-2.6.24 tree. Can you please review and let me know? There have been a few minor changes since rev1. Original Message: According to the RFC 4292 (IP Forwarding Table MIB) there is a need for an age entry for all the routes in the routing table. The entry in the RFC is inetCidrRouteAge and oid is inetCidrRouteAge.1.10. Many SNMP applications require this age entry. So I am adding the age field in the routing table for ipv4 and ipv6 and providing the interface for this value via netlink. Signed-off-by: Varun Chandramohan <[EMAIL PROTECTED]> --- - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html