date:20060202

[PATCH 6/6 revised] bnx2: Update version

2006-02-02 Thread Michael Chan

Update version to 1.4.37.

Add missing flush_scheduled_work() in bnx2_suspend as noted by Jeff
Garzik.

Signed-off-by: Michael Chan <[EMAIL PROTECTED]>


diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index ee9f58f..630281b 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -14,8 +14,8 @@
 
 #define DRV_MODULE_NAME"bnx2"
 #define PFX DRV_MODULE_NAME": "
-#define DRV_MODULE_VERSION "1.4.31"
-#define DRV_MODULE_RELDATE "January 19, 2006"
+#define DRV_MODULE_VERSION "1.4.37"
+#define DRV_MODULE_RELDATE "February 1, 2006"
 
 #define RUN_AT(x) (jiffies + (x))
 
@@ -5765,6 +5765,7 @@ bnx2_suspend(struct pci_dev *pdev, pm_me
if (!netif_running(dev))
return 0;
 
+   flush_scheduled_work();
bnx2_netif_stop(bp);
netif_device_detach(dev);
del_timer_sync(&bp->timer);


-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 5/6 revised] bnx2: Support larger rx ring sizes (part 2)

2006-02-02 Thread Michael Chan

Support bigger rx ring sizes (up to 1020) in the rx fast path.

Signed-off-by: Michael Chan <[EMAIL PROTECTED]>


diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index fbe7a14..3fbf414 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -1687,8 +1687,8 @@ bnx2_reuse_rx_skb(struct bnx2 *bp, struc
pci_unmap_addr_set(prod_rx_buf, mapping,
pci_unmap_addr(cons_rx_buf, mapping));
 
-   cons_bd = &bp->rx_desc_ring[cons];
-   prod_bd = &bp->rx_desc_ring[prod];
+   cons_bd = &bp->rx_desc_ring[RX_RING(cons)][RX_IDX(cons)];
+   prod_bd = &bp->rx_desc_ring[RX_RING(prod)][RX_IDX(prod)];
prod_bd->rx_bd_haddr_hi = cons_bd->rx_bd_haddr_hi;
prod_bd->rx_bd_haddr_lo = cons_bd->rx_bd_haddr_lo;
 }


-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 4/6 revised] bnx2: Support larger rx ring sizes (part 1)

2006-02-02 Thread Michael Chan

Increase maximum receive ring size from 255 to 1020 by supporting
up to 4 linked pages of receive descriptors. To accommodate the
higher memory usage, each physical descriptor page is allocated
separately and the software ring that keeps track of the SKBs and the
DMA addresses is allocated using vmalloc.

Some of the receive-related fields in the bp structure are re-
organized a bit for better locality of reference.

The max. was reduced to 1020 from 4080 after discussion with David
Miller.

This patch contains ring init code changes only. The next patch
contains rx data path code changes.

Signed-off-by: Michael Chan <[EMAIL PROTECTED]>


diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 27ad2f7..547c491 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -360,6 +360,8 @@ bnx2_netif_start(struct bnx2 *bp)
 static void
 bnx2_free_mem(struct bnx2 *bp)
 {
+   int i;
+
if (bp->stats_blk) {
pci_free_consistent(bp->pdev, sizeof(struct statistics_block),
bp->stats_blk, bp->stats_blk_mapping);
@@ -378,19 +380,23 @@ bnx2_free_mem(struct bnx2 *bp)
}
kfree(bp->tx_buf_ring);
bp->tx_buf_ring = NULL;
-   if (bp->rx_desc_ring) {
-   pci_free_consistent(bp->pdev,
-   sizeof(struct rx_bd) * RX_DESC_CNT,
-   bp->rx_desc_ring, bp->rx_desc_mapping);
-   bp->rx_desc_ring = NULL;
+   for (i = 0; i < bp->rx_max_ring; i++) {
+   if (bp->rx_desc_ring[i])
+   pci_free_consistent(bp->pdev,
+   sizeof(struct rx_bd) * RX_DESC_CNT,
+   bp->rx_desc_ring[i],
+   bp->rx_desc_mapping[i]);
+   bp->rx_desc_ring[i] = NULL;
}
-   kfree(bp->rx_buf_ring);
+   vfree(bp->rx_buf_ring);
bp->rx_buf_ring = NULL;
 }
 
 static int
 bnx2_alloc_mem(struct bnx2 *bp)
 {
+   int i;
+
bp->tx_buf_ring = kmalloc(sizeof(struct sw_bd) * TX_DESC_CNT,
 GFP_KERNEL);
if (bp->tx_buf_ring == NULL)
@@ -404,18 +410,23 @@ bnx2_alloc_mem(struct bnx2 *bp)
if (bp->tx_desc_ring == NULL)
goto alloc_mem_err;
 
-   bp->rx_buf_ring = kmalloc(sizeof(struct sw_bd) * RX_DESC_CNT,
-GFP_KERNEL);
+   bp->rx_buf_ring = vmalloc(sizeof(struct sw_bd) * RX_DESC_CNT *
+ bp->rx_max_ring);
if (bp->rx_buf_ring == NULL)
goto alloc_mem_err;
 
-   memset(bp->rx_buf_ring, 0, sizeof(struct sw_bd) * RX_DESC_CNT);
-   bp->rx_desc_ring = pci_alloc_consistent(bp->pdev,
-   sizeof(struct rx_bd) *
-   RX_DESC_CNT,
-   &bp->rx_desc_mapping);
-   if (bp->rx_desc_ring == NULL)
-   goto alloc_mem_err;
+   memset(bp->rx_buf_ring, 0, sizeof(struct sw_bd) * RX_DESC_CNT *
+  bp->rx_max_ring);
+
+   for (i = 0; i < bp->rx_max_ring; i++) {
+   bp->rx_desc_ring[i] =
+   pci_alloc_consistent(bp->pdev,
+sizeof(struct rx_bd) * RX_DESC_CNT,
+&bp->rx_desc_mapping[i]);
+   if (bp->rx_desc_ring[i] == NULL)
+   goto alloc_mem_err;
+
+   }
 
bp->status_blk = pci_alloc_consistent(bp->pdev,
  sizeof(struct status_block),
@@ -1520,7 +1531,7 @@ bnx2_alloc_rx_skb(struct bnx2 *bp, u16 i
struct sk_buff *skb;
struct sw_bd *rx_buf = &bp->rx_buf_ring[index];
dma_addr_t mapping;
-   struct rx_bd *rxbd = &bp->rx_desc_ring[index];
+   struct rx_bd *rxbd = &bp->rx_desc_ring[RX_RING(index)][RX_IDX(index)];
unsigned long align;
 
skb = dev_alloc_skb(bp->rx_buf_size);
@@ -3349,24 +3360,32 @@ bnx2_init_rx_ring(struct bnx2 *bp)
bp->hw_rx_cons = 0;
bp->rx_prod_bseq = 0;

-   rxbd = &bp->rx_desc_ring[0];
-   for (i = 0; i < MAX_RX_DESC_CNT; i++, rxbd++) {
-   rxbd->rx_bd_len = bp->rx_buf_use_size;
-   rxbd->rx_bd_flags = RX_BD_FLAGS_START | RX_BD_FLAGS_END;
-   }
+   for (i = 0; i < bp->rx_max_ring; i++) {
+   int j;
 
-   rxbd->rx_bd_haddr_hi = (u64) bp->rx_desc_mapping >> 32;
-   rxbd->rx_bd_haddr_lo = (u64) bp->rx_desc_mapping & 0xffffffff;
+   rxbd = &bp->rx_desc_ring[i][0];
+   for (j = 0; j < MAX_RX_DESC_CNT; j++, rxbd++) {
+   rxbd->rx_bd_len = bp->rx_buf_use_size;
+   rxbd->rx_bd_flags = RX_BD_FLAGS_START | RX_BD_FLAGS_END;
+   }
+   if (i == (bp->rx_max_ring

[PATCH 3/6 revised] bnx2: Fix bug when rx ring is full

2006-02-02 Thread Michael Chan

Fix the rx code path that does not handle the full rx ring correctly.

When the rx ring is set to the max. size (i.e. 255), the consumer and
producer indices will be the same when completing an rx packet. Fix
the rx code to handle this condition properly.

Signed-off-by: Michael Chan <[EMAIL PROTECTED]>


diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 7560893..27ad2f7 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -1656,23 +1656,30 @@ static inline void
 bnx2_reuse_rx_skb(struct bnx2 *bp, struct sk_buff *skb,
u16 cons, u16 prod)
 {
-   struct sw_bd *cons_rx_buf = &bp->rx_buf_ring[cons];
-   struct sw_bd *prod_rx_buf = &bp->rx_buf_ring[prod];
-   struct rx_bd *cons_bd = &bp->rx_desc_ring[cons];
-   struct rx_bd *prod_bd = &bp->rx_desc_ring[prod];
+   struct sw_bd *cons_rx_buf, *prod_rx_buf;
+   struct rx_bd *cons_bd, *prod_bd;
+
+   cons_rx_buf = &bp->rx_buf_ring[cons];
+   prod_rx_buf = &bp->rx_buf_ring[prod];
 
pci_dma_sync_single_for_device(bp->pdev,
pci_unmap_addr(cons_rx_buf, mapping),
bp->rx_offset + RX_COPY_THRESH, PCI_DMA_FROMDEVICE);
 
-   prod_rx_buf->skb = cons_rx_buf->skb;
-   pci_unmap_addr_set(prod_rx_buf, mapping,
-   pci_unmap_addr(cons_rx_buf, mapping));
+   bp->rx_prod_bseq += bp->rx_buf_use_size;
 
-   memcpy(prod_bd, cons_bd, 8);
+   prod_rx_buf->skb = skb;
 
-   bp->rx_prod_bseq += bp->rx_buf_use_size;
+   if (cons == prod)
+   return;
 
+   pci_unmap_addr_set(prod_rx_buf, mapping,
+   pci_unmap_addr(cons_rx_buf, mapping));
+
+   cons_bd = &bp->rx_desc_ring[cons];
+   prod_bd = &bp->rx_desc_ring[prod];
+   prod_bd->rx_bd_haddr_hi = cons_bd->rx_bd_haddr_hi;
+   prod_bd->rx_bd_haddr_lo = cons_bd->rx_bd_haddr_lo;
 }
 
 static int
@@ -1699,14 +1706,19 @@ bnx2_rx_int(struct bnx2 *bp, int budget)
u32 status;
struct sw_bd *rx_buf;
struct sk_buff *skb;
+   dma_addr_t dma_addr;
 
sw_ring_cons = RX_RING_IDX(sw_cons);
sw_ring_prod = RX_RING_IDX(sw_prod);
 
rx_buf = &bp->rx_buf_ring[sw_ring_cons];
skb = rx_buf->skb;
-   pci_dma_sync_single_for_cpu(bp->pdev,
-   pci_unmap_addr(rx_buf, mapping),
+
+   rx_buf->skb = NULL;
+
+   dma_addr = pci_unmap_addr(rx_buf, mapping);
+
+   pci_dma_sync_single_for_cpu(bp->pdev, dma_addr,
bp->rx_offset + RX_COPY_THRESH, PCI_DMA_FROMDEVICE);
 
rx_hdr = (struct l2_fhdr *) skb->data;
@@ -1747,8 +1759,7 @@ bnx2_rx_int(struct bnx2 *bp, int budget)
skb = new_skb;
}
else if (bnx2_alloc_rx_skb(bp, sw_ring_prod) == 0) {
-   pci_unmap_single(bp->pdev,
-   pci_unmap_addr(rx_buf, mapping),
+   pci_unmap_single(bp->pdev, dma_addr,
bp->rx_buf_use_size, PCI_DMA_FROMDEVICE);
 
skb_reserve(skb, bp->rx_offset);
@@ -1794,8 +1805,6 @@ reuse_rx:
rx_pkt++;
 
 next_rx:
-   rx_buf->skb = NULL;
-
sw_cons = NEXT_RX_BD(sw_cons);
sw_prod = NEXT_RX_BD(sw_prod);
 
@@ -3360,7 +3369,7 @@ bnx2_init_rx_ring(struct bnx2 *bp)
val = (u64) bp->rx_desc_mapping & 0xffffffff;
CTX_WR(bp, GET_CID_ADDR(RX_CID), BNX2_L2CTX_NX_BDHADDR_LO, val);
 
-   for ( ;ring_prod < bp->rx_ring_size; ) {
+   for (i = 0; i < bp->rx_ring_size; i++) {
if (bnx2_alloc_rx_skb(bp, ring_prod) < 0) {
break;
}


-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 3/4] bnx2: Support larger rx ring sizes

2006-02-02 Thread Michael Chan

On Thu, 2006-02-02 at 23:29 -0800, David S. Miller wrote:

> While I have your attention, I think we should disable TSO
> by default on capable chips in the tg3 driver.  What's your
> take on that?
> 
You mean enable? Yes, I think we should do that (enable TSO).

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 3/4] bnx2: Support larger rx ring sizes

2006-02-02 Thread David S. Miller

From: "Michael Chan" <[EMAIL PROTECTED]>
Date: Thu, 02 Feb 2006 21:46:39 -0800

> I'll be re-submitting patches 3 and 4. Patch #3 will be capped at 1020
> descriptors and broken up into 3 smaller patches. Please review.

I will, thanks Michael.

While I have your attention, I think we should disable TSO
by default on capable chips in the tg3 driver.  What's your
take on that?

Thanks.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 3/4] bnx2: Support larger rx ring sizes

2006-02-02 Thread Michael Chan

On Tue, 2006-01-31 at 22:11 -0800, David S. Miller wrote:
> From: "Michael Chan" <[EMAIL PROTECTED]>
> Date: Tue, 31 Jan 2006 21:58:23 -0800
> 
> > How about 1024 (actually 1020)? tg3 has a max. of 511 standard plus
> > 255 jumbo rx descriptors. bnx2 runs at a higher maximum line speed
> > of 2.5Gbps so I think it's reasonable to allow a little more for bnx2.
> 
> Ok.

I'll be re-submitting patches 3 and 4. Patch #3 will be capped at 1020
descriptors and broken up into 3 smaller patches. Please review.

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: badness in dst_release

2006-02-02 Thread David S. Miller

From: Stephen Hemminger <[EMAIL PROTECTED]>
Date: Thu, 2 Feb 2006 19:35:25 -0800

> I triggered this easily today. Will bisect tomorrow.

Thanks a lot Stephen.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] snap: needs hardware checksum fix

2006-02-02 Thread David S. Miller

From: Herbert Xu <[EMAIL PROTECTED]>
Date: Fri, 03 Feb 2006 12:26:32 +1100

> David S. Miller <[EMAIL PROTECTED]> wrote:
> > 
> > This patch made me notice that the length is sort of implicit
> > or can be calculated given "start" and the current skb->data
> > value.
> > 
> > Someone might want to look into making that simplification
> > at some point.
> 
> Or we could simply merge skb_pull and skb_postpull_rcsum into one
> function that does both.

True.  There aren't that many skb_postpull_rcsum() call sites
at the moment so the changes should be quite easy.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] add CONFIG_NETDEBUG to suppress bad packet messages

2006-02-02 Thread David S. Miller

From: Dave Jones <[EMAIL PROTECTED]>
Date: Thu, 2 Feb 2006 23:23:03 -0500

> On Thu, Feb 02, 2006 at 04:35:01PM -0800, Stephen Hemminger wrote:
>  > If you are on a hostile network, or are running protocol tests, you can
>  > easily get the log swamped by messages about bad UDP and ICMP packets.
>  > This turns those messages off unless a config option is enabled.
>  > 
>  > Signed-off-by: Stephen Hemminger <[EMAIL PROTECTED]>
> 
> Heh, I was toying with something similar when I started playing with that
> test tool, but couldn't decide on this approach or toying with the rate 
> limiting.
> Definitely worth an option though imo.

Ok, I guess this isn't such a bad idea after all.

I'll apply Stephen's patch.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] add CONFIG_NETDEBUG to suppress bad packet messages

2006-02-02 Thread Dave Jones

On Thu, Feb 02, 2006 at 04:47:03PM -0800, David S. Miller wrote:
 > From: Stephen Hemminger <[EMAIL PROTECTED]>
 > Date: Thu, 2 Feb 2006 16:35:01 -0800
 > 
 > > If you are on a hostile network, or are running protocol tests, you can
 > > easily get the log swamped by messages about bad UDP and ICMP packets.
 > > This turns those messages off unless a config option is enabled.
 > > 
 > > Signed-off-by: Stephen Hemminger <[EMAIL PROTECTED]>
 > 
 > NETDEBUG should print out something by default.
 > 
 > We should fix the NETDEBUG() users.  Dave Jones recently fixed
 > a case in IGMP, for example.
 > 
 > It should print out messages for cases that are impossible and really
 > need investigation, and not for cases that can be triggered by random
 > packets being sent from a remote system.

There's a number of cases that are still way too easy to trigger.
Looking at the box currently taking abuse..

UDP: short packet: From 192.168.79.115:46186 21196/1168 to 192.168.76.106:23453
UDP: short packet: From 192.168.79.115:38661 53808/1148 to 192.168.76.106:61471
UDP: bad checksum. From 192.168.79.115:28041 to 192.168.76.106:49667 ulen 245
UDP: bad checksum. From 192.168.79.115:45103 to 192.168.76.106:3621 ulen 145
192.168.79.115 sent an invalid ICMP type 11, code 171 error to a broadcast: 
242.55.217.243 on eth0
svc: bad direction 1161958909, dropping request
ICMP: 160.23.75.159: Source Route Failed.
ICMP: 17.71.42.69: Source Route Failed.
ICMP: 136.227.103.241: Source Route Failed.

and a few thousand other similar entries..

Some (all?) of these are already subject to net_ratelimit(), but on a fast
enough network it's more or less useless right now.

Dave

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] add CONFIG_NETDEBUG to suppress bad packet messages

2006-02-02 Thread Dave Jones

On Thu, Feb 02, 2006 at 04:35:01PM -0800, Stephen Hemminger wrote:
 > If you are on a hostile network, or are running protocol tests, you can
 > easily get the log swamped by messages about bad UDP and ICMP packets.
 > This turns those messages off unless a config option is enabled.
 > 
 > Signed-off-by: Stephen Hemminger <[EMAIL PROTECTED]>

Heh, I was toying with something similar when I started playing with that
test tool, but couldn't decide on this approach or toying with the rate 
limiting.
Definitely worth an option though imo.

Dave

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] ipv6: addrconf_ifdown fix dst refcounting.

2006-02-02 Thread YOSHIFUJI Hideaki / 吉藤英明

In article <[EMAIL PROTECTED]> (at Thu, 02 Feb 2006 20:09:44 -0800 (PST)), 
"David S. Miller" <[EMAIL PROTECTED]> says:

> From: YOSHIFUJI Hideaki <[EMAIL PROTECTED]>
> Date: Fri, 03 Feb 2006 11:32:13 +0900 (JST)
> 
> > BTW, David, would you mind sending your addrconf patches to me?
> > I'd like to look into it deeply.
> > I used to have clone, but I happened to remove that tree...
> 
> I intended to review my patches in small pieces, one by one,
> and then send them to you for review.
> 
> Maybe I can do this over the weekend, is that OK with you?

Sure, and that is what I'd like to do.

--yoshfuji
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [e2e] FW: Performance evaluation of high speed TCPs

2006-02-02 Thread Stephen Hemminger

On Fri, 3 Feb 2006 15:51:02 +1300
Ian McDonald <[EMAIL PROTECTED]> wrote:

> >  Seriously, where's the value in comparing buggy implementations - isn't
> > that just a waste of all our time ?  If we are genuine about wanting to
> > understand tcp performance then I think we just have to take the hit from
> > issues such as this that are outside all of our control.
> >
> A real part of the problem here is that Linux doesn't have a full
> TCP testing suite and doesn't have build checking to check for
> regressions in TCP variants. As I understand the only thing tested in
> nightly builds is throughput for the default TCP.

I am starting to set up a regression test, but it is still in the planning
stages. I hope to merge existing tests with existing automation tools.
The analysis might be more difficult than the tests though.

> Stephen Hemminger has done some work on TCP Probes but this is where I
> think real progress could be made in improving Linux TCP. I may get
> around to doing this myself at some point in my research but would
> welcome other people doing it also!
> 
> Ian
> --
> Ian McDonald
> http://wand.net.nz/~iam4
> WAND Network Research Group
> University of Waikato
> New Zealand
> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to [EMAIL PROTECTED]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] ipv6: addrconf_ifdown fix dst refcounting.

2006-02-02 Thread David S. Miller

From: YOSHIFUJI Hideaki <[EMAIL PROTECTED]>
Date: Fri, 03 Feb 2006 11:32:13 +0900 (JST)

> BTW, David, would you mind sending your addrconf patches to me?
> I'd like to look into it deeply.
> I used to have clone, but I happened to remove that tree...

I intended to review my patches in small pieces, one by one,
and then send them to you for review.

Maybe I can do this over the weekend, is that OK with you?
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: badness in dst_release

2006-02-02 Thread Stephen Hemminger

On Thu, 2 Feb 2006 20:37:51 -0500
Dave Jones <[EMAIL PROTECTED]> wrote:

> On Thu, Feb 02, 2006 at 04:49:29PM -0800, David S. Miller wrote:
>  > From: Dave Jones <[EMAIL PROTECTED]>
>  > Date: Thu, 2 Feb 2006 14:30:28 -0500
>  > 
>  > > Here's a second flavour.
>  > 
>  > Can you git bisect to figure out when this problem started
>  > to occur?
> 
> I'll give it a try sometime soon, though I'm up to my eyeballs chasing
> something else right now, so it may take me a while.
> 

I triggered this easily today. Will bisect tomorrow.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [patch 3/4] net: Percpufy frequently used variables -- proto.sockets_allocated

2006-02-02 Thread Andrew Morton

Ravikiran G Thirumalai <[EMAIL PROTECTED]> wrote:
>
> On Fri, Jan 27, 2006 at 03:01:06PM -0800, Andrew Morton wrote:
> > Ravikiran G Thirumalai <[EMAIL PROTECTED]> wrote:
> > 
> > 
> > > > 
> > > > If the benchmarks say that we need to.  If we cannot observe any 
> > > > problems
> > > > in testing of existing code and if we can't demonstrate any benefit from
> > > > the patched code then one option is to go off and do something else ;)
> > > 
> > > We first tried plain per-CPU counters for memory_allocated, found that 
> > > reads
> > > on memory_allocated was causing cacheline transfers, and then
> > > switched over to batching.  So batching reads is useful.  To avoid
> > > inaccuracy, we can maybe change percpu_counter_init to:
> > > 
> > > void percpu_counter_init(struct percpu_counter *fbc, int maxdev)
> > > 
> > > the percpu batching limit would then be maxdev/num_possible_cpus.  One 
> > > would
> > > use batching counters only when both reads and writes are frequent.  With
> > > the above scheme, we would go fetch cachelines from other cpus for read
> > > often only on large cpu counts, which is not any worse than the global
> > > counter alternative, but it would still be beneficial on smaller machines,
> > > without sacrificing a pre-set deviation.  
> > > 
> > > Comments?
> > 
> > Sounds sane.
> >
> 
> Here's an implementation which delegates tuning of batching to the user.  We
> don't really need local_t at all as percpu_counter_mod is not safe against
> interrupts and softirqs  as it is.  If we have a counter which could be
> modified in process context and irq/bh context, we just have to use a
> wrapper like percpu_counter_mod_bh which will just disable and enable bottom
> halves.  Reads on the counters are safe as they are atomic_reads, and the
> cpu local variables are always accessed by that cpu only.
> 
> (PS: the maxerr for ext2/ext3 is just guesstimate)

Well that's the problem.  We need to choose production-quality values for
use in there.

> Comments?

Using num_possible_cpus() in that header file is just asking for build
errors.  Probably best to uninline the function rather than adding the
needed include of cpumask.h.

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [patch 3/4] net: Percpufy frequently used variables -- proto.sockets_allocated

2006-02-02 Thread Ravikiran G Thirumalai

On Fri, Jan 27, 2006 at 03:01:06PM -0800, Andrew Morton wrote:
> Ravikiran G Thirumalai <[EMAIL PROTECTED]> wrote:
> 
> 
> > > 
> > > If the benchmarks say that we need to.  If we cannot observe any problems
> > > in testing of existing code and if we can't demonstrate any benefit from
> > > the patched code then one option is to go off and do something else ;)
> > 
> > We first tried plain per-CPU counters for memory_allocated, found that reads
> > on memory_allocated was causing cacheline transfers, and then
> > switched over to batching.  So batching reads is useful.  To avoid
> > inaccuracy, we can maybe change percpu_counter_init to:
> > 
> > void percpu_counter_init(struct percpu_counter *fbc, int maxdev)
> > 
> > the percpu batching limit would then be maxdev/num_possible_cpus.  One would
> > use batching counters only when both reads and writes are frequent.  With
> > the above scheme, we would go fetch cachelines from other cpus for read
> > often only on large cpu counts, which is not any worse than the global
> > counter alternative, but it would still be beneficial on smaller machines,
> > without sacrificing a pre-set deviation.  
> > 
> > Comments?
> 
> Sounds sane.
>

Here's an implementation which delegates tuning of batching to the user.  We
don't really need local_t at all as percpu_counter_mod is not safe against
interrupts and softirqs  as it is.  If we have a counter which could be
modified in process context and irq/bh context, we just have to use a
wrapper like percpu_counter_mod_bh which will just disable and enable bottom
halves.  Reads on the counters are safe as they are atomic_reads, and the
cpu local variables are always accessed by that cpu only.

(PS: the maxerr for ext2/ext3 is just guesstimate)

Comments?

Index: linux-2.6.16-rc1mm4/include/linux/percpu_counter.h
===
--- linux-2.6.16-rc1mm4.orig/include/linux/percpu_counter.h 2006-02-02 
11:18:54.0 -0800
+++ linux-2.6.16-rc1mm4/include/linux/percpu_counter.h  2006-02-02 
18:29:46.0 -0800
@@ -16,24 +16,32 @@
 
 struct percpu_counter {
atomic_long_t count;
+   int percpu_batch;
long *counters;
 };
 
-#if NR_CPUS >= 16
-#define FBC_BATCH  (NR_CPUS*2)
-#else
-#define FBC_BATCH  (NR_CPUS*4)
-#endif
 
-static inline void percpu_counter_init(struct percpu_counter *fbc)
+/* 
+ * Choose maxerr carefully. maxerr/num_possible_cpus indicates per-cpu 
batching 
+ * Set maximum tolerance for better performance on large systems.
+ */
+static inline void percpu_counter_init(struct percpu_counter *fbc, 
+   unsigned int maxerr)
 {
atomic_long_set(&fbc->count, 0);
-   fbc->counters = alloc_percpu(long);
+   fbc->percpu_batch = maxerr/num_possible_cpus();
+   if (fbc->percpu_batch) {
+   fbc->counters = alloc_percpu(long);
+   if (!fbc->counters)
+   fbc->percpu_batch = 0;
+   }
+   
 }
 
 static inline void percpu_counter_destroy(struct percpu_counter *fbc)
 {
-   free_percpu(fbc->counters);
+   if (fbc->percpu_batch)
+   free_percpu(fbc->counters);
 }
 
 void percpu_counter_mod(struct percpu_counter *fbc, long amount);
@@ -63,7 +71,8 @@ struct percpu_counter {
long count;
 };
 
-static inline void percpu_counter_init(struct percpu_counter *fbc)
+static inline void percpu_counter_init(struct percpu_counter *fbc, 
+   unsigned int maxerr)
 {
fbc->count = 0;
 }
Index: linux-2.6.16-rc1mm4/mm/swap.c
===
--- linux-2.6.16-rc1mm4.orig/mm/swap.c  2006-01-29 20:20:20.0 -0800
+++ linux-2.6.16-rc1mm4/mm/swap.c   2006-02-02 18:36:21.0 -0800
@@ -470,13 +470,20 @@ static int cpu_swap_callback(struct noti
 #ifdef CONFIG_SMP
 void percpu_counter_mod(struct percpu_counter *fbc, long amount)
 {
-   long count;
long *pcount;
-   int cpu = get_cpu();
+   long count;
+   int cpu;
 
+   /* Slow mode */
+   if (unlikely(!fbc->percpu_batch)) {
+   atomic_long_add(amount, &fbc->count);
+   return;
+   }
+   
+   cpu = get_cpu();
pcount = per_cpu_ptr(fbc->counters, cpu);
count = *pcount + amount;
-   if (count >= FBC_BATCH || count <= -FBC_BATCH) {
+   if (count >= fbc->percpu_batch || count <= -fbc->percpu_batch) {
atomic_long_add(count, &fbc->count);
count = 0;
}
Index: linux-2.6.16-rc1mm4/fs/ext2/super.c
===
--- linux-2.6.16-rc1mm4.orig/fs/ext2/super.c2006-02-02 18:30:28.0 
-0800
+++ linux-2.6.16-rc1mm4/fs/ext2/super.c 2006-02-02 18:36:39.0 -0800
@@ -610,6 +610,7 @@ static int ext2_fill_super(struct super_
int db_count;
int i, j;
__le32 features;
+   i

Re: [e2e] FW: Performance evaluation of high speed TCPs

2006-02-02 Thread Ian McDonald

>  Seriously, where's the value in comparing buggy implementations - isn't
> that just a waste of all our time ?  If we are genuine about wanting to
> understand tcp performance then I think we just have to take the hit from
> issues such as this that are outside all of our control.
>
A real part of the problem here is that Linux doesn't have a full
TCP testing suite and doesn't have build checking to check for
regressions in TCP variants. As I understand the only thing tested in
nightly builds is throughput for the default TCP.

Stephen Hemminger has done some work on TCP Probes but this is where I
think real progress could be made in improving Linux TCP. I may get
around to doing this myself at some point in my research but would
welcome other people doing it also!

Ian
--
Ian McDonald
http://wand.net.nz/~iam4
WAND Network Research Group
University of Waikato
New Zealand
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

RE: [e2e] FW: Performance evaluation of high speed TCPs

2006-02-02 Thread rhee


Sure. Your comments about running the buggy implementation are well taken.
That is
why this type of reporting is helpful and we are committed to keep this
effort. Just that
it takes time to run the tests, and before we run a new set of tests, we
have to do some
batch of patches to reduce our effort level (but in this case of the HTCP
bug, rest assured
that we are running it now..it is just that there are a lot of other
things going on
that we have to catch a breath a little).

Then again, if we don't do the test and keep the report
up-to-date then it is difficult to find bugs as well...so these reportings
help us find
bugs and also improve TCP algorithms. (I hope our report did the same for
you).  Also
sometimes we are not motivated to find the bugs ourselves.

In fact, i contacted your student "Baruch" one month and half before  we
posted our
report -- it was CCed in the netdev mailing list as well and we gave him
login and
passwd on our result website (at that time we were just about to write the
report)
and we have not heard from your guys until just one week ago. At least we
did try to
make sure we are not running a buggy version.




>>Seriously, we can't run the tests for every fix and bug report.
>
> Perhaps best to view it as returning a favour.  You may recall that we
> re-ran all our own experimental tests last year (all data and code
> available online at www.hamilton.ie/net/eval/) on discovering a previously
> unreported bug introduced by the linux folks when implementing bic.
> Something similar has happened with importing htcp into linux.
>
> Seriously, where's the value in comparing buggy implementations - isn't
> that just a waste of all our time ?  If we are genuine about wanting to
> understand tcp performance then I think we just have to take the hit from
> issues such as this that are outside all of our control.
>
> Doug
>
> Hamilton Institute
> www.hamilton.ie
>


-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] ipv6: addrconf_ifdown fix dst refcounting.

2006-02-02 Thread YOSHIFUJI Hideaki / 吉藤英明

In article <[EMAIL PROTECTED]> (at Fri, 3 Feb 2006 12:48:49 +1100), Herbert Xu 
<[EMAIL PROTECTED]> says:

> On Fri, Feb 03, 2006 at 10:31:58AM +0900, YOSHIFUJI Hideaki / 吉藤英明 
> wrote:
> > 
> > We SHALL do autoconf when we "up" an ipv6-capable device.
> > It is the IPv6.
> 
> I don't think the word "SHALL" stops us from implementing it in
> user-space...

It is because my poor English...sigh...

Specification says an implementation do autoconf when we bringing up
an ipv6-capable device.  It is the nature and requirement of IPv6.

BTW, David, would you mind sending your addrconf patches to me?
I'd like to look into it deeply.
I used to have clone, but I happened to remove that tree...

Thanks.

--yoshfuji
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [e2e] FW: Performance evaluation of high speed TCPs

2006-02-02 Thread rhee

Doug,
Sorry that we are not THAT real time in updating the report :-)

Seriously, we can't run the tests for every fix and bug report. But
we are aware of your new patch posted last week on the e2e list and indeed
applied it to our testing platform for retesting. Now one test case is done
(thanks to
Sangtae who spent a few sleepless nights to set up and re-run the tests).
These tests take time to  rerun and they are still going on and when they
are done,
we will update the document.

At this point, i can confirm at least that HTCP performance looks a LOT
improved,
but we still found a few new issues even with the updated HTCP -- in the
same performance
areas that we pointed out in the document, such as utilization and
stability.  We are
looking to find out whether these issues are caused by side-effects of our
setup or
by the HTCP algorithm itself.
As soon as we get some more confirmation on our findings we will update
the report. Please bear with us on this and stay tuned.

I hope our report and testing help the community in studying and improving
TCP performance.

Regards,
Injong



>  Injong,
>
> Re your recent report, could you just confirm that the htcp results should
> be disregarded (I think updated results are on the web now though) as they
> reflect a bug in the linux htcp implementation rather than correct
> performance ?  Thanks.
>
> Doug
>
>


-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] ipv6: addrconf_ifdown fix dst refcounting.

2006-02-02 Thread Herbert Xu

On Fri, Feb 03, 2006 at 10:31:58AM +0900, YOSHIFUJI Hideaki / 吉藤英明 wrote:
> 
> We SHALL do autoconf when we "up" an ipv6-capable device.
> It is the IPv6.

I don't think the word "SHALL" stops us from implementing it in
user-space...

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Van Jacobson net channels

2006-02-02 Thread David S. Miller

From: Greg Banks <[EMAIL PROTECTED]>
Date: Fri, 03 Feb 2006 12:08:54 +1100

> So, given 2.6.16 on tg3 hardware, would your advice be to
> enable TSO by default?

Yes.

In fact I've been meaning to discuss with Michael Chan
enabling it in the driver by default.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: badness in dst_release

2006-02-02 Thread Dave Jones

On Thu, Feb 02, 2006 at 04:49:29PM -0800, David S. Miller wrote:
 > From: Dave Jones <[EMAIL PROTECTED]>
 > Date: Thu, 2 Feb 2006 14:30:28 -0500
 > 
 > > Here's a second flavour.
 > 
 > Can you git bisect to figure out when this problem started
 > to occur?

I'll give it a try sometime soon, though I'm up to my eyeballs chasing
something else right now, so it may take me a while.

Thanks,

Dave

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: fib_rules w. RCU lock [PATCH]

2006-02-02 Thread David S. Miller

From: Robert Olsson <[EMAIL PROTECTED]>
Date: Fri, 27 Jan 2006 14:20:39 +0100

>  The preempts are removed and an updated version of the patch is enclosed.

Applied to net-2.6.17, thanks Robert.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: tg3 bug noticed

2006-02-02 Thread David S. Miller

From: "Michael Chan" <[EMAIL PROTECTED]>
Date: Fri, 27 Jan 2006 09:32:09 -0800

> [TG3]: Flush tg3_reset_task()
> 
> Make sure tg3_reset_task() is flushed in the close and suspend paths
> as noted by Jeff Garzik.
> 
> In the close path, calling flush_scheduled_work() may cause deadlock
> if linkwatch_event() is on the workqueue. linkwatch_event() will try
> to get the rtnl_lock() which is already held by tg3_close(). So
> instead, we set a flag in tg3_reset_task() and tg3_close() polls
> the flag until it is cleared.
> 
> Signed-off-by: Michael Chan <[EMAIL PROTECTED]>

Applied, thanks Michael.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] ipv6: addrconf_ifdown fix dst refcounting.

2006-02-02 Thread YOSHIFUJI Hideaki / 吉藤英明

In article <[EMAIL PROTECTED]> (at Thu, 2 Feb 2006 23:42:25 +1100), Herbert Xu 
<[EMAIL PROTECTED]> says:

> On Thu, Feb 02, 2006 at 05:37:22AM -0700, Eric W. Biederman wrote:
> >
> > > Yes you are right.  The locking/refcounting in addrconf.c is such
> > > a mess.  I've asked a number of times before as to why most of
> > > this can't be done in user-space instead.  There is nothing performance
> > > critical here, and the system must be able to deal with a device with
> > > no IPv6 addresses anyway (think of the case when the device was up before
> > > ipv6.ko was loaded). 
> > 
> > A lot of the latter case is handled by the replay of netdevice events
> > when you register a netdevice notifier.
> 
> Yes.  What I meant is that it is normal to have a period of time during
> which a device has no IPv6 addresses attached.  Doing addrconf in the
> kernel means that we can guarantee that as soon as a device appears we
> slap on an IPv6 address.  My point is that we need to cope with devices
> without IPv6 addresses anyway.

We SHALL do autoconf when we "up" an ipv6-capable device.
It is the IPv6.

I agree that, in SOME cases, some people want to disable ipv6 on some of
their interfaces.

--yoshfuji
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] snap: needs hardware checksum fix

2006-02-02 Thread Herbert Xu

David S. Miller <[EMAIL PROTECTED]> wrote:
> 
> This patch made me notice that the length is sort of implicit
> or can be calculated given "start" and the current skb->data
> value.
> 
> Someone might want to look into making that simplification
> at some point.

Or we could simply merge skb_pull and skb_postpull_rcsum into one
function that does both.
 
Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [TEXTSEARCH] Fix broken good shift array calculation in Boyer-Moore

2006-02-02 Thread David S. Miller

From: Pablo Neira Ayuso <[EMAIL PROTECTED]>
Date: Mon, 30 Jan 2006 00:21:30 +0100

> This patch fixes a problem in the Boyer-Moore textsearch strategy.
> Please apply.

Applied, thanks Pablo.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

RE: Van Jacobson net channels

2006-02-02 Thread Greg Banks

On Fri, 2006-02-03 at 01:41, Leonid Grossman wrote:
>  
> As I mentioned earlier, it would be cool to get these moderation
> thresholds from NAPI, since it can make a better guess about the overall
> system utilization than the driver can.

Agreed.

>  But even at the driver level,
> this works reasonably well.

Yep.

> - the moderation scheme is implemented in the ASIC on per channel basis.
> So, if you have workloads with very distinct latency needs, you can just
> steer it to a separate channel and have an interrupt moderation that is
> different from other flows, for example keep an interrupt per packet
> always.

Wow, that's cool.  So I could configure a particular UDP port and a
particular TCP port to always have minimum latency, but keep all the
rest of the traffic on the same NIC at minimum interrupts?  Currently
we need to use separate NICs for the two traffic types (for a number
of reasons).

What's the interface, some kind of ethtool extension or /proc magic?

Greg.
-- 
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.


-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH]ip_options_fragment() has no effect on fragmentation

2006-02-02 Thread David S. Miller

From: Wei Yongjun <[EMAIL PROTECTED]>
Date: Wed, 01 Feb 2006 14:21:41 -0500

> Resend Patch:
> --- linux-2.6.15.1/net/ipv4/ip_options.c.orig 2006-01-27
> 09:14:33.463612696 +0900
> +++ linux-2.6.15.1/net/ipv4/ip_options.c  2006-01-27 09:12:21.857619848
> +0900
> @@ -207,7 +207,7 @@
>  
>  void ip_options_fragment(struct sk_buff * skb) 
>  {
> - unsigned char * optptr = skb->nh.raw;
> + unsigned char * optptr = skb->nh.raw + sizeof(struct iphdr);
>   struct ip_options * opt = &(IPCB(skb)->opt);
>   int  l = opt->optlen;
>   int  optlen;

Your patch is still corrupt, new lines were added by your email client
which splits up the patch headers.

I applied the patch by hand, but next time I won't put so much effort
into fixing up your work.  Please learn how to submit patches
properly.

Thank you.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Van Jacobson net channels

2006-02-02 Thread Greg Banks

On Thu, 2006-02-02 at 18:51, David S. Miller wrote:
> From: Greg Banks <[EMAIL PROTECTED]>
> Date: Thu, 02 Feb 2006 18:31:49 +1100
> 
> > On Thu, 2006-02-02 at 17:45, Andi Kleen wrote: 
> > > Normally TSO was supposed to fix that.
> > 
> > Sure, except that the last time SGI looked at TSO it was
> > extremely flaky.  I gather that's much better now, but TSO
> > still has a very small size limit imposed by the stack (not
> > the hardware).
> 
> Oh you have TSO disabled?  That explains a lot.
> 
> Yes, it's been a bumpy road, and there are still some
> e1000 lockups, but in general things should be smooth
> these days.

So, given 2.6.16 on tg3 hardware, would your advice be to
enable TSO by default?

Greg
-- 
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.


-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Van Jacobson net channels

2006-02-02 Thread Greg Banks

On Thu, 2006-02-02 at 18:48, Andi Kleen wrote:
> On Thursday 02 February 2006 08:31, Greg Banks wrote:
> 
> > [...]SGI's solution is to ship a script that uses ethtool
> > at boot to tune rx-usecs, rx-frames, rx-usecs-irq, rx-frames-irq
> > up from the defaults.
> 
> All user tuning like this is bad. The stack should all do that automatically.

That would be nice ;-)

> Would there be a drawback of making these
> settings default?

Yes, as mentioned elsewhere in this thread, applications which
are latency-sensitive will suffer.

For example, SGI sells a clustered filesystem where overall performance
is sensitive to the RTT of intra-cluster RPCs, to which receive latency
due to NIC interrupt mitigation is a significant factor.  The NICs which
run that traffic need to be using minimum mitigation, but the NICs which
run NFS traffic need to be using maximum mitigation.

> > This helps a lot, and we're very grateful ;-)   But a scheme
> > which used the interrupt mitigation hardware dynamically based on
> > load could reduce the irq rate and CPU usage even further without
> > compromising latency at low load.
> 
> If you know what's needed perhaps you could investigate it?

Maybe, in a couple of months when I've the time.

> You mean the 64k limit?

Exactly.  Currently the NFS server is limited to a 32K blocksize
so the largest RPC reply size is about 33K.  However the NFS client
in Linus' tree, and other OS's NFS servers, have much larger limits.
A value of about 1.001 MiB would probably be best.  The next SGI
Linux NFS server release will probably include a patch to increase
the maximum blocksize on TCP to 1MiB.

> > Cool.  Wouldn't it mean rewriting the nontrivial qdiscs?
> 
> It had some compat code that just split up the lists - same
> for netfilter. And only an implementation for pfifo_fast.

Ok by me, in practice our servers only ever use pfifo.

> (Don't ask for code - it's not really in an usable state)

Sure.  I'm looking forward to it.

Greg.
-- 
Greg Banks, R&D Software Engineer, SGI Australian Software Group.
I don't speak for SGI.

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] 1/1 net/core: use USEC_PER_SEC and line spacing

2006-02-02 Thread David S. Miller

From: Herbert Xu <[EMAIL PROTECTED]>
Date: Thu, 02 Feb 2006 09:06:34 +1100

> Ian McDonald <[EMAIL PROTECTED]> wrote:
> > On 2/1/06, Herbert Xu <[EMAIL PROTECTED]> wrote:
> >> Ian McDonald <[EMAIL PROTECTED]> wrote:
> >> >
> >> > --- a/net/core/sock.c
> >> > +++ b/net/core/sock.c
> >> > @@ -162,7 +162,8 @@ static int sock_set_timeout(long *timeo_
> >> >if (tv.tv_sec == 0 && tv.tv_usec == 0)
> >> >return 0;
> >> >if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
> >> > -   *timeo_p = tv.tv_sec*HZ + 
> >> > (tv.tv_usec+(100/HZ-1))/(100/HZ);
> >> > +   *timeo_p = tv.tv_sec*HZ +
> >> > +   
> >> > (tv.tv_usec+(USEC_PER_SEC/HZ-1))/(USEC_PER_SEC/HZ);
> >>
> >> Is there a macro for this calculation? If not could we add one?
> >>
> > I don't know if there is or not. There is similar code in DCCP. I
> 
> I just had a look and usecs_to_jiffies seems to do what you want.

Ian, can you redo your patch using usecs_to_jiffies?

Thanks.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] Remove suprious use of goto out: in icmp_reply

2006-02-02 Thread David S. Miller

From: Horms <[EMAIL PROTECTED]>
Date: Wed, 1 Feb 2006 13:33:48 +0900

> This seems to be an artifact of the following commit in February '02.
> http://www.kernel.org/git/?p=linux/kernel/git/tglx/history.git;a=history;h=e7e173af42dbf37b1d946f9ee00219cb3b2bea6a;f=net/ipv4/icmp.c
> 
> In a nutshell, goto out and return actually do the same thing,
> and both are called in this function. This patch removes out.
> 
> Signed-Off-By: Horms <[EMAIL PROTECTED]>

Applied, thanks Simon.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [lock validator] inet6_destroy_sock(): soft-safe -> soft-unsafe lock dependency

2006-02-02 Thread David S. Miller

From: Herbert Xu <[EMAIL PROTECTED]>
Date: Wed, 1 Feb 2006 21:42:14 +1100

> OK this is definitely broken.  We should never touch the dst lock in
> softirq context.  Since inet6_destroy_sock may be called from that
> context due to the asynchronous nature of sockets, we can't take the
> lock there.
> 
> In fact this sk_dst_reset is totally redundant since all IPv6 sockets
> use inet_sock_destruct as their socket destructor which always cleans
> up the dst anyway.  So the solution is to simply remove the call.
> 
> Signed-off-by: Herbert Xu <[EMAIL PROTECTED]>

Looks good, applied, thanks Herbert.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] [IPV4] Document icmp_errors_use_inbound_ifaddr sysctl

2006-02-02 Thread David S. Miller

From: Horms <[EMAIL PROTECTED]>
Date: Wed, 1 Feb 2006 17:32:18 +0900

> Taken largely from the commit of the patch that added this feature
> http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=1c2fb7f93cb20621772bf304f3dba0849942e5db
> 
> I'm not sure about the ordering of the options in sysctl.txt,
> so I took a wild guess about where it fits.
> 
> Signed-Off-By: Horms <[EMAIL PROTECTED]>

Applied, thanks Simon.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH][SCTP]: Fix 'fast retransmit' to send a TSN only once.

2006-02-02 Thread David S. Miller

From: Sridhar Samudrala <[EMAIL PROTECTED]>
Date: Wed, 01 Feb 2006 14:29:22 -0800

> SCTP used to "fast retransmit" a TSN every time we hit the number
> of missing reports for the TSN.  However the Implementers Guide
> specifies that we should only "fast retransmit" a given TSN once.
> Subsequent retransmits should be timeouts only. Also change the
> number of missing reports to 3 as per the latest IG(similar to TCP).
> 
> Signed-off-by: Vlad Yasevich <[EMAIL PROTECTED]>
> Signed-off-by: Sridhar Samudrala <[EMAIL PROTECTED]>

Applied, thanks Sridhar.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] ipv6: addrconf_ifdown fix dst refcounting.

2006-02-02 Thread David S. Miller

From: Herbert Xu <[EMAIL PROTECTED]>
Date: Thu, 2 Feb 2006 22:25:54 +1100

> [IPV6]: Don't hold extra ref count in ipv6_ifa_notify
> 
> Currently the logic in ipv6_ifa_notify is to hold an extra reference
> count for addrconf dst's that get added to the routing table.  Thus,
> when addrconf dst entries are taken out of the routing table, we need
> to drop that dst.  However, addrconf dst entries may be removed from
> the routing table by means other than __ipv6_ifa_notify.
> 
> So we're faced with the choice of either fixing up all places where
> addrconf dst entries are removed, or dropping the extra reference count
> altogether.
> 
> I chose the latter because the ifp itself always holds a dst reference
> count of 1 while it's alive.  This is dropped just before we kfree the
> ifp object.  Therefore we know that in __ipv6_ifa_notify we will always
> hold that count.
> 
> This bug was found by Eric W. Biederman.
> 
> Signed-off-by: Herbert Xu <[EMAIL PROTECTED]>

Applied, thanks Herbert.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] add CONFIG_NETDEBUG to suppress bad packet messages

2006-02-02 Thread Stephen Hemminger

On Thu, 02 Feb 2006 16:47:03 -0800 (PST)
"David S. Miller" <[EMAIL PROTECTED]> wrote:

> From: Stephen Hemminger <[EMAIL PROTECTED]>
> Date: Thu, 2 Feb 2006 16:35:01 -0800
> 
> > If you are on a hostile network, or are running protocol tests, you can
> > easily get the log swamped by messages about bad UDP and ICMP packets.
> > This turns those messages off unless a config option is enabled.
> > 
> > Signed-off-by: Stephen Hemminger <[EMAIL PROTECTED]>
> 
> NETDEBUG should print out something by default.
> 
> We should fix the NETDEBUG() users.  Dave Jones recently fixed
> a case in IGMP, for example.
> 
> It should print out messages for cases that are impossible and really
> need investigation, and not for cases that can be triggered by random
> packets being sent from a remote system.

UDP short packet and checksum errors.

-- 
Stephen Hemminger <[EMAIL PROTECTED]>
OSDL http://developer.osdl.org/~shemminger
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] snap: needs hardware checksum fix

2006-02-02 Thread David S. Miller

From: Stephen Hemminger <[EMAIL PROTECTED]>
Date: Thu, 2 Feb 2006 16:32:31 -0800

> The SNAP code pops off its 5-byte header, but doesn't adjust
> the checksum. This would cause problems when using a device that
> does IP over SNAP and hardware receive checksums.
> 
> Signed-off-by: Stephen Hemminger <[EMAIL PROTECTED]>

Applied, thanks Stephen.

This patch made me notice that the length is sort of implicit
or can be calculated given "start" and the current skb->data
value.

Someone might want to look into making that simplification
at some point.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: badness in dst_release

2006-02-02 Thread David S. Miller

From: Dave Jones <[EMAIL PROTECTED]>
Date: Thu, 2 Feb 2006 14:30:28 -0500

> Here's a second flavour.

Can you git bisect to figure out when this problem started
to occur?

Thanks a lot.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] add CONFIG_NETDEBUG to suppress bad packet messages

2006-02-02 Thread David S. Miller

From: Stephen Hemminger <[EMAIL PROTECTED]>
Date: Thu, 2 Feb 2006 16:35:01 -0800

> If you are on a hostile network, or are running protocol tests, you can
> easily get the log swamped by messages about bad UDP and ICMP packets.
> This turns those messages off unless a config option is enabled.
> 
> Signed-off-by: Stephen Hemminger <[EMAIL PROTECTED]>

NETDEBUG should print out something by default.

We should fix the NETDEBUG() users.  Dave Jones recently fixed
a case in IGMP, for example.

It should print out messages for cases that are impossible and really
need investigation, and not for cases that can be triggered by random
packets being sent from a remote system.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Bugme-new] [Bug 5999] New: Iptables modules fail to load on Alpha arch

2006-02-02 Thread David S. Miller

From: Andrew Morton <[EMAIL PROTECTED]>
Date: Thu, 2 Feb 2006 16:34:54 -0800

> Odd.  Not sure what the "Could not allocate 60 bytes percpu data" is
> due to, either.

As the user indicates this problem goes all the way back to 2.6.9, I
really think it's likely some Alpha specific problem wrt. percpu
allocations.

Some Alpha expert should look into this from that angle.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] add CONFIG_NETDEBUG to suppress bad packet messages

2006-02-02 Thread Stephen Hemminger

If you are on a hostile network, or are running protocol tests, you can
easily get the log swamped by messages about bad UDP and ICMP packets.
This turns those messages off unless a config option is enabled.

Signed-off-by: Stephen Hemminger <[EMAIL PROTECTED]>


--- br-2.6.orig/include/net/sock.h
+++ br-2.6/include/net/sock.h
@@ -1354,12 +1354,12 @@ extern int sock_get_timestamp(struct soc
  * Enable debug/info messages 
  */
 
-#if 0
-#define NETDEBUG(fmt, args...) do { } while (0)
-#define LIMIT_NETDEBUG(fmt, args...) do { } while(0)
-#else
+#ifdef CONFIG_NETDEBUG
 #define NETDEBUG(fmt, args...) printk(fmt,##args)
 #define LIMIT_NETDEBUG(fmt, args...) do { if (net_ratelimit()) 
printk(fmt,##args); } while(0)
+#else
+#define NETDEBUG(fmt, args...) do { } while (0)
+#define LIMIT_NETDEBUG(fmt, args...) do { } while(0)
 #endif
 
 /*
--- br-2.6.orig/net/Kconfig
+++ br-2.6/net/Kconfig
@@ -27,6 +27,13 @@ if NET
 
 menu "Networking options"
 
+config NETDEBUG
+   bool "Network packet debugging"
+   help
+ You can say Y here if you want to get additional messages useful in
+ debugging bad packets, but can overwhelm logs under denial of service
+ attacks.
+
 source "net/packet/Kconfig"
 source "net/unix/Kconfig"
 source "net/xfrm/Kconfig"
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] snap: needs hardware checksum fix

2006-02-02 Thread Stephen Hemminger

The SNAP code pops off its 5-byte header, but doesn't adjust
the checksum. This would cause problems when using a device that
does IP over SNAP and hardware receive checksums.

Signed-off-by: Stephen Hemminger <[EMAIL PROTECTED]>


--- br-2.6.orig/net/802/psnap.c
+++ br-2.6/net/802/psnap.c
@@ -59,8 +59,10 @@ static int snap_rcv(struct sk_buff *skb,
proto = find_snap_client(skb->h.raw);
if (proto) {
/* Pass the frame on. */
+   u8 *hdr = skb->data;
skb->h.raw  += 5;
skb_pull(skb, 5);
+   skb_postpull_rcsum(skb, hdr, 5);
rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev);
} else {
skb->sk = NULL;
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Fw: [Bugme-new] [Bug 5999] New: Iptables modules fail to load on Alpha arch

2006-02-02 Thread Andrew Morton


Odd.  Not sure what the "Could not allocate 60 bytes percpu data" is due to, 
either.

Begin forwarded message:

Date: Thu, 2 Feb 2006 15:13:29 -0800
From: [EMAIL PROTECTED]
To: [EMAIL PROTECTED]
Subject: [Bugme-new] [Bug 5999] New: Iptables modules fail to load on Alpha arch


http://bugzilla.kernel.org/show_bug.cgi?id=5999

   Summary: Iptables modules fail to load on Alpha arch
Kernel Version: 2.6.15.2
Status: NEW
  Severity: high
 Owner: [EMAIL PROTECTED]
 Submitter: [EMAIL PROTECTED]


Most recent kernel where this bug did not occur: Unknown
Distribution: CentOS 4.2
Hardware Environment:
cpu : Alpha
cpu model   : EV67
cpu variation   : 7
cpu revision: 0
cpu serial number   : AY00607688
system type : Tsunami
system variation: Clipper
system revision : 0
system serial number: 4051DPSZ
cycle frequency [Hz]: 6
timer frequency [Hz]: 1024.00
page size [bytes]   : 8192
phys. address bits  : 44
max. addr. space #  : 255
BogoMIPS: 1305.32
kernel unaligned acc: 0 (pc=0,va=0)
user unaligned acc  : 0 (pc=0,va=0)
platform string : AlphaServer ES40
cpus detected   : 2
cpus active : 2
cpu active mask : 0003
L1 Icache   : 64K, 2-way, 64b line
L1 Dcache   : 64K, 2-way, 64b line
L2 cache: 8192K, 1-way, 64b line

MemTotal:  3095248 kB
MemFree:   2324784 kB
Buffers: 52592 kB
Cached: 587248 kB
SwapCached:  0 kB
Active: 437152 kB
Inactive:   245064 kB
HighTotal:   0 kB
HighFree:0 kB
LowTotal:  3095248 kB
LowFree:   2324784 kB
SwapTotal:  530128 kB
SwapFree:   530128 kB
Dirty:   0 kB
Writeback:   0 kB
Mapped:  61320 kB
Slab:72256 kB
Committed_AS:52216 kB
PageTables:904 kB
VmallocTotal:  8388608 kB
VmallocUsed:  5368 kB
VmallocChunk:  8382840 kB

 
Problem Description:
When trying to load the iptables service I get the following error:

Feb  2 16:05:56 alphacrow kernel: ip_tables: (C) 2000-2002 Netfilter core team
Feb  2 16:05:56 alphacrow kernel: Could not allocate 60 bytes percpu data
Feb  2 16:05:56 alphacrow modprobe: WARNING: Error inserting ip_conntrack
(/lib/modules/2.6.9-22.0.2.ECsmp/kernel/net/ipv4/netfilter/ip_conntrack.ko):
Cannot allocate memory
Feb  2 16:05:56 alphacrow kernel: ipt_state: Unknown symbol 
ip_conntrack_untracked
Feb  2 16:05:56 alphacrow kernel: ipt_state: Unknown symbol need_ip_conntrack
Feb  2 16:05:56 alphacrow modprobe: FATAL: Error inserting ipt_state
(/lib/modules/2.6.9-22.0.2.ECsmp/kernel/net/ipv4/netfilter/ipt_state.ko):
Unknown symbol in module, or unknown parameter (see dmesg)
Feb  2 16:05:56 alphacrow iptables:  failed

Occurs on both the 2.6.9 kernel shipped with CentOS 4.2 and also the official
2.6.15.2 kernel I built.

Steps to reproduce: []# service iptables start

--- You are receiving this mail because: ---
You are on the CC list for the bug, or are watching someone who is.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[patch 2/2] natsemi: NAPI and a bugfix

2006-02-02 Thread Mark Brown

As documented in National application note 1287 the RX state machine on
the natsemi chip can lock up under some conditions (mostly related to
heavy load).  When this happens a series of bogus packets are reported
by the chip including some oversized frames prior to the final lockup.

This patch implements the fix from the application note: when an
oversized packet is reported it resets the RX state machine, dropping
any currently pending packets.

Signed-off-by: Mark Brown <[EMAIL PROTECTED]>

Index: linux-2.6.15.2/drivers/net/natsemi.c
===
--- linux-2.6.15.2.orig/drivers/net/natsemi.c   2006-02-01 22:59:29.0 
+
+++ linux-2.6.15.2/drivers/net/natsemi.c2006-02-02 00:05:23.0 
+
@@ -1498,6 +1498,31 @@ static void natsemi_reset(struct net_dev
writel(rfcr, ioaddr + RxFilterAddr);
 }
 
+static void reset_rx(struct net_device *dev)
+{
+   int i;
+   struct netdev_private *np = netdev_priv(dev);
+   void __iomem *ioaddr = ns_ioaddr(dev);
+
+   np->intr_status &= ~RxResetDone;
+
+   writel(RxReset, ioaddr + ChipCmd);
+
+   for (i=0;i<NATSEMI_HW_TIMEOUT;i++) {
+   np->intr_status |= readl(ioaddr + IntrStatus);
+   if (np->intr_status & RxResetDone)
+   break;
+   udelay(15);
+   }
+   if (i==NATSEMI_HW_TIMEOUT) {
+   printk(KERN_WARNING "%s: RX reset did not complete in %d usec.\n",
+  dev->name, i*15);
+   } else if (netif_msg_hw(np)) {
+   printk(KERN_WARNING "%s: RX reset took %d usec.\n",
+  dev->name, i*15);
+   }
+}
+
 static void natsemi_reload_eeprom(struct net_device *dev)
 {
struct netdev_private *np = netdev_priv(dev);
@@ -2292,6 +2317,23 @@ static void netdev_rx(struct net_device 
"status %#08x.\n", dev->name,
np->cur_rx, desc_status);
np->stats.rx_length_errors++;
+
+   /* The RX state machine has probably
+* locked up beneath us.  Follow the
+* reset procedure documented in
+* AN-1287. */
+
+   spin_lock_irq(&np->lock);
+   reset_rx(dev);
+   reinit_rx(dev);
+   writel(np->ring_dma, ioaddr + RxRingPtr);
+   check_link(dev);
+   spin_unlock_irq(&np->lock);
+
+   /* We'll enable RX on exit from this
+* function. */
+   break;
+
} else {
/* There was an error. */
np->stats.rx_errors++;

--
"You grabbed my hand and we fell into it, like a daydream - or a fever."
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[patch 1/2] natsemi: NAPI and a bugfix

2006-02-02 Thread Mark Brown

This patch converts the natsemi driver to use NAPI.  It was originally
based on one written by Harald Welte, though it has since been modified
quite a bit, most extensively in order to remove the ability to disable
NAPI since none of the other drivers seem to provide that functionality
any more.

Signed-off-by: Mark Brown <[EMAIL PROTECTED]>

Index: linux-2.6.15.2/drivers/net/natsemi.c
===
--- linux-2.6.15.2.orig/drivers/net/natsemi.c   2006-01-31 06:25:07.0 
+
+++ linux-2.6.15.2/drivers/net/natsemi.c2006-02-01 22:59:29.0 
+
@@ -3,6 +3,7 @@
Written/copyright 1999-2001 by Donald Becker.
Portions copyright (c) 2001,2002 Sun Microsystems ([EMAIL PROTECTED])
Portions copyright 2001,2002 Manfred Spraul ([EMAIL PROTECTED])
+   Portions copyright 2004 Harald Welte <[EMAIL PROTECTED]>
 
This software may be used and distributed according to the terms of
the GNU General Public License (GPL), incorporated herein by reference.
@@ -135,8 +136,6 @@
 
TODO:
* big endian support with CFG:BEM instead of cpu_to_le32
-   * support for an external PHY
-   * NAPI
 */
 
 #include 
@@ -160,6 +159,7 @@
 #include 
 #include 
 #include 
+#include 
 #include  /* Processor type for cache alignment. */
 #include 
 #include 
@@ -183,8 +183,6 @@
 NETIF_MSG_TX_ERR)
 static int debug = -1;
 
-/* Maximum events (Rx packets, etc.) to handle at each interrupt. */
-static int max_interrupt_work = 20;
 static int mtu;
 
 /* Maximum number of multicast addresses to filter (vs. rx-all-multicast).
@@ -251,14 +249,11 @@ MODULE_AUTHOR("Donald Becker <[EMAIL PROTECTED]
 MODULE_DESCRIPTION("National Semiconductor DP8381x series PCI Ethernet driver");
 MODULE_LICENSE("GPL");
 
-module_param(max_interrupt_work, int, 0);
 module_param(mtu, int, 0);
 module_param(debug, int, 0);
 module_param(rx_copybreak, int, 0);
 module_param_array(options, int, NULL, 0);
 module_param_array(full_duplex, int, NULL, 0);
-MODULE_PARM_DESC(max_interrupt_work, 
-   "DP8381x maximum events handled per interrupt");
 MODULE_PARM_DESC(mtu, "DP8381x MTU (all boards)");
 MODULE_PARM_DESC(debug, "DP8381x default debug level");
 MODULE_PARM_DESC(rx_copybreak, 
@@ -691,6 +686,8 @@ struct netdev_private {
/* Based on MTU+slack. */
unsigned int rx_buf_sz;
int oom;
+   /* Interrupt status */
+   u32 intr_status;
/* Do not touch the nic registers */
int hands_off;
/* external phy that is used: only valid if dev->if_port != PORT_TP */
@@ -748,7 +745,8 @@ static void init_registers(struct net_de
 static int start_tx(struct sk_buff *skb, struct net_device *dev);
 static irqreturn_t intr_handler(int irq, void *dev_instance, struct pt_regs *regs);
 static void netdev_error(struct net_device *dev, int intr_status);
-static void netdev_rx(struct net_device *dev);
+static int natsemi_poll(struct net_device *dev, int *budget);
+static void netdev_rx(struct net_device *dev, int *work_done, int work_to_do);
 static void netdev_tx_done(struct net_device *dev);
 static int natsemi_change_mtu(struct net_device *dev, int new_mtu);
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -776,6 +774,18 @@ static inline void __iomem *ns_ioaddr(st
return (void __iomem *) dev->base_addr;
 }
 
+static inline void natsemi_irq_enable(struct net_device *dev)
+{
+   writel(1, ns_ioaddr(dev) + IntrEnable);
+   readl(ns_ioaddr(dev) + IntrEnable);
+}
+
+static inline void natsemi_irq_disable(struct net_device *dev)
+{
+   writel(0, ns_ioaddr(dev) + IntrEnable);
+   readl(ns_ioaddr(dev) + IntrEnable);
+}
+
 static void move_int_phy(struct net_device *dev, int addr)
 {
struct netdev_private *np = netdev_priv(dev);
@@ -879,6 +889,7 @@ static int __devinit natsemi_probe1 (str
spin_lock_init(&np->lock);
np->msg_enable = (debug >= 0) ? (1<<debug)-1 : NATSEMI_DEF_MSG;
np->hands_off = 0;
+   np->intr_status = 0;
 
/* Initial port:
 * - If the nic was configured to use an external phy and if find_mii
@@ -932,6 +943,9 @@ static int __devinit natsemi_probe1 (str
dev->do_ioctl = &netdev_ioctl;
dev->tx_timeout = &tx_timeout;
dev->watchdog_timeo = TX_TIMEOUT;
+   dev->poll = natsemi_poll;
+   dev->weight = 64;
+
 #ifdef CONFIG_NET_POLL_CONTROLLER
dev->poll_controller = &natsemi_poll_controller;
 #endif
@@ -2158,68 +2172,92 @@ static void netdev_tx_done(struct net_de
}
 }
 
-/* The interrupt handler does all of the Rx thread work and cleans up
-   after the Tx thread. */
+/* The interrupt handler doesn't actually handle interrupts itself, it
+ * schedules a NAPI poll if there is anything to do. */
 static irqreturn_t intr_handler(int irq, void *dev_instance, struct pt_regs *rgs)
 {
struct net_device *dev = dev_instance;
struct netdev_private *np = netdev_priv(dev);
void __iomem *

[patch 0/2] natsemi: NAPI and a bugfix

2006-02-02 Thread Mark Brown

These patches provide a series of updates to the natsemi driver: the
NAPI patch I've submitted before and a workaround for an issue with the
hardware that is easier to provoke at higher data rates.

  1/2: Convert the driver to NAPI
  2/2: Fix hardware issue with RX state machine lock up

--
"You grabbed my hand and we fell into it, like a daydream - or a fever."
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [wireless-2.6] d80211/ieee80211 symbol clash

2006-02-02 Thread Jouni Malinen

On Thu, Feb 02, 2006 at 02:55:43PM -0800, shemminger wrote:

> But then the module autoloader is going to work correctly, and what
> about people using drivers that use conflicting stacks? Get the
> namespace issues sorted out before you consider getting it into the
> mainline.

I thought that only one IEEE 802.11 stack was going to be going into the
mainline, i.e., this issue will be automatically resolved at that point
and it only applies to the non-mainline tree that allows both
implementations to be used temporarily while we go through selecting
what we want to see in the mainline tree.

-- 
Jouni MalinenPGP id EFC895FA
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [wireless-2.6] d80211/ieee80211 symbol clash

2006-02-02 Thread Jouni Malinen

On Thu, Feb 02, 2006 at 06:01:45PM -0500, Jeff Garzik wrote:

> Avoiding namespace clashes means that you avoid confusing all manner of 
> common developer tools.  In particular, 'make allyesconfig' is a 
> valuable developer tool that should not be broken.

I agree with this completely; I just would like to avoid the issues of
doing large scale renames for a case that will hopefully be resolved
anyway shortly by ending up with just one implementation.

> Don't take this as an endorsement for mass renaming, however.  The 
> smallest PRACTICAL solution may be simply renaming one of the clashing 
> functions to ieee80211_rcv, for example.

This would certainly be easier to handle than full renaming of all
function names in net/d80211.

-- 
Jouni MalinenPGP id EFC895FA
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [wireless-2.6] d80211/ieee80211 symbol clash

2006-02-02 Thread Jeff Garzik


Jouni Malinen wrote:

On Thu, Feb 02, 2006 at 08:36:26PM +0100, Michael Buesch wrote:



net/ieee80211/built-in.o: In function `ieee80211_rx':
: multiple definition of `ieee80211_rx'
net/d80211/built-in.o:: first defined here




But how to solve it? I suggest to change all dscape function prefixes
from ieee80211_FOO to d80211_FOO.



I would rather not see this kind of change in the function names in
net/d80211. I would be willing to live with the clashing symbols for the
time being since the goal is to get into one stack in the end anyway.
Linking in both stacks statically or loading both as kernel modules at the
same time is something that I don't see as a very strong requirement at
the moment.


It may be a non-obvious need, but the need nonetheless exists.

Avoiding namespace clashes means that you avoid confusing all manner of 
common developer tools.  In particular, 'make allyesconfig' is a 
valuable developer tool that should not be broken.


Don't take this as an endorsement for mass renaming, however.  The 
smallest PRACTICAL solution may be simply renaming one of the clashing 
functions to ieee80211_rcv, for example.


Jeff


-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [wireless-2.6] d80211/ieee80211 symbol clash

2006-02-02 Thread shemminger

On Thu, 2006-02-02 at 13:56 -0800, Jouni Malinen wrote:
> On Thu, Feb 02, 2006 at 08:36:26PM +0100, Michael Buesch wrote:
> 
> > net/ieee80211/built-in.o: In function `ieee80211_rx':
> > : multiple definition of `ieee80211_rx'
> > net/d80211/built-in.o:: first defined here
> 
> > But how to solve it? I suggest to change all dscape function prefixes
> > from ieee80211_FOO to d80211_FOO.
> 
> I would rather not see this kind of change in the function names in
> net/d80211. I would be willing to live with the clashing symbols for the
> time being since the goal is to get into one stack in the end anyway.
> Linking in both stacks statically or loading both as kernel modules at the
> same time is something that I don't see as a very strong requirement at
> the moment.

But then the module autoloader is going to work correctly, and what
about people using drivers that use conflicting stacks? Get the
namespace issues sorted out before you consider getting it into the
mainline.

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Kernel BUG at drivers/net/tg3.c:2914 on SMP amd64

2006-02-02 Thread Mike Crowe

On Thu, 2006-02-02 at 22:05 +, Mike Crowe wrote:
>> Perhaps this new problem is unrelated. I shall try and reproduce
>> it. Is there anything I should try when the interface is locked up to
>> help?
 
On Thu, Feb 02, 2006 at 12:49:53PM -0800, Michael Chan wrote:
> Did this setup use to work before?

It's a new machine.

Having brought the interface back up again as mentioned in my last
email and left it under network load for a while I returned to the
machine to find an MCE. I'm suspicious that there may be a hardware
problem or a more deep rooted software problem. :(

I'll try and reproduce the problem again tomorrow. I can't reboot the
machine remotely to continue today. :(

CPU 2: Machine Check Exception:4 Bank 4: b2070f0f
TSC 92c1e455f5a 

CPU 0: Machine Check Exception:4 Bank 4: b2070f0f
TSC 92c1e65b4c9 
Kernel panic - not syncing: Machine check
 NMI Watchdog detected LOCKUP on CPU 2
CPU 2 
Modules linked in: nfsd nfs lockd nfs_acl sunrpc autofs4 ipv6 megaraid dm_mod 
ide_disk joydev evdev i2c_amd8111 i2c_amd756 shpchp psmouse i2c_core pcspkr 
serio_raw pci_hotplug hw_random xfs exportfs ide_cd cdrom ide_generic sd_mod 
generic megaraid_mbox amd74xx e100 scsi_mod mii ohci_hcd tg3 megaraid_mm 
ide_core thermal processor fan
Pid: 0, comm: swapper Tainted: G   M  2.6.15-1-amd64-k8-smp #1
RIP: 0010:[] {__smp_call_function+106}
RSP: 0018:8100f9fb4cb8  EFLAGS: 0093
RAX: 0001 RBX: 0003 RCX: 0004
RDX:  RSI:  RDI: 804116a0
RBP:  R08: 0020 R09: 0004
R10: 0004 R11:  R12: 801178f4
R13:  R14: 092c1e455a9b R15: 802f255b
FS:  2b2c5700() GS:8040a900() knlGS:
CS:  0010 DS: 0018 ES: 0018 CR0: 8005003b
CR2: 2aac CR3: 0001fbd29000 CR4: 06e0
Process swapper (pid: 0, threadinfo 8100f9fb, task 8100f9faa0c0)
Stack: 801178f4  0001  
   0097    
   8033d6a0 80117933 
Call Trace: <#MC> {smp_really_stop_cpu+0} 
{smp_send_stop+53}
   {panic+210} {oops_begin+92}
   {print_mce+136} {mce_available+0}
   {do_machine_check+749} 
{machine_check+127}
   {timer_interrupt+99} 
{handle_IRQ_event+41}
   {__do_IRQ+147} {do_IRQ+45}
   {ret_from_intr+0}   
{thread_return+0}
   {default_idle+57} {cpu_idle+93}
   

Code: 8b 44 24 10 39 d8 75 f6 85 ed 74 12 8b 44 24 14 39 d8 74 0a 
console shuts up ...
 <0>
CPU 0: Machine Check Exception:4 Bank 4: b2070f0f
Kernel panic - not syncing: Aiee, killing interrupt handler!
 <0>TSC 92c1e65b4c9 
Kernel panic - not syncing: Machine check
 NMI Watchdog detected LOCKUP on CPU 0
CPU 0 
Modules linked in: nfsd nfs lockd nfs_acl sunrpc autofs4 ipv6 megaraid dm_mod 
ide_disk joydev evdev i2c_amd8111 i2c_amd756 shpchp psmouse i2c_core pcspkr 
serio_raw pci_hotplug hw_random xfs exportfs ide_cd cdrom ide_generic sd_mod 
generic megaraid_mbox amd74xx e100 scsi_mod mii ohci_hcd tg3 megaraid_mm 
ide_core thermal processor fan
Pid: 0, comm: swapper Tainted: G   M  2.6.15-1-amd64-k8-smp #1
RIP: 0010:[] {__smp_call_function+106}
RSP: 0018:803b3d78  EFLAGS: 0097
RAX: 0001 RBX: 0002 RCX: 0003
RDX:  RSI:  RDI: 804116a0
RBP:  R08: 0020 R09: 0003
R10: 0003 R11:  R12: 801178f4
R13:  R14: 092c1e65b132 R15: 802f255b
FS:  2af15090() GS:8040a800() knlGS:
CS:  0010 DS: 0018 ES: 0018 CR0: 8005003b
CR2: 0058d824 CR3: 00101000 CR4: 06e0
Process swapper (pid: 0, threadinfo 80416000, task 8033a6a0)
Stack: 801178f4  0001  
   0400 0001   
   8033d6a0 80117933 
Call Trace: <#MC> {smp_really_stop_cpu+0} 
{smp_send_stop+53}
   {panic+210} {oops_begin+92}
   {print_mce+136} {mce_available+0}
   {do_machine_check+749} 
{machine_check+127}
   {default_idle+57}   
{cpu_idle+93}
   {start_kernel+442} 
{x86_64_start_kernel+423}
   

Code: 8b 44 24 10 39 d8 75 f6 85 ed 74 12 8b 44 24 14 39 d8 74 0a 
console shuts up ...
 NMI Watchdog detected LOCKUP on CPU 3
CPU 3 
Modules linked in: nfsd nfs lockd nfs_acl sunrpc autofs4 ipv6 megaraid dm_mod 
ide_disk joydev evdev i2c_amd8111 i2c_amd756 shpchp psmouse i2c_core pcspkr 
serio_raw pci_hotplug hw_random xfs exportfs ide_cd cdrom ide_generic sd_mod 
generic megaraid_mbox amd74xx e100 scsi_mod mii ohci_hcd tg3 megaraid_mm 
ide_core thermal processor fan
Pid

Re: Kernel BUG at drivers/net/tg3.c:2914 on SMP amd64

2006-02-02 Thread Michael Chan

On Thu, 2006-02-02 at 22:05 +, Mike Crowe wrote:

> Perhaps this new problem is unrelated. I shall try and reproduce
> it. Is there anything I should try when the interface is locked up to
> help?
> 

1. Run ethtool -d eth? >  before you get a NETDEV WATCHDOG
timeout and before you do ifdown. Please send me the output.

2. Check /proc/interrupts to see if the device is still generating
interrupts. You can send some packets to it or disconnect the network
cable to generate interrupts.

3. Run ethtool -S eth? to see if there are any errors.

Did this setup use to work before?

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Kernel BUG at drivers/net/tg3.c:2914 on SMP amd64

2006-02-02 Thread Mike Crowe

On Thu, 2006-02-02 at 13:37 +, I wrote:
>> I'm running the Debian 2.6.15 kernel from backports.org on a machine
>> with two Opteron 275s. I am getting a BUG in tg3.c quite reliably if I
>> ping flood the machine from a few others and cause a bit of other
>> network activity. Sometimes it takes a few minutes, sometimes half an
>> hour. The BUG also fires in more realistic situations - it just takes
>> longer to reproduce.
> 
On Thu, Feb 02, 2006 at 08:01:57AM -0800, Michael Chan wrote:
> Most likely due to MMIO being re-ordered. We've seen this on a number of
> AMD machines.
> 
> Please try this test patch below. If the problem goes away, send me the
> output of lspci -vvvxxx on your machine and I'll create a patch to fix
> this automatically on your machine. Thanks.

[patch snipped]

I thought it was working quite well with the patch - it certainly
seemed to stay up for longer than it had done previously.

Unfortunately after several hours of being ping flooded the network
interface has just stopped working completely. No responses to ping
requests at all. The machine looks up and happy from the serial
console and nothing has appeared in the kernel log. Sending ping
packets from the host itself doesn't increase the packet counts as
reported by ifconfig.

Taking the interface down and back up again recovered the situation
and the machine is pinging again.

Perhaps this new problem is unrelated. I shall try and reproduce
it. Is there anything I should try when the interface is locked up to
help?

-- 
Mike Crowe
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [wireless-2.6] d80211/ieee80211 symbol clash

2006-02-02 Thread Jouni Malinen

On Thu, Feb 02, 2006 at 08:36:26PM +0100, Michael Buesch wrote:

> net/ieee80211/built-in.o: In function `ieee80211_rx':
> : multiple definition of `ieee80211_rx'
> net/d80211/built-in.o:: first defined here

> But how to solve it? I suggest to change all dscape function prefixes
> from ieee80211_FOO to d80211_FOO.

I would rather not see this kind of change in the function names in
net/d80211. I would be willing to live with the clashing symbols for the
time being since the goal is to get into one stack in the end anyway.
Linking in both stacks statically or loading both as kernel modules at the
same time is something that I don't see as a very strong requirement at
the moment.

-- 
Jouni MalinenPGP id EFC895FA
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC] Poor Network Performance with e1000 on 2.6.14.3

2006-02-02 Thread Ben Greear


Rick Jones wrote:

I haven't been able to get a TCP connection to saturate a 1Gbps link
in both directions simultaneously.  I *have* been able to fully saturate
2 pro/1000 NICs on the same machine using pktgen, so the NIC/driver can
support it if only TCP can run fast enough...



It isn't quite saturating, but:


I wrote a small tool that does very little other than full-duplex (on a single 
socket)
non-blocking IO.  I have this running between one machine with dual 3.4
Xeons and another machine with dual 2.8Ghz Xeons.  The busses are at least
PCI-X 64/100Mhz, and I'm using the 10Gbe NICs from Chelsio.  MTU is 1500.

With 64k read/write sizes, 8M send/rcv socket buffers, I am sustaining
about 1.37Gbps in both directions (counting ethernet overhead,
the rate is about 1.41Gbps in both directions.)

So, full GigE should be possible with GigE NICs as well...

Ben

--
Ben Greear <[EMAIL PROTECTED]>
Candela Technologies Inc  http://www.candelatech.com

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

RE: Van Jacobson net channels and NIC channels

2006-02-02 Thread Leonid Grossman

 

> -Original Message-
> From: Andi Kleen [mailto:[EMAIL PROTECTED] 

> Why are you saying it can't be used by the host? The stack 
> should be fully ready for it.

Sorry, I should have said "it can't be used by the host to the full
potential of the feature" :-).
It does work for us now, as a "driver only" implementation, but setting
IRQ affinity from the kernel (as well as couple other decisions that we
would like host to make, rather than making them in the driver) should
help quite a bit.

> 
> The only small piece missing is a way to set the IRQ affinity 
> from the kernel, but that can be simulated from user space by 
> tweaking them in /proc. If you have a prototype patch adding 
> the kernel interfaces wouldn't be that hard neither.

Agreed, at this point we should put a patch forward and tweak the kernel
interface later on.

> 
> Also how about per CPU TX completion interrupts? 

Yes, a channel can have separate Tx completion and RX MSI-X interrupts
(and an exception MSI-X interrupt, if desired). It's up to 64 MSI-X
interrupts total.

> 
> -Andi
> 
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [wireless-2.6] d80211/ieee80211 symbol clash

2006-02-02 Thread Michael Buesch

On Thursday 02 February 2006 21:04, John W. Linville wrote:
> On Thu, Feb 02, 2006 at 08:36:26PM +0100, Michael Buesch wrote:
> > Not sure, if I am the first one compiling the domesday branch,
> > but the ieee80211_rx symbol clashes:
> > 
> > net/ieee80211/built-in.o: In function `ieee80211_rx':
> > : multiple definition of `ieee80211_rx'
> > net/d80211/built-in.o:: first defined here
> > 
> > Maybe people always compile the stacks as modules, so this does
> > only appear on insmod time... .
> 
> It shows-up w/ allyesconfig.

Yes.

> Are you saying it shows-up w/ modules 
> as well?

No.

> Soon we will have to pick one stack or the other to pursue.
> (We probably already should have done it...)

Well, I really think we can't stay with either stack.
We must begin to merge both stacks to get a usable one.
And we need to rewrite the userspace ABI. (as discussed in
other threads).
But I would suggest to wait until the Wireless Summit at OSDL
is done. It is still two months from now, but I hope it
is worth waiting for it.
Or do you think we should start merging functionality _now_,
to get more practical experience for the summit?

Anyway. In my opinion best will be to:
Stay with current in-kernel code and begin to merge functionality
from dscape. I think the ieee80211 API has to be cleaned up, too.
There is too much private stuff in the public structures. d80211 is
better at this point.

-- 
Greetings Michael.

pgppTZ4IaHFkM.pgp
Description: PGP signature

Re: [wireless-2.6] d80211/ieee80211 symbol clash

2006-02-02 Thread John W. Linville

On Thu, Feb 02, 2006 at 08:36:26PM +0100, Michael Buesch wrote:
> Not sure, if I am the first one compiling the domesday branch,
> but the ieee80211_rx symbol clashes:
> 
> net/ieee80211/built-in.o: In function `ieee80211_rx':
> : multiple definition of `ieee80211_rx'
> net/d80211/built-in.o:: first defined here
> 
> Maybe people always compile the stacks as modules, so this does
> only appear on insmod time... .

It shows-up w/ allyesconfig.  Are you saying it shows-up w/ modules
as well?  Surely that is only if you load both the ieee80211 and
d80211 modules at the same time?

> But how to solve it? I suggest to change all dscape function prefixes
> from ieee80211_FOO to d80211_FOO.
> As bcm43xx is the only driver using it currently (AFAIK), this should
> be easiest and safest.

I'm open to this, but I had not considered this to be a priority
to fix.  I don't expect to be using the domesday branch forever.
Soon we will have to pick one stack or the other to pursue.
(We probably already should have done it...)

John
-- 
John W. Linville
[EMAIL PROTECTED]
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[wireless-2.6] d80211/ieee80211 symbol clash

2006-02-02 Thread Michael Buesch

Not sure, if I am the first one compiling the domesday branch,
but the ieee80211_rx symbol clashes:

net/ieee80211/built-in.o: In function `ieee80211_rx':
: multiple definition of `ieee80211_rx'
net/d80211/built-in.o:: first defined here

Maybe people always compile the stacks as modules, so this does
only appear on insmod time... .

But how to solve it? I suggest to change all dscape function prefixes
from ieee80211_FOO to d80211_FOO.
As bcm43xx is the only driver using it currently (AFAIK), this should
be easiest and safest.

-- 
Greetings Michael.


pgp3AmWdaGPFs.pgp
Description: PGP signature

Re: badness in dst_release

2006-02-02 Thread Dave Jones

On Wed, Feb 01, 2006 at 01:08:33PM -0500, Dave Jones wrote:
 > I managed to get a box running 2.6.16rc1-git4 to spit this out..
 > 
 >   Dave
 > 
 > UDP: bad checksum. From 192.168.79.115:43047 to 192.168.76.106:61494 ulen 
 > 1083
 > Badness in dst_release at include/net/dst.h:154 (Not tainted)
 >  [] __kfree_skb+0x36/0xdd
 >  [] ip_frag_destroy+0xe2/0x101 [] 
 > ip_defrag+0xb3/0xad1
 >  [] cache_alloc_debugcheck_after+0xc1/0xf9 [] 
 > ip_local_deliver+0x1f/0x1fe
 >  [] ip_rcv+0x401/0x478 [] 
 > netif_receive_skb+0x211/0x259
 >  [] process_backlog+0x7a/0x100 [] 
 > net_rx_action+0x99/0x170
 >  [] __do_softirq+0x58/0xc2 [] do_softirq+0x46/0x4e
 >  ===
 >  [] apic_timer_interrupt+0x1c/0x24 [] 
 > default_idle+0x0/0x55
 >  [] default_idle+0x2c/0x55 [] cpu_idle+0x8f/0xa8
 >  [] start_kernel+0x301/0x307<4>printk: 267 messages suppressed.

Here's a second flavour.

Badness in dst_release at include/net/dst.h:154 (Not tainted)
 [] __kfree_skb+0x36/0xdd [] ip_rcv+0x46d/0x478
 [] netif_receive_skb+0x211/0x259 [] 
process_backlog+0x7a/0x100
 [] net_rx_action+0x99/0x170 [] __do_softirq+0x58/0xc2
 [] do_softirq+0x46/0x4e<0> ===
 [] do_IRQ+0x72/0x7b
 [] common_interrupt+0x1a/0x20 [] default_idle+0x0/0x55
 [] default_idle+0x2c/0x55 [] cpu_idle+0x8f/0xa8
 [] start_kernel+0x301/0x307Badness in dst_release at 
include/net/dst.h:154 (Not tainted)
 [] __kfree_skb+0x36/0xdd
 [] ip_frag_destroy+0xe2/0x101 [] ip_defrag+0xb3/0xad1
 [] cache_alloc_debugcheck_after+0xc1/0xf9 [] 
ip_local_deliver+0x1f/0x1fe
 [] ip_rcv+0x401/0x478 [] netif_receive_skb+0x211/0x259
 [] process_backlog+0x7a/0x100 [] 
net_rx_action+0x99/0x170
 [] __do_softirq+0x58/0xc2 [] do_softirq+0x46/0x4e

These are incredibly easy to trigger with 'isic' for anyone wanting to chase 
these down.
In the space of a 3 day test, there are 9573 instances of that Badness message.

Dave

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

slab corruption in 2.6.16rc1-git4

2006-02-02 Thread Dave Jones

I've had a box being tortured with random junk packets (created with isic)
for a few days, and it spat this out last night..

Feb  1 04:28:09 trogdor kernel: Slab corruption: (Not tainted) start=cefc8a9c, 
len=244
Feb  1 04:28:09 trogdor kernel: Redzone: 0x5a2cf071/0x5a2cf071.
Feb  1 04:28:09 trogdor kernel: Last user: [](dst_destroy+0x7f/0xab)
Feb  1 04:28:09 trogdor kernel:  [] check_poison_obj+0x73/0x16a 
[] cache_alloc_debugcheck_after+0x22/0xf9
Feb  1 04:28:09 trogdor kernel:  [] kmem_cache_alloc+0x7d/0x86 
[] dst_alloc+0x27/0x7b
Feb  1 04:28:09 trogdor kernel:  [] dst_alloc+0x27/0x7b 
[] __ip_route_output_key+0x5a2/0x843
Feb  1 04:28:09 trogdor kernel:  [] issue_and_wait+0x28/0x93 [3c59x]  
   [] boomerang_start_xmit+0x31c/0x335 [3c59x]
Feb  1 04:28:09 trogdor kernel:  [] dev_queue_xmit+0x208/0x20f 
[] ip_route_output_flow+0x13/0x57
Feb  1 04:28:09 trogdor kernel:  [] ip_route_output_key+0x9/0xb 
[] icmp_send+0x282/0x397
Feb  1 04:28:09 trogdor kernel:  [] ip_route_input+0x3b/0xc6a 
[] _spin_lock_irqsave+0x9/0xd
Feb  1 04:28:09 trogdor kernel:  [] ip_options_compile+0x3da/0x3f3
 [] ip_rcv+0x322/0x478
Feb  1 04:28:09 trogdor kernel:  [] netif_receive_skb+0x211/0x259 
[] process_backlog+0x7a/0x100
Feb  1 04:28:09 trogdor kernel:  [] net_rx_action+0x99/0x170 
[] __do_softirq+0x58/0xc2
Feb  1 04:28:09 trogdor kernel:  [] do_softirq+0x46/0x4e<0> 
===
Feb  1 04:28:09 trogdor kernel:  [] do_IRQ+0x72/0x7b
Feb  1 04:28:09 trogdor kernel:  [] common_interrupt+0x1a/0x20 
[] default_idle+0x0/0x55
Feb  1 04:28:09 trogdor kernel:  [] default_idle+0x2c/0x55 
[] cpu_idle+0x8f/0xa8
Feb  1 04:28:09 trogdor kernel:  [] start_kernel+0x301/0x307
<3>000: 6b 6b 6b 6b 6a 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b
Feb  1 04:28:09 trogdor kernel: Prev obj: start=cefc899c, len=244
Feb  1 04:28:09 trogdor kernel: Redzone: 0x5a2cf071/0x5a2cf071.
Feb  1 04:28:09 trogdor kernel: Last user: [](dst_destroy+0x7f/0xab)
Feb  1 04:28:09 trogdor kernel: 000: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 
6b 6b
Feb  1 04:28:09 trogdor kernel: 010: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 
6b 6b
Feb  1 04:28:09 trogdor kernel: Next obj: start=cefc8b9c, len=244
Feb  1 04:28:09 trogdor kernel: Redzone: 0x170fc2a5/0x170fc2a5.
Feb  1 04:28:09 trogdor kernel: Last user: [](dst_alloc+0x27/0x7b)
Feb  1 04:28:09 trogdor kernel: 000: 7c dd 63 cf 01 00 00 00 a6 a2 00 00 00 00 
00 00
Feb  1 04:28:09 trogdor kernel: 010: 00 b7 37 c0 00 00 02 00 01 00 00 00 56 a9 
1b 02

Note the first slab corruption line..

000: 6b 6b 6b 6b 6a 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b

has a single bit error, which _could_ be bad ram, as this box is an ancient
4-way pentium pro, so its days may be numbered. I'll give it a spin with
memtest86 next time I'm at the office, but I wanted to report this just
in case, as the last few days I've been seeing a number of slab corruption
issues on different boxes, some of which I know are definitely ok wrt hardware 
problems.

Dave

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] TCP MTU probing

2006-02-02 Thread John Heffner

On Thursday 02 February 2006 13:23, John Heffner wrote:
> Implementation of packetization layer path mtu discovery for TCP, based on
> the internet-draft currently found at
> .

Fixed to turn off by default (oops).

Signed-off-by: John Heffner <[EMAIL PROTECTED]>
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -394,6 +394,8 @@ enum
 	NET_TCP_CONG_CONTROL=110,
 	NET_TCP_ABC=111,
 	NET_IPV4_IPFRAG_MAX_DIST=112,
+ 	NET_TCP_MTU_PROBING=113,
+	NET_TCP_BASE_MSS=114,
 };
 
 enum {
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -72,6 +72,7 @@ struct inet_connection_sock_af_ops {
  * @icsk_probes_out:	   unanswered 0 window probes
  * @icsk_ext_hdr_len:	   Network protocol overhead (IP/IPv6 options)
  * @icsk_ack:		   Delayed ACK control data
+ * @icsk_mtup:		   MTU probing control data
  */
 struct inet_connection_sock {
 	/* inet_sock has to be the first member! */
@@ -104,6 +105,18 @@ struct inet_connection_sock {
 		__u16		  last_seg_size; /* Size of last incoming segment	   */
 		__u16		  rcv_mss;	 /* MSS used for delayed ACK decisions	   */ 
 	} icsk_ack;
+	struct {
+		int		  enabled;
+		
+		/* Range of MTUs to search */
+		int		  search_high;
+		int		  search_low;
+		
+		/* Information on the current probe. */
+		int		  probe_size;
+		__u32		  probe_seq_start;
+		__u32		  probe_seq_end;
+	} icsk_mtup;
 	u32			  icsk_ca_priv[16];
 #define ICSK_CA_PRIV_SIZE	(16 * sizeof(u32))
 };
diff --git a/include/net/tcp.h b/include/net/tcp.h
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -60,6 +60,9 @@ extern void tcp_time_wait(struct sock *s
 /* Minimal RCV_MSS. */
 #define TCP_MIN_RCVMSS		536U
 
+/* The least MTU to use for probing */
+#define TCP_BASE_MSS		512
+
 /* After receiving this amount of duplicate ACKs fast retransmit starts. */
 #define TCP_FASTRETRANS_THRESH 3
 
@@ -219,6 +222,8 @@ extern int sysctl_tcp_nometrics_save;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
 extern int sysctl_tcp_abc;
+extern int sysctl_tcp_mtu_probing;
+extern int sysctl_tcp_base_mss;
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
@@ -447,6 +452,10 @@ extern int tcp_read_sock(struct sock *sk
 
 extern void tcp_initialize_rcv_mss(struct sock *sk);
 
+extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
+extern int tcp_mss_to_mtu(struct sock *sk, int mss);
+extern void tcp_mtup_init(struct sock *sk);
+
 static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
 {
 	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -664,6 +664,22 @@ ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= NET_TCP_MTU_PROBING,
+		.procname	= "tcp_mtu_probing",
+		.data		= &sysctl_tcp_mtu_probing,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_BASE_MSS,
+		.procname	= "tcp_base_mss",
+		.data		= &sysctl_tcp_base_mss,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 
 	{ .ctl_name = 0 }
 };
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1890,6 +1890,34 @@ static void tcp_try_to_open(struct sock 
 	}
 }
 
+static void tcp_mtup_probe_failed(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	
+	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
+	icsk->icsk_mtup.probe_size = 0;
+}
+
+static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	
+	/* FIXME: breaks with very large cwnd */
+	tp->prior_ssthresh = tcp_current_ssthresh(sk);
+	tp->snd_cwnd = tp->snd_cwnd *
+		   tcp_mss_to_mtu(sk, tp->mss_cache) /
+		   icsk->icsk_mtup.probe_size;
+	tp->snd_cwnd_cnt = 0;
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+	tp->rcv_ssthresh = tcp_current_ssthresh(sk);
+	
+	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
+	icsk->icsk_mtup.probe_size = 0;
+	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+}
+
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
@@ -2022,6 +2050,17 @@ tcp_fastretrans_alert(struct sock *sk, u
 			return;
 		}
 
+		/* MTU probe failure: don't reduce cwnd */
+		if (icsk->icsk_ca_state < TCP_CA_CWR &&
+		icsk->icsk_mtup.probe_size &&
+		tp->snd_una == icsk->icsk_mtup.probe_seq_start) {
+			tcp_mtup_probe_failed(sk);
+			/* Restores the redu

instrumentation for TCP MTU probing

2006-02-02 Thread John Heffner

This is a patch which adds instrumentation to the TCP MTU probing code.  It breaks 
netstat -s because it extends the line length of /proc/net/netstat past 1024 
characters, so it is not safe to apply.  I'm sending it for reference only at 
this point.

  -John

diff --git a/include/linux/snmp.h b/include/linux/snmp.h
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -260,6 +260,12 @@ enum
 	LINUX_MIB_TCPABORTONLINGER,		/* TCPAbortOnLinger */
 	LINUX_MIB_TCPABORTFAILED,		/* TCPAbortFailed */
 	LINUX_MIB_TCPMEMORYPRESSURES,		/* TCPMemoryPressures */
+	LINUX_MIB_TCPMTUPROBESTALLS,		/* TCPMTUProbeStalls */
+	LINUX_MIB_TCPMTUPROBEFAILS,		/* TCPMTUProbeFails */
+	LINUX_MIB_TCPMTUPROBESUCCEEDS,		/* TCPMTUProbeSucceeds */
+	LINUX_MIB_TCPMTUPROBEINCONCLUSIVES,	/* TCPMTUProbeInconclusives */
+	LINUX_MIB_TCPMTUBLACKHOLEENABLES,	/* TCPMTUBlackholeEnables */
+	LINUX_MIB_TCPMTUBLACKHOLEREDUCTIONS,	/* TCPMTUBlackholeReductions */
 	__LINUX_MIB_MAX
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -242,6 +242,12 @@ static const struct snmp_mib snmp4_net_l
 	SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER),
 	SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED),
 	SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES),
+	SNMP_MIB_ITEM("TCPMTUProbeStalls", LINUX_MIB_TCPMTUPROBESTALLS),
+	SNMP_MIB_ITEM("TCPMTUProbeFails", LINUX_MIB_TCPMTUPROBEFAILS),
+	SNMP_MIB_ITEM("TCPMTUProbeSucceeds", LINUX_MIB_TCPMTUPROBESUCCEEDS),
+	SNMP_MIB_ITEM("TCPMTUProbeInconclusives", LINUX_MIB_TCPMTUPROBEINCONCLUSIVES),
+	SNMP_MIB_ITEM("TCPMTUBlackholeEnables", LINUX_MIB_TCPMTUBLACKHOLEENABLES),
+	SNMP_MIB_ITEM("TCPMTUBlackholeReductions", LINUX_MIB_TCPMTUBLACKHOLEREDUCTIONS),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1894,6 +1894,7 @@ static void tcp_mtup_probe_failed(struct
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	
+	NET_INC_STATS_BH(LINUX_MIB_TCPMTUPROBEFAILS);
 	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
 	icsk->icsk_mtup.probe_size = 0;
 }
@@ -1903,6 +1904,7 @@ static void tcp_mtup_probe_success(struc
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	
+	NET_INC_STATS_BH(LINUX_MIB_TCPMTUPROBESUCCEEDS);
 	/* FIXME: breaks with very large cwnd */
 	tp->prior_ssthresh = tcp_current_ssthresh(sk);
 	tp->snd_cwnd = tp->snd_cwnd *
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1270,6 +1270,7 @@ static int tcp_write_xmit(struct sock *s
 	
 	/* Do MTU probing. */
 	if ((result = tcp_mtu_probe(sk)) == 0) {
+		NET_INC_STATS_BH(LINUX_MIB_TCPMTUPROBESTALLS);
 		return 0;
 	} else if (result > 0) {
 		sent_pkts = 1;
@@ -1651,6 +1652,7 @@ int tcp_retransmit_skb(struct sock *sk, 
 	
 	/* Inconslusive MTU probe */
 	if (icsk->icsk_mtup.probe_size) {
+		NET_INC_STATS_BH(LINUX_MIB_TCPMTUPROBEINCONCLUSIVES);
 		icsk->icsk_mtup.probe_size = 0;
 	}
 	
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -133,9 +133,11 @@ static int tcp_write_timeout(struct sock
 			/* Black hole detection */
 			if (sysctl_tcp_mtu_probing) {
 if (!icsk->icsk_mtup.enabled) {
+	NET_INC_STATS_BH(LINUX_MIB_TCPMTUBLACKHOLEENABLES);
 	icsk->icsk_mtup.enabled = 1;
 	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 } else {
+	NET_INC_STATS_BH(LINUX_MIB_TCPMTUBLACKHOLEREDUCTIONS);
 	mss = min(sysctl_tcp_base_mss,
 	  tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)/2);
 	mss = max(mss, 68 - tp->tcp_header_len);

Re: Van Jacobson net channels

2006-02-02 Thread Rick Jones


Andi Kleen wrote:

On Thursday 02 February 2006 08:31, Greg Banks wrote:



The tg3 driver uses small hardcoded values for the RXCOL_TICKS
and RXMAX_FRAMES registers, and allows "ethtool -C" to change
them.  SGI's solution is to ship a script that uses ethtool
at boot to tune rx-usecs, rx-frames, rx-usecs-irq, rx-frames-irq
up from the defaults.



All user tuning like this is bad. The stack should do all of that automatically.
Would there be a drawback to making these settings the default?


Larger settings (even the defaults) of the coalescing parms, while giving decent 
CPU utilization for a bulk transfer and better CPU utilization for a large 
aggregate workload, seem to mean bad things for minimizing latency.


The "presentation" needs work but the data in:

ftp://ftp.cup.hp.com/dist/networking/briefs/nic_latency_vs_tput.txt

should show some of that.  The current executive summary:

Executive Summary:

By default, the e1000 driver used in conjunction with the A9900A PCI-X
Dual-port Gigabit Ethernet adaptor strongly favors maximum packet per
second throughput over minimum request/response latency.  Anyone
desiring lowest possible request/response latency needs to alter the
modprobe parameters used when the e1000 driver is loaded.  This
appears to reduce round-trip latency by as much as 85%.

However, configuring the A9900A PCI-X Dual-port Gigabit Ethernet
adaptor for minimum request/response latency will reduce maximum
packet per second performance (as measured with the netperf TCP_RR
test) by ~23% and increase the service demand for bulk data transfer
by ~63% for sending and ~145% for receiving.


There is also some data in there for tg3 and for xframe I (but with a rather 
behind-the-times driver; I'm still trying to get cycles to run with a newer driver).



This helps a lot, and we're very grateful ;-)   But a scheme
which used the interrupt mitigation hardware dynamically based on
load could reduce the irq rate and CPU usage even further without
compromising latency at low load.



If you know what's needed perhaps you could investigate it?


I'm guessing that any automagic interrupt mitigation scheme might want to know 
what it wants to enable for the single-stream TCP_RR transaction/s as the base 
pps before it starts holding-off interrupts.  Even then however, the ability for 
the user to override needs to remain because there may be a workload that wants 
that PPS rate, but isn't concerned about the latency, only the CPU utilization 
and so indeed wants the interrupts mitigated.  So it would seem that an 
automagic coalescing might be an N% solution, but I don't think it would be 
100%.  Question then becomes whether or not N is large enough to warrant it over 
 defaults+manual config.


rick jones
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] TCP MTU probing

2006-02-02 Thread John Heffner

Implementation of packetization layer path mtu discovery for TCP, based on the 
internet-draft currently found at 
.

Signed-off-by: John Heffner <[EMAIL PROTECTED]>

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -394,6 +394,8 @@ enum
 	NET_TCP_CONG_CONTROL=110,
 	NET_TCP_ABC=111,
 	NET_IPV4_IPFRAG_MAX_DIST=112,
+ 	NET_TCP_MTU_PROBING=113,
+	NET_TCP_BASE_MSS=114,
 };
 
 enum {
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -72,6 +72,7 @@ struct inet_connection_sock_af_ops {
  * @icsk_probes_out:	   unanswered 0 window probes
  * @icsk_ext_hdr_len:	   Network protocol overhead (IP/IPv6 options)
  * @icsk_ack:		   Delayed ACK control data
+ * @icsk_mtup;		   MTU probing control data
  */
 struct inet_connection_sock {
 	/* inet_sock has to be the first member! */
@@ -104,6 +105,18 @@ struct inet_connection_sock {
 		__u16		  last_seg_size; /* Size of last incoming segment	   */
 		__u16		  rcv_mss;	 /* MSS used for delayed ACK decisions	   */ 
 	} icsk_ack;
+	struct {
+		int		  enabled;
+		
+		/* Range of MTUs to search */
+		int		  search_high;
+		int		  search_low;
+		
+		/* Information on the current probe. */
+		int		  probe_size;
+		__u32		  probe_seq_start;
+		__u32		  probe_seq_end;
+	} icsk_mtup;
 	u32			  icsk_ca_priv[16];
 #define ICSK_CA_PRIV_SIZE	(16 * sizeof(u32))
 };
diff --git a/include/net/tcp.h b/include/net/tcp.h
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -60,6 +60,9 @@ extern void tcp_time_wait(struct sock *s
 /* Minimal RCV_MSS. */
 #define TCP_MIN_RCVMSS		536U
 
+/* The least MTU to use for probing */
+#define TCP_BASE_MSS		512
+
 /* After receiving this amount of duplicate ACKs fast retransmit starts. */
 #define TCP_FASTRETRANS_THRESH 3
 
@@ -219,6 +222,8 @@ extern int sysctl_tcp_nometrics_save;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
 extern int sysctl_tcp_abc;
+extern int sysctl_tcp_mtu_probing;
+extern int sysctl_tcp_base_mss;
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
@@ -447,6 +452,10 @@ extern int tcp_read_sock(struct sock *sk
 
 extern void tcp_initialize_rcv_mss(struct sock *sk);
 
+extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
+extern int tcp_mss_to_mtu(struct sock *sk, int mss);
+extern void tcp_mtup_init(struct sock *sk);
+
 static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
 {
 	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -664,6 +664,22 @@ ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= NET_TCP_MTU_PROBING,
+		.procname	= "tcp_mtu_probing",
+		.data		= &sysctl_tcp_mtu_probing,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_BASE_MSS,
+		.procname	= "tcp_base_mss",
+		.data		= &sysctl_tcp_base_mss,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 
 	{ .ctl_name = 0 }
 };
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1890,6 +1890,34 @@ static void tcp_try_to_open(struct sock 
 	}
 }
 
+static void tcp_mtup_probe_failed(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	
+	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
+	icsk->icsk_mtup.probe_size = 0;
+}
+
+static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	
+	/* FIXME: breaks with very large cwnd */
+	tp->prior_ssthresh = tcp_current_ssthresh(sk);
+	tp->snd_cwnd = tp->snd_cwnd *
+		   tcp_mss_to_mtu(sk, tp->mss_cache) /
+		   icsk->icsk_mtup.probe_size;
+	tp->snd_cwnd_cnt = 0;
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+	tp->rcv_ssthresh = tcp_current_ssthresh(sk);
+	
+	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
+	icsk->icsk_mtup.probe_size = 0;
+	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+}
+
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
@@ -2022,6 +2050,17 @@ tcp_fastretrans_alert(struct sock *sk, u
 			return;
 		}
 
+		/* MTU probe failure: don't reduce cwnd */
+		if (icsk->icsk_ca_state < TCP_CA_CWR &&
+		icsk->icsk_mtup.probe_size &&
+		tp->snd_una == icsk->icsk_mtup.probe_seq_start) {
+			tcp_mtup_probe_failed(sk);
+			/* Restores the reduction we did in tcp_mtup_probe() */
+			tp->snd_cwnd++;
+			tcp_simple_retransmit(sk);
+			return

RE: Van Jacobson net channels

2006-02-02 Thread Robert Olsson

Leonid Grossman writes:

 > Right. Interrupt moderation is done on per channel basis. 
 > The only addition to the current NAPI mechanism I'd like to see is to
 > have NAPI setting desired interrupt rate (once interrupts are ON),
 > rather than use an interrupt per packet or a driver default. Arguably,
 > NAPI can figure out desired interrupt rate a bit better than a driver
 > can.

 In the current scheme a driver can easily use a dynamic interrupt scheme;
 in fact tulip has used this for years. At low rates there are no delays at 
 all; if it reaches some threshold it increases interrupt latency. It can be done 
 in several levels. The best threshold luckily seems to be just to count 
 the number of packets sitting in the RX ring when ->poll is called. Jamal 
 experimented heavily with this and gave a talk at OLS 2000.

 Yes, if the net channel classifier runs in hardirq we get back to the livelock 
 situation sooner or later. IMO interrupts should just be a signal to 
 indicate work.

 Cheers.
--ro

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Van Jacobson net channels

2006-02-02 Thread Rick Jones


Oh you have TSO disabled?  That explains a lot.

Yes, it's been a bumpy road, and there are still some
e1000 lockups, but in general things should be smooth
these days.


I suspect that "these days" in kernel.org terms differs somewhat from "these 
days" in RH/SuSE/etc terms, hence TSO being disabled.


rick jones

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Van Jacobson net channels

2006-02-02 Thread Stephen Hemminger

On Wed, 01 Feb 2006 16:29:11 -0800 (PST)
"David S. Miller" <[EMAIL PROTECTED]> wrote:

> From: Stephen Hemminger <[EMAIL PROTECTED]>
> Date: Wed, 1 Feb 2006 16:12:14 -0800
> 
> > The bigger problem I see is scalability.  All those mmap rings have to
> > be pinned in memory to be useful. It's fine for a single smart application
> > per server environment, but in real world with many dumb thread monster
> > applications on a single server it will be really hard to get working.
> 
> This is no different from when the thread blocks and the receive queue
> fills up, and in order to absorb scheduling latency.  We already lock
> memory into the kernel for socket buffer memory as it is.  At least
> the mmap() ring buffer method is optimized and won't have all of the
> overhead for struct sk_buff and friends.  So we have the potential to
> lock down less memory not more.
> 
> This is just like when we started using BK or GIT for source
> management, everyone was against it and looking for holes while they
> tried to wrap their brains around the new concepts and ideas.  I guess
> it will take a while for people to understand all this new stuff, but
> we'll get there.

No, it just means we have to cover our bases and not regress while
moving forward.  Not that we never have any regressions ;=)

-- 
Stephen Hemminger <[EMAIL PROTECTED]>
OSDL http://developer.osdl.org/~shemminger
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Van Jacobson net channels and NIC channels

2006-02-02 Thread Andi Kleen

On Thursday 02 February 2006 17:27, Leonid Grossman wrote:

> By now we have submitted UFO, MSI-X and LRO patches.  The one item on
> the TODO list that we did not submit a full driver patch for is the
> "support for distributing receive processing across multiple CPUs (using
> NIC hw queues)", mainly because at present the feature can't be fully
> used by the host anyways.

Why are you saying it can't be used by the host? The stack should
be fully ready for it.

The only small piece missing is a way to set the IRQ affinity from the kernel,
but that can be simulated from user space by tweaking them in /proc. If you 
have a prototype patch, adding the kernel interfaces wouldn't be that hard 
either.

Also how about per CPU TX completion interrupts? 

-Andi
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Kernel BUG at drivers/net/tg3.c:2914 on SMP amd64

2006-02-02 Thread Michael Chan

On Thu, 2006-02-02 at 13:37 +, Mike Crowe wrote:
> I'm running the Debian 2.6.15 kernel from backports.org on a machine
> with two Opteron 275s. I am getting a BUG in tg3.c quite reliably if I
> ping flood the machine from a few others and cause a bit of other
> network activity. Sometimes it takes a few minutes, sometimes half an
> hour. The BUG also fires in more realistic situations - it just takes
> longer to reproduce.

Most likely due to MMIO being re-ordered. We've seen this on a number of
AMD machines.

Please try this test patch below. If the problem goes away, send me the
output of lspci -vvvxxx on your machine and I'll create a patch to fix
this automatically on your machine. Thanks.

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index f2d1daf..de456ae 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -9557,6 +9557,9 @@ static int __devinit tg3_get_invariants(
!(tp->tg3_flags2 & TG3_FLG2_PCI_EXPRESS))
tp->tg3_flags |= TG3_FLAG_MBOX_WRITE_REORDER;
 
+   /* test patch to unconditionally set the flag */
+   tp->tg3_flags |= TG3_FLAG_MBOX_WRITE_REORDER;
+
if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5703 &&
tp->pci_lat_timer < 64) {
tp->pci_lat_timer = 64;


-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/2] wireless/atmel: fix setting TX key only in ENCODEEXT

2006-02-02 Thread Dan Williams

The previous patch that added ENCODEEXT and AUTH support to the atmel
driver contained a slight error which would cause just setting the TX
key index to also set the encryption key again.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>


--- a/drivers/net/wireless/atmel.c  2006-02-02 00:58:44.0 -0500
+++ b/drivers/net/wireless/atmel.c  2006-02-02 10:47:15.0 -0500
@@ -1834,7 +1834,7 @@
struct atmel_private *priv = netdev_priv(dev);
struct iw_point *encoding = &wrqu->encoding;
struct iw_encode_ext *ext = (struct iw_encode_ext *)extra;
-   int idx, key_len;
+   int idx, key_len, alg = ext->alg;
 
/* Determine and validate the key index */
idx = encoding->flags & IW_ENCODE_INDEX;
@@ -1845,39 +1845,39 @@
} else
idx = priv->default_key;
 
-   if ((encoding->flags & IW_ENCODE_DISABLED) ||
-   ext->alg == IW_ENCODE_ALG_NONE) {
-   priv->wep_is_on = 0;
-   priv->encryption_level = 0;
-   priv->pairwise_cipher_suite = CIPHER_SUITE_NONE;
-   }
+   if (encoding->flags & IW_ENCODE_DISABLED)
+   alg = IW_ENCODE_ALG_NONE;
 
-   if (ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY)
+   if (ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY) {
priv->default_key = idx;
-
-   /* Set the requested key first */
-   switch (ext->alg) {
-   case IW_ENCODE_ALG_NONE:
-   break;
-   case IW_ENCODE_ALG_WEP:
-   if (ext->key_len > 5) {
-   priv->wep_key_len[idx] = 13;
-   priv->pairwise_cipher_suite = CIPHER_SUITE_WEP_128;
-   priv->encryption_level = 2;
-   } else if (ext->key_len > 0) {
-   priv->wep_key_len[idx] = 5;
-   priv->pairwise_cipher_suite = CIPHER_SUITE_WEP_64;
-   priv->encryption_level = 1;
-   } else {
+   } else {
+   /* Set the requested key first */
+   switch (alg) {
+   case IW_ENCODE_ALG_NONE:
+   priv->wep_is_on = 0;
+   priv->encryption_level = 0;
+   priv->pairwise_cipher_suite = CIPHER_SUITE_NONE;
+   break;
+   case IW_ENCODE_ALG_WEP:
+   if (ext->key_len > 5) {
+   priv->wep_key_len[idx] = 13;
+   priv->pairwise_cipher_suite = 
CIPHER_SUITE_WEP_128;
+   priv->encryption_level = 2;
+   } else if (ext->key_len > 0) {
+   priv->wep_key_len[idx] = 5;
+   priv->pairwise_cipher_suite = 
CIPHER_SUITE_WEP_64;
+   priv->encryption_level = 1;
+   } else {
+   return -EINVAL;
+   }
+   priv->wep_is_on = 1;
+   memset(priv->wep_keys[idx], 0, 13);
+   key_len = min ((int)ext->key_len, 
priv->wep_key_len[idx]);
+   memcpy(priv->wep_keys[idx], ext->key, key_len);
+   break;
+   default:
return -EINVAL;
}
-   priv->wep_is_on = 1;
-   memset(priv->wep_keys[idx], 0, 13);
-   key_len = min ((int)ext->key_len, priv->wep_key_len[idx]);
-   memcpy(priv->wep_keys[idx], ext->key, key_len);
-   break;
-   default:
-   return -EINVAL;
}
 
return -EINPROGRESS;


-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

RE: Van Jacobson net channels

2006-02-02 Thread Leonid Grossman

 

> -Original Message-
> From: Eric W. Biederman [mailto:[EMAIL PROTECTED] 


> How do you classify channels?

Multiple rx steering criteria are available, for example tcp tuple (or
subset) hash, direct tcp tuple (or subset) match, MAC address, pkt size,
vlan tag, QOS bits, etc.

> 
> If your channels can map directly to the Van Jacobson 
> channels then when the kernel starts using them, it sounds 
> like the ideal strategy is to use the current NAPI algorithm 
> of disabling interrupts (on a per channel basis, assuming 
> MSI-X here) until that channel gets caught up. Then enable 
> interrupts again.

Right. Interrupt moderation is done on per channel basis. 
The only addition to the current NAPI mechanism I'd like to see is to
have NAPI setting desired interrupt rate (once interrupts are ON),
rather than use an interrupt per packet or a driver default. Arguably,
NAPI can figure out desired interrupt rate a bit better than a driver
can.

> 
> I wonder if someone could make that the default policy in their NICs?

Some NICs can support this today.
If there is a consensus on a channel-aware NIC driver interface
(including interrupt mgmt per channel), this will become a default NIC
implementation. Over time, NIC development is always driven by the
OS/stack requirements.

> 
> Eric
> 
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/2] wireless/atmel: fix authentication process bugs

2006-02-02 Thread Dan Williams

This patch fixes a number of bugs in the authentication process:

1) When falling back to Shared Key authentication mode from Open System,
a missing 'return' would cause the auth request to be sent, but would
drop the card into Management Error state.  When falling back, the
driver should also indicate that it is switching to Shared Key mode by
setting exclude_unencrypted.

2) Initial authentication modes were apparently wrong in some cases,
causing the driver to attempt Shared Key authentication mode when in
fact the access point didn't support that mode or even had WEP disabled.
The driver should set the correct initial authentication mode based on
wep_is_on and exclude_unencrypted.

3) Authentication response packets from the access point in Open System
mode were getting ignored because the driver was expecting the sequence
number of a Shared Key mode response.  The patch separates the OS and SK
mode handling to provide the correct behavior.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>


--- a/drivers/net/wireless/atmel.c  2006-02-02 00:58:44.0 -0500
+++ b/drivers/net/wireless/atmel.c  2006-02-02 10:47:15.0 -0500
@@ -3023,17 +3023,26 @@
}
 
if (status == WLAN_STATUS_SUCCESS && priv->wep_is_on) {
+   int should_associate = 0;
/* WEP */
if (trans_seq_no != priv->ExpectedAuthentTransactionSeqNum)
return;
 
-   if (trans_seq_no == 0x0002 &&
-   auth->el_id == MFIE_TYPE_CHALLENGE) {
-   send_authentication_request(priv, system, 
auth->chall_text, auth->chall_text_len);
-   return;
+   if (system == WLAN_AUTH_OPEN) {
+   if (trans_seq_no == 0x0002) {
+   should_associate = 1;
+   }
+   } else if (system == WLAN_AUTH_SHARED_KEY) {
+   if (trans_seq_no == 0x0002 &&
+   auth->el_id == MFIE_TYPE_CHALLENGE) {
+   send_authentication_request(priv, system, 
auth->chall_text, auth->chall_text_len);
+   return;
+   } else if (trans_seq_no == 0x0004) {
+   should_associate = 1;
+   }
}
 
-   if (trans_seq_no == 0x0004) {
+   if (should_associate) {
if(priv->station_was_associated) {
atmel_enter_state(priv, 
STATION_STATE_REASSOCIATING);
send_association_request(priv, 1);
@@ -3048,9 +3057,11 @@
 
if (status == WLAN_STATUS_NOT_SUPPORTED_AUTH_ALG) {
/* Do opensystem first, then try sharedkey */
-   if (system ==  WLAN_AUTH_OPEN) {
+   if (system == WLAN_AUTH_OPEN) {
priv->CurrentAuthentTransactionSeqNum = 0x001;
+   priv->exclude_unencrypted = 1;
send_authentication_request(priv, WLAN_AUTH_SHARED_KEY, 
NULL, 0);
+   return;
} else if (priv->connect_to_any_BSS) {
int bss_index;
 
@@ -3401,10 +3412,13 @@
priv->AuthenticationRequestRetryCnt = 0;
restart_search(priv);
} else {
+   int auth = WLAN_AUTH_OPEN;
priv->AuthenticationRequestRetryCnt++;
priv->CurrentAuthentTransactionSeqNum = 0x0001;
mod_timer(&priv->management_timer, jiffies + 
MGMT_JIFFIES);
-   send_authentication_request(priv, WLAN_AUTH_OPEN, NULL, 
0);
+   if (priv->wep_is_on && priv->exclude_unencrypted)
+   auth = WLAN_AUTH_SHARED_KEY;
+   send_authentication_request(priv, auth, NULL, 0);
  }
  break;
 
@@ -3503,12 +3517,15 @@
priv->station_was_associated = 
priv->station_is_associated;
atmel_enter_state(priv, STATION_STATE_READY);
} else {
+   int auth = WLAN_AUTH_OPEN;
priv->AuthenticationRequestRetryCnt = 0;
atmel_enter_state(priv, 
STATION_STATE_AUTHENTICATING);
 
mod_timer(&priv->management_timer, jiffies + 
MGMT_JIFFIES);
priv->CurrentAuthentTransactionSeqNum = 0x0001;
-   send_authentication_request(priv, 
WLAN_AUTH_SHARED_KEY, NULL, 0);
+   if (priv->wep_is_on && 
priv->exclude_unencrypted)
+   auth = WLAN_AUTH_SHARED_KEY;
+   send_authentication_request(priv, auth, NULL, 
0);
}

[PATCH 0/2] wireless/atmel: WEXT and authentication fixes

2006-02-02 Thread Dan Williams

These two patches fix bugs in the Atmel driver's handling of WEXT and
authentication.  The first fixes setting of the TX key only for the
ENCODEEXT functionality I added last week.  The second fixes some bugs
in the authentication process for both WEP and non-WEP configurations.

Summary:

[PATCH 1/2] wireless/atmel: fix setting TX key only in ENCODEEXT
[PATCH 2/2] wireless/atmel: fix authentication process bugs

Thanks,
Dan


-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Van Jacobson net channels and NIC channels

2006-02-02 Thread Leonid Grossman

Thanks to Andi, Dave, Jeff and everyone who responded to the original
query; I've got enough pointers to presentations, blogs and ideas to
keep me busy for a while :-)

VJ channels indeed seem to complement and take to a different level some
sw and hw ideas on Dave's TODO list.

By now we have submitted UFO, MSI-X and LRO patches.  The one item on
the TODO list that we did not submit a full driver patch for is the
"support for distributing receive processing across multiple CPUs (using
NIC hw queues)", mainly because at present the feature can't be fully
used by the host anyways.

If some channel-oriented changes to the stack/NAPI do converge, it may
be worthwhile to extend the interface to the driver/hw channels as well.


To this end, we can submit a proposal and a reference implementation for
the driver interface that would support driver/hw channels (one per
cpu). Some of the channel capabilities will be as follows:

- configurable subset of tx/rx descriptors per channel
- separate msi-x interrupt(s) per channel 
- separate interrupt moderation scheme per channel, potentially driven
by NAPI.
- rx traffic classification; we will probably start from couple relevant
types like hash-based and direct socket match based.

If nothing else, this implementation will allow to collect some
performance data on hw channel acceleration and see what works and what
doesn't :-)

As a second stage, we can allow the driver channels to be used as a
separate eth interface (using multiple MAC addresses and MAC-based rx
classification), for virtualization frameworks. 

Stay tuned; any comments/suggestions are welcome
Leonid
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Van Jacobson net channels

2006-02-02 Thread Stephen Hemminger

On Thu, 02 Feb 2006 08:35:28 -0700
[EMAIL PROTECTED] (Eric W. Biederman) wrote:

> "Christopher Friesen" <[EMAIL PROTECTED]> writes:
> 
> > Eric W. Biederman wrote:
> >> Jeff Garzik <[EMAIL PROTECTED]> writes:
> >
> >>> This was discussed on the netdev list, and the conclusion was that
> >>> you want both NAPI and hw mitigation.  This was implemented in a
> >>> few drivers, at least.
> >
> >> How does that deal with the latency that hw mitigation introduces. When you
> >> have a workload that bottle-necked waiting for that next
> >> packet and hw mitigation is turned on  you can see some horrible
> >> unjustified slow downs.
> >
> > Presumably at low traffic you would disable hardware mitigation to get the 
> > best
> > possible latency.  As traffic ramps up you tune the hardware mitigation
> > appropriately.  At high traffic loads, you end up with full hardware 
> > mitigation,
> > but you have enough packets coming in that the latency is minimal.
> 
> The evil but real work load is when you have a high volume of dependent 
> traffic.
> RPC calls or MPI collectives are cases where you are likely to see this.
> 
> Or even in TCP there is an element that once you hit your window limit you 
> won't
> send more traffic until you get your ack.  But if you don't get your ack
> because the interrupt is mitigated.
> 
> NAPI handles this beautifully.  It disables the interrupts until it knows it
> needs to process more packets.  Then when it is just waiting around for
> packets from that card it enables interrupts on that card.

Also, NAPI handles the case where receiver is getting DoS or overrun with 
packets,
and you want the hardware to send flow control. Without NAPI it is easy to get
stuck only processing packets and nothing else.

I hope the VJ channels code has receive flow control.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Van Jacobson net channels

2006-02-02 Thread Eric W. Biederman

"Leonid Grossman" <[EMAIL PROTECTED]> writes:

> There two facilities (at least, in our ASIC, but there is no reason this
> can't be part of the generic multi-channel driver interface that I will
> get to shortly) to deal with it.
>
> - hardware supports more than one utilization-based interrupt rate (we
> have four). For lowest utilization range, we always set interrupt rate
> to one interrupt for every rx packet - exactly for the latency reasons
> that you are bringing up. Also, cpu is not busy anyways so extra
> interrupts do not hurt much. For highest utilization range, we set the
> rate by default to something like an interrupt per 128 packets. There is
> also timer-based interrupt, as a last resort option.
> As I mentioned earlier, it would be cool to get these moderation
> tresholds from NAPI, since it can make a better guess about the overall
> system utilization than the driver can. But even at the driver level,
> this works reasonably well.
>
> - the moderation scheme is implemented in the ASIC on per channel basis.
> So, if you have workloads with very distinct latency needs, you can just
> steer it to a separate channel and have an interrupt moderation that is
> different from other flows, for example keep an interrupt per packet
> always.

How do you classify channels?

If your channels can map directly to the Van Jacobson channels then
when the kernel starts using them, it sounds like the ideal strategy is
to use the current NAPI algorithm of disabling interrupts (on a per
channel basis, assuming MSI-X here) until that channel gets caught up.
Then enable interrupts again.

I wonder if someone could make that the default policy in their NICs?

Eric
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Van Jacobson net channels

2006-02-02 Thread Eric W. Biederman

"Christopher Friesen" <[EMAIL PROTECTED]> writes:

> Eric W. Biederman wrote:
>> Jeff Garzik <[EMAIL PROTECTED]> writes:
>
>>> This was discussed on the netdev list, and the conclusion was that
>>> you want both NAPI and hw mitigation.  This was implemented in a
>>> few drivers, at least.
>
>> How does that deal with the latency that hw mitigation introduces. When you
>> have a workload that bottle-necked waiting for that next
>> packet and hw mitigation is turned on  you can see some horrible
>> unjustified slow downs.
>
> Presumably at low traffic you would disable hardware mitigation to get the 
> best
> possible latency.  As traffic ramps up you tune the hardware mitigation
> appropriately.  At high traffic loads, you end up with full hardware 
> mitigation,
> but you have enough packets coming in that the latency is minimal.

The evil but real work load is when you have a high volume of dependent traffic.
RPC calls or MPI collectives are cases where you are likely to see this.

Or even in TCP there is an element that once you hit your window limit you won't
send more traffic until you get your ack.  But if you don't get your ack
because the interrupt is mitigated.

NAPI handles this beautifully.  It disables the interrupts until it knows it
needs to process more packets.  Then when it is just waiting around for
packets from that card it enables interrupts on that card.

Eric
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

RE: Van Jacobson net channels

2006-02-02 Thread Leonid Grossman

 

> -Original Message-
> From: Andi Kleen [mailto:[EMAIL PROTECTED] 

> > You just need to make sure that you don't leak data from 
> other peoples 
> > sockets.
> 
> There are three basic ways I can see to do this:
> 
> -  You have really advanced hardware which can potentially 
> manage tens of thousands of hardware queues with full 
> classification down to the ports. Then everything is great. 
> But who has such hardware?
> Perhaps Leonid will do it, but I expect the majority of Linux 
> users to not have access to it in the forseeable time. Also 
> even with the advanced hardware that can handle e.g. 50k 
> sockets what happens when you need 100k for some extreme situation?
> 
> -Andi

You may be surprised here :-) iWARP (RDMA over Ethernet) received a lot
of funding and industry support over last several years, and rNIC
development is already pre-announced by multiple vendors not just us.
 
I expect RDMA deployment to be a long and bumpy multi-year road, since
protocols and applications will need to change to take full advantage of
it. And this is a discussion for a totally separate thread anyways :-)

But in the meantime, these new ethernet adapters will have huge number
of hw queue pairs (AKA channels), and at least some of the NICs will
have these channels at no incremental cost to the hardware. You may be
able to use the channels for full socket traffic classification if
nothing else, and defer the rest of rNIC functionality until the iWARP
infrastructure is mature. 

This is actually one of many reasons why VJ net channels and related
ideas look very promising - we can "extend" it to the driver/hw level
with the current NICs that have at least one channel per cpu, with a
good chance that the next wave of hardware will support many more
channels and will take advantage of the stack/NAPI improvements.  
Leonid
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Van Jacobson net channels

2006-02-02 Thread Christopher Friesen


Eric W. Biederman wrote:

Jeff Garzik <[EMAIL PROTECTED]> writes:



This was discussed on the netdev list, and the conclusion was that
you want both NAPI and hw mitigation.  This was implemented in a
few drivers, at least.


How does that deal with the latency that hw mitigation introduces. 
When you have a workload that bottle-necked waiting for that next

packet and hw mitigation is turned on  you can see some horrible
unjustified slow downs.


Presumably at low traffic you would disable hardware mitigation to get 
the best possible latency.  As traffic ramps up you tune the hardware 
mitigation appropriately.  At high traffic loads, you end up with full 
hardware mitigation, but you have enough packets coming in that the 
latency is minimal.


Chris
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

RE: Van Jacobson net channels

2006-02-02 Thread Leonid Grossman

 

> -Original Message-
> From: Eric W. Biederman [mailto:[EMAIL PROTECTED] 
> Sent: Thursday, February 02, 2006 4:29 AM
> To: Jeff Garzik
> Cc: Andi Kleen; Greg Banks; David S. Miller; Leonid Grossman; 
> [EMAIL PROTECTED]; Linux Network Development list
> Subject: Re: Van Jacobson net channels
> 
> Jeff Garzik <[EMAIL PROTECTED]> writes:
> 
> > Andi Kleen wrote:
> >> There was already talk some time ago to make NAPI drivers use the 
> >> hardware mitigation again. The reason is when you have
> >
> >
> > This was discussed on the netdev list, and the conclusion 
> was that you 
> > want both NAPI and hw mitigation.  This was implemented in 
> a few drivers, at least.
> 
> How does that deal with the latency that hw mitigation introduces.
> When you have a workload that bottle-necked waiting for that 
> next packet and hw mitigation is turned on  you can see some 
> horrible unjustified slow downs.

There are two facilities (at least, in our ASIC, but there is no reason this
can't be part of the generic multi-channel driver interface that I will
get to shortly) to deal with it.

- hardware supports more than one utilization-based interrupt rate (we
have four). For lowest utilization range, we always set interrupt rate
to one interrupt for every rx packet - exactly for the latency reasons
that you are bringing up. Also, cpu is not busy anyways so extra
interrupts do not hurt much. For highest utilization range, we set the
rate by default to something like an interrupt per 128 packets. There is
also timer-based interrupt, as a last resort option.
As I mentioned earlier, it would be cool to get these moderation
thresholds from NAPI, since it can make a better guess about the overall
system utilization than the driver can. But even at the driver level,
this works reasonably well.

- the moderation scheme is implemented in the ASIC on per channel basis.
So, if you have workloads with very distinct latency needs, you can just
steer it to a separate channel and have an interrupt moderation that is
different from other flows, for example keep an interrupt per packet
always.

Leonid

> 
> Eric
> 
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Kernel BUG at drivers/net/tg3.c:2914 on SMP amd64

2006-02-02 Thread Mike Crowe

I'm running the Debian 2.6.15 kernel from backports.org on a machine
with two Opteron 275s. I am getting a BUG in tg3.c quite reliably if I
ping flood the machine from a few others and cause a bit of other
network activity. Sometimes it takes a few minutes, sometimes half an
hour. The BUG also fires in more realistic situations - it just takes
longer to reproduce.

The machine has a Tyan S2882-D motherboard with dual BCM5704C network
interfaces. Only one is in use.

The Debian 2.6.15 version of tg3.c seems to be identical to the one in
stock 2.6.15. It doesn't look like either of the two changes to tg3.c
in 2.6.16-rc1 are likely to be related to this.

I'm not subscribed to this list - please let me know if there is any
more information I can provide or if there are any patches you'd like
me to try.

TIA


Kernel BUG at drivers/net/tg3.c:2914
invalid operand:  [1] SMP
CPU 3
Modules linked in: ipt_REJECT ip_tables nfsd nfs lockd nfs_acl sunrpc
autofs4 ipv6 megaraid dm_mod ide_disk joydev psmouse evdev i2c_amd756
hw_random i2c_amd811 1 serio_raw i2c_core pcspkr shpchp pci_hotplug
xfs exportfs ide_cd cdrom ide_generic sd_mod generic e100
megaraid_mbox scsi_mod amd74xx ohci_hcd mii tg3 megarai d_mm ide_core
thermal processor fan

Pid: 0, comm: swapper Not tainted 2.6.15-1-amd64-k8-smp #1
RIP: 0010:[] {:tg3:tg3_tx+59}
RSP: 0018:8101044ffeb8  EFLAGS: 00010246
RAX: 0148 RBX: 8101fd93cec0 RCX: 0001
RDX: 8100f8bf8580 RSI: 81011000 RDI: 8101044dc6d0
RBP:  R08: 8100f7db21b8 R09: 0009
R10: 0001 R11: 0001 R12: 0148
R13: 8100f9fa0440 R14:  R15: 
FS:  2af15090() GS:8040a980() knlGS:
CS:  0010 DS: 0018 ES: 0018 CR0: 8005003b
CR2: 2aac CR3: 00101000 CR4: 06e0
Process swapper (pid: 0, threadinfo 81000c058000, task 81000c052100)
Stack: 81000c058000 8100f9f26000 8100f9fa0440 8100f9fa
   8101044fff2c  8101044fff88 88044e08
   8100f9fa 8100f9fa
Call Trace:  {:tg3:tg3_poll+98} 
{net_rx_action+172}
   {__do_softirq+80} {call_softirq+31}
   {do_softirq+47} {do_IRQ+50}
   {ret_from_intr+0}   
{thread_return+0}
   {default_idle+57} {cpu_idle+93}


Code: 0f 0b 68 98 f2 04 88 c2 62 0b 49 8b 4d 40 8b 95 80 00 00 00
RIP {:tg3:tg3_tx+59} RSP 
 <0>Kernel panic - not syncing: Aiee, killing interrupt handler!

--
Mike Crowe
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: How can I get local copy of your git tree?

2006-02-02 Thread John W. Linville

On Thu, Feb 02, 2006 at 12:16:03PM +0100, Jochen Friedrich wrote:
> Hi Denis,
> 
> >Hi John,
> ># cg-clone 
> >rsync://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git 
> >wireless-2.6.git
> >  
> >
> 
> the branch softmac has been renamed to softmac-all.
> 
> # cg-clone 
> "rsync://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git#softmac-all"
>  wireless-2.6.git

Yes, sorry about that!  I had meant to announce the branch renaming,
but it slipped... :-(

Basically, there are a number of driver branches that are
named *-softmac, and a branch for the softmac code itself named
softmac-upstream.  Those all get pulled into the softmac-all branch.
There is an analogous set of branches for the Devicescape code.

Finally, both softmac-all and dscape-all are pulled (along with
upstream) into the domesday branch.

Hth!

John

P.S.  The *-softmac and *-dscape branches and even the *-all branches
are mostly for my sanity.  Most people really only need to worry
about the domesday, upstream, upstream-fixes and/or master branches
(depending on your level of interest).
-- 
John W. Linville
[EMAIL PROTECTED]
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] ipv6: addrconf_ifdown fix dst refcounting.

2006-02-02 Thread Herbert Xu

On Thu, Feb 02, 2006 at 05:37:22AM -0700, Eric W. Biederman wrote:
>
> > Yes you are right.  The locking/refcounting in addrconf.c is such
> > a mess.  I've asked a number of times before as to why most of
> > this can't be done in user-space instead.  There is nothing performance
> > critical here, and the system must be able to deal with a device with
> > no IPv6 addresses anyway (think of the case when the device was up before
> > ipv6.ko was loaded). 
> 
> A lot of the latter case is handled by the replay of netdevice events
> when you register a netdevice notifier.

Yes.  What I meant is that it is normal to have a period of time during
which a device has no IPv6 addresses attached.  Doing addrconf in the
kernel means that we can guarantee that as soon as a device appears we
slap on an IPv6 address.  My point is that we need to cope with devices
without IPv6 addresses anyway.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 0/0] AES-XCBC-MAC

2006-02-02 Thread Herbert Xu

On Fri, Jan 27, 2006 at 09:17:24PM +0900, Kazunori Miyazawa wrote:
> 
> I resend following patches to the kernel support AES-XCBC-MAC.
> These patches can probably support another XCBC algorithms
> but I only tested with AES on the test vectors of RFC3566.

Thanks for the patch.  It turns out that I need to implement the
parameterised algorithms for the async crypto API anyway.  So if
it's not too much trouble I'd like to work on that over the next
couple of weeks and then integrate AES-XCBC-MAC as a parametrised
crypto algorithm.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] ipv6: addrconf_ifdown fix dst refcounting.

2006-02-02 Thread Eric W. Biederman

Herbert Xu <[EMAIL PROTECTED]> writes:

> On Fri, Jan 27, 2006 at 01:00:49AM -0700, Eric W. Biederman wrote:
>> 
>> However I do know I have correctly found the leak.
>
> Yes you are right.  The locking/refcounting in addrconf.c is such
> a mess.  I've asked a number of times before as to why most of
> this can't be done in user-space instead.  There is nothing performance
> critical here, and the system must be able to deal with a device with
> no IPv6 addresses anyway (think of the case when the device was up before
> ipv6.ko was loaded). 

A lot of the latter case is handled by the replay of netdevice events
when you register a netdevice notifier.

Eric
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Van Jacobson net channels

2006-02-02 Thread Eric W. Biederman

Jeff Garzik <[EMAIL PROTECTED]> writes:

> Andi Kleen wrote:
>> There was already talk some time ago to make NAPI drivers use
>> the hardware mitigation again. The reason is when you have
>
>
> This was discussed on the netdev list, and the conclusion was that you want 
> both
> NAPI and hw mitigation.  This was implemented in a few drivers, at least.

How does that deal with the latency that hw mitigation introduces.
When you have a workload that bottle-necked waiting for that next packet
and hw mitigation is turned on  you can see some horrible unjustified
slow downs.

Eric
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] Docs: documentation for new arp_accept sysctl variable

2006-02-02 Thread Neil Horman

As John pointed out, I had not added documentation to describe the arp_accept
sysctl that I posted in my last patch.  This patch adds that documentation.

Thanks & Regards
Neil

Signed-off-by: Neil Horman <[EMAIL PROTECTED]>

 ip-sysctl.txt |5 +
 1 files changed, 5 insertions(+)


diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -602,6 +602,11 @@ arp_ignore - INTEGER
The max value from conf/{all,interface}/arp_ignore is used
when ARP request is received on the {interface}
 
+arp_accept - BOOLEAN
+   Define behavior when gratuitous arp replies are received:
+   0 - drop gratuitous arp frames
+   1 - accept gratuitous arp frames 
+
 app_solicit - INTEGER
The maximum number of probes to send to the user space ARP daemon
via netlink before dropping back to multicast probes (see
-- 
/***
 *Neil Horman
 *Software Engineer
 *gpg keyid: 1024D / 0x92A74FA1 - http://pgp.mit.edu
 ***/
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] ipv6: addrconf_ifdown fix dst refcounting.

2006-02-02 Thread Herbert Xu

On Fri, Jan 27, 2006 at 01:00:49AM -0700, Eric W. Biederman wrote:
> 
> However I do know I have correctly found the leak.

Yes you are right.  The locking/refcounting in addrconf.c is such
a mess.  I've asked a number of times before as to why most of
this can't be done in user-space instead.  There is nothing performance
critical here, and the system must be able to deal with a device with
no IPv6 addresses anyway (think of the case when the device was up before
ipv6.ko was loaded).  I'm yet to hear a compelling reason.  Anyway,
that's enough ranting from me and here is a patch for your bug.

[IPV6]: Don't hold extra ref count in ipv6_ifa_notify

Currently the logic in ipv6_ifa_notify is to hold an extra reference
count for addrconf dst's that get added to the routing table.  Thus,
when addrconf dst entries are taken out of the routing table, we need
to drop that dst.  However, addrconf dst entries may be removed from
the routing table by means other than __ipv6_ifa_notify.

So we're faced with the choice of either fixing up all places where
addrconf dst entries are removed, or dropping the extra reference count
altogether.

I chose the latter because the ifp itself always holds a dst reference
count of 1 while it's alive.  This is dropped just before we kfree the
ifp object.  Therefore we know that in __ipv6_ifa_notify we will always
hold that count.

This bug was found by Eric W. Biederman.

Signed-off-by: Herbert Xu <[EMAIL PROTECTED]>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3321,9 +3321,7 @@ static void __ipv6_ifa_notify(int event,

switch (event) {
case RTM_NEWADDR:
-   dst_hold(&ifp->rt->u.dst);
-   if (ip6_ins_rt(ifp->rt, NULL, NULL, NULL))
-   dst_release(&ifp->rt->u.dst);
+   ip6_ins_rt(ifp->rt, NULL, NULL, NULL);
if (ifp->idev->cnf.forwarding)
addrconf_join_anycast(ifp);
break;
@@ -3334,8 +3332,6 @@ static void __ipv6_ifa_notify(int event,
dst_hold(&ifp->rt->u.dst);
if (ip6_del_rt(ifp->rt, NULL, NULL, NULL))
dst_free(&ifp->rt->u.dst);
-   else
-   dst_release(&ifp->rt->u.dst);
break;
}
 }

Re: How can I get local copy of your git tree?

2006-02-02 Thread Jochen Friedrich

Hi Denis,

>Hi John,
># cg-clone 
>rsync://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git 
>wireless-2.6.git
>  
>

the branch softmac has been renamed to softmac-all.

# cg-clone 
"rsync://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git#softmac-all"
 wireless-2.6.git

Thanks,
Jochen

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: How can I get local copy of your git tree?

2006-02-02 Thread Francois Romieu

Denis Vlasenko <[EMAIL PROTECTED]> :
[...]

git clone 
git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git 
some_dest_directory

Say I have some reference tree:

$ ls linux/linux-2.6-ref/.git
branches FETCH_HEADHEAD   info ORIG_HEAD  remotes
description  git-daemon-export-ok  index  objects  refs

$ cd $some_place
$ git clone -s /home/romieu/linux/linux-2.6-ref/.git linux-2.6-unstable
$ cd linux-2.6-unstable
$ cat > .git/remotes/wireless-2.6
$ cat .git/remotes/wireless-2.6

How can I get local copy of your git tree?

2006-02-02 Thread Denis Vlasenko

Hi John,

I want to work with your git tree, specifically, softmac branch.

After reading thru git and codito docs, I did:

# cg-clone 
rsync://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git 
wireless-2.6.git
defaulting to local storage area
WARNING: The rsync access method is DEPRECATED and will be REMOVED in the 
future!
Fetching head...
[snip]
Cloned to wireless-2.6.git/ (origin 
rsync://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git 
available as branch "origin")

# cg-branch-add r-softmac 
'rsync://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git#softmac'

# cg-fetch r-softmac
WARNING: The rsync access method is DEPRECATED and will be REMOVED in the 
future!
Fetching head...
MOTD:
MOTD:   Welcome to the Linux Kernel Archive.
MOTD:
MOTD:   Due to U.S. Exports Regulations, all cryptographic software on this
MOTD:   site is subject to the following legal notice:
MOTD:
MOTD:   This site includes publicly available encryption source code
MOTD:   which, together with object code resulting from the compiling of
MOTD:   publicly available source code, may be exported from the United
MOTD:   States under License Exception "TSU" pursuant to 15 C.F.R. Section
MOTD:   740.13(e).
MOTD:
MOTD:   This legal notice applies to cryptographic software only.
MOTD:   Please see the Bureau of Industry and Security,
MOTD:   http://www.bis.doc.gov/ for more information about current
MOTD:   U.S. regulations.
MOTD:

receiving file list ... done

sent 4 bytes  received 9 bytes  0.43 bytes/sec
total size is 0  speedup is 0.00
cg-fetch: unable to get the head pointer of branch softmac


I must be doing something wrong. Cluebat, anyone?
--
vda
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

97 matches

Mail list logo