[RFC/PATCH 3/3] UDP memory usage accounting (take 2): measurement

2007-09-28 Thread Satoshi OSHIMA
This patch introduces memory usage measurement for UDP.

These 3 points were updated.

- UDP-specific code in the IP layer was removed.

- atomic_sub() in a loop was removed

- accounting during socket destruction

signed-off-by: Satoshi Oshima [EMAIL PROTECTED]

signed-off-by: Hideo Aoki [EMAIL PROTECTED]

Index: 2.6.23-rc8-udp_limit/net/ipv4/ip_output.c

===

--- 2.6.23-rc8-udp_limit.orig/net/ipv4/ip_output.c

+++ 2.6.23-rc8-udp_limit/net/ipv4/ip_output.c

@@ -743,6 +743,8 @@ static inline int ip_ufo_append_data(str

 /* specify the length of each IP datagram fragment*/

 skb_shinfo(skb)-gso_size = mtu - fragheaderlen;

 skb_shinfo(skb)-gso_type = SKB_GSO_UDP;

+atomic_add(sk_datagram_pages(skb-truesize),

+   sk-sk_prot-memory_allocated);

 __skb_queue_tail(sk-sk_write_queue, skb);

 

 return 0;

@@ -924,6 +926,9 @@ alloc_new_skb:

 }

 if (skb == NULL)

 goto error;

+if (sk-sk_prot-memory_allocated)

+atomic_add(sk_datagram_pages(skb-truesize),

+   sk-sk_prot-memory_allocated);

 

 /*

  *Fill in the control structures

@@ -1023,6 +1028,8 @@ alloc_new_skb:

 frag = skb_shinfo(skb)-frags[i];

 skb-truesize += PAGE_SIZE;

 atomic_add(PAGE_SIZE, sk-sk_wmem_alloc);

+if (sk-sk_prot-memory_allocated)

+atomic_inc(sk-sk_prot-memory_allocated);

 } else {

 err = -EMSGSIZE;

 goto error;

@@ -1123,7 +1130,9 @@ ssize_tip_append_page(struct sock *sk, 

 if (unlikely(!skb)) {

 err = -ENOBUFS;

 goto error;

-}

+} else if (sk-sk_prot-memory_allocated)

+atomic_add(sk_datagram_pages(skb-truesize),

+   sk-sk_prot-memory_allocated);

 

 /*

  *Fill in the control structures

@@ -1152,6 +1161,8 @@ ssize_tip_append_page(struct sock *sk, 

 /*

  * Put the packet on the pending queue.

  */

+atomic_add(sk_datagram_pages(skb-truesize),

+   sk-sk_prot-memory_allocated);

 __skb_queue_tail(sk-sk_write_queue, skb);

 continue;

 }

@@ -1202,13 +1213,14 @@ int ip_push_pending_frames(struct sock *

 struct iphdr *iph;

 __be16 df = 0;

 __u8 ttl;

-int err = 0;

+int err = 0, send_page_size;

 

 if ((skb = __skb_dequeue(sk-sk_write_queue)) == NULL)

 goto out;

 tail_skb = (skb_shinfo(skb)-frag_list);

 

 /* move skb-data to ip header from ext header */

+send_page_size = sk_datagram_pages(skb-truesize);

 if (skb-data  skb_network_header(skb))

 __skb_pull(skb, skb_network_offset(skb));

 while ((tmp_skb = __skb_dequeue(sk-sk_write_queue)) != NULL) {

@@ -1218,6 +1230,7 @@ int ip_push_pending_frames(struct sock *

 skb-len += tmp_skb-len;

 skb-data_len += tmp_skb-len;

 skb-truesize += tmp_skb-truesize;

+send_page_size += sk_datagram_pages(tmp_skb-truesize);

 __sock_put(tmp_skb-sk);

 tmp_skb-destructor = NULL;

 tmp_skb-sk = NULL;

@@ -1269,6 +1282,8 @@ int ip_push_pending_frames(struct sock *

 /* Netfilter gets whole the not fragmented skb. */

 err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,

   skb-dst-dev, dst_output);

+if (sk-sk_prot-memory_allocated)

+atomic_sub(send_page_size, sk-sk_prot-memory_allocated);

 if (err) {

 if (err  0)

 err = inet-recverr ? net_xmit_errno(err) : 0;

@@ -1298,9 +1313,15 @@ void ip_flush_pending_frames(struct sock

 {

 struct inet_sock *inet = inet_sk(sk);

 struct sk_buff *skb;

+int num_flush_mem = 0;

 

-while ((skb = __skb_dequeue_tail(sk-sk_write_queue)) != NULL)

+while ((skb = __skb_dequeue_tail(sk-sk_write_queue)) != NULL) {

+num_flush_mem += sk_datagram_pages(skb-truesize);

 kfree_skb(skb);

+}

+

+if (sk-sk_prot-memory_allocated)

+atomic_sub(num_flush_mem, sk-sk_prot-memory_allocated);

 

 inet-cork.flags = ~IPCORK_OPT;

 kfree(inet-cork.opt);

Index: 2.6.23-rc8-udp_limit/net/ipv4/udp.c

===

--- 2.6.23-rc8-udp_limit.orig/net/ipv4/udp.c

+++ 2.6.23-rc8-udp_limit/net/ipv4/udp.c

@@ -887,6 +887,9 @@ try_again:

 err = ulen;

 

 out_free:

+atomic_sub(sk_datagram_pages(skb-truesize),

+   sk-sk_prot-memory_allocated);

+

 skb_free_datagram(sk, skb);

 out:

 return err;

@@ -894,6 +897,9 @@ out:

 csum_copy_err:

 UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite);

 

+atomic_sub(sk_datagram_pages(skb-truesize),

+   

Upgrading 2.6.21.7-2.6.22.9 kills my network (sky2): sky2 eth0: rx error, status 0x402300 length 60

2007-09-28 Thread Krzysztof Oledzki

Hello,

After upgrading my kernel from 2.6.21.7 to 2.6.22.9 my 88E8053 no longer 
works:


sky2 :02:00.0: v1.14 addr 0xcfffc000 irq 17 Yukon-EC (0xb6) rev 1
sky2 eth0: addr 00:11:d8:50:f6:28
sky2 eth0: enabling interface
sky2 eth0: ram buffer 48K
sky2 eth0: Link is up at 100 Mbps, full duplex, flow control both
sky2 eth0: rx error, status 0x402300 length 60
sky2 eth0: rx error, status 0x402500 length 60
sky2 eth0: rx error, status 0x402300 length 60
sky2 eth0: rx error, status 0x402500 length 60
sky2 eth0: rx error, status 0x402300 length 60
sky2 eth0: rx error, status 0x402500 length 60
sky2 eth0: rx error, status 0x402300 length 60
sky2 eth0: rx error, status 0x402300 length 60
sky2 eth0: rx error, status 0x402500 length 60
sky2 eth0: rx error, status 0x402300 length 60
sky2 eth0: rx error, status 0x402500 length 60
sky2 eth0: rx error, status 0x402500 length 60
sky2 eth0: rx error, status 0x402500 length 60
sky2 eth0: rx error, status 0x402500 length 60
(...)

I also compared lspci output from both 2.6.21/2.6.22 and it is the same:

02:00.0 Ethernet controller [0200]: Marvell Technology Group Ltd. 88E8053 PCI-E 
Gigabit Ethernet Controller [11ab:4362] (rev 15)
Subsystem: ASUSTeK Computer Inc. Marvell 88E8053 Gigabit Ethernet 
controller PCIe (Asus) [1043:8142]
Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- 
Stepping- SERR- FastB2B-
Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- 
MAbort- SERR- PERR-
Latency: 0, Cache Line Size: 16 bytes
Interrupt: pin A routed to IRQ 221
Region 0: Memory at cfffc000 (64-bit, non-prefetchable) [size=16K]
Region 2: I/O ports at d800 [size=256]
Expansion ROM at cffc [disabled] [size=128K]
Capabilities: [48] Power Management version 2
Flags: PMEClk- DSI- D1+ D2+ AuxCurrent=0mA 
PME(D0+,D1+,D2+,D3hot+,D3cold+)
Status: D0 PME-Enable- DSel=0 DScale=1 PME-
Capabilities: [50] Vital Product Data
Capabilities: [5c] Message Signalled Interrupts: Mask- 64bit+ Queue=0/1 
Enable+
Address: fee0300c  Data: 41c9
Capabilities: [e0] Express Legacy Endpoint IRQ 0
Device: Supported: MaxPayload 128 bytes, PhantFunc 0, ExtTag-
Device: Latency L0s unlimited, L1 unlimited
Device: AtnBtn- AtnInd- PwrInd-
Device: Errors: Correctable- Non-Fatal- Fatal- Unsupported-
Device: RlxdOrd- ExtTag- PhantFunc- AuxPwr+ NoSnoop-
Device: MaxPayload 128 bytes, MaxReadReq 512 bytes
Link: Supported Speed 2.5Gb/s, Width x1, ASPM L0s, Port 0
Link: Latency L0s 256ns, L1 unlimited
Link: ASPM Disabled RCB 128 bytes CommClk- ExtSynch-
Link: Speed 2.5Gb/s, Width x1
00: ab 11 62 43 07 04 10 00 15 00 00 02 04 00 00 00
10: 04 c0 ff cf 00 00 00 00 01 d8 00 00 00 00 00 00
20: 00 00 00 00 00 00 00 00 00 00 00 00 43 10 42 81
30: 00 00 fc cf 48 00 00 00 00 00 00 00 0a 01 00 00
40: 00 00 f0 01 00 80 a0 01 01 50 02 fe 00 20 00 14
50: 03 5c 00 80 00 00 00 01 00 00 00 01 05 e0 83 00
60: 0c 30 e0 fe 00 00 00 00 c9 41 00 00 00 00 00 00
70: 00 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00
80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
90: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
a0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
b0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
c0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
d0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
e0: 10 00 11 00 c0 0f 00 00 00 24 1b 00 11 a4 03 00
f0: 08 00 11 10 00 00 00 00 00 00 00 00 00 00 00 00


It is quite strange as on the other similar system (only rev 1/2 
difference), sky2 driver from this 2.6.22 kernel solved my problem 
(network hangs):


sky2 :03:00.0: v1.14 addr 0xf100 irq 16 Yukon-EC (0xb6) rev 2
sky2 eth0: addr 00:16:e6:5f:64:24
sky2 eth0: enabling interface
sky2 eth0: ram buffer 48K
sky2 eth0: Link is up at 1000 Mbps, full duplex, flow control both

Best regards,

Krzysztof Olędzki

[RFC/PATCH 2/3] UDP memory usage accounting: accounting unit and variable

2007-09-28 Thread Satoshi OSHIMA
This patch introduces global variable for UDP memory accounting.
The unit is page.


signed-off-by: Satoshi Oshima [EMAIL PROTECTED]
signed-off-by: Hideo Aoki [EMAIL PROTECTED]

Index: 2.6.23-rc3-udp_limit/include/net/sock.h
===
--- 2.6.23-rc3-udp_limit.orig/include/net/sock.h
+++ 2.6.23-rc3-udp_limit/include/net/sock.h
@@ -723,6 +723,13 @@ static inline int sk_stream_wmem_schedul
   sk_stream_mem_schedule(sk, size, 0);
 }
 
+#define SK_DATAGRAM_MEM_QUANTUM ((int)PAGE_SIZE)
+
+static inline int sk_datagram_pages(int amt)
+{
+   return DIV_ROUND_UP(amt, SK_DATAGRAM_MEM_QUANTUM);
+}
+
 /* Used by processes to lock a socket state, so that
  * interrupts and bottom half handlers won't change it
  * from under us. It essentially blocks any incoming
Index: 2.6.23-rc3-udp_limit/include/net/udp.h
===
--- 2.6.23-rc3-udp_limit.orig/include/net/udp.h
+++ 2.6.23-rc3-udp_limit/include/net/udp.h
@@ -65,6 +65,8 @@ extern rwlock_t udp_hash_lock;
 
 extern struct proto udp_prot;
 
+extern atomic_t udp_memory_allocated;
+
 struct sk_buff;
 
 /*
Index: 2.6.23-rc3-udp_limit/net/ipv4/proc.c
===
--- 2.6.23-rc3-udp_limit.orig/net/ipv4/proc.c
+++ 2.6.23-rc3-udp_limit/net/ipv4/proc.c
@@ -66,7 +66,8 @@ static int sockstat_seq_show(struct seq_
   fold_prot_inuse(tcp_prot), atomic_read(tcp_orphan_count),
   tcp_death_row.tw_count, atomic_read(tcp_sockets_allocated),
   atomic_read(tcp_memory_allocated));
-   seq_printf(seq, UDP: inuse %d\n, fold_prot_inuse(udp_prot));
+   seq_printf(seq, UDP: inuse %d mem %d\n, fold_prot_inuse(udp_prot),
+  atomic_read(udp_memory_allocated));
seq_printf(seq, UDPLITE: inuse %d\n, fold_prot_inuse(udplite_prot));
seq_printf(seq, RAW: inuse %d\n, fold_prot_inuse(raw_prot));
seq_printf(seq,  FRAG: inuse %d memory %d\n, ip_frag_nqueues,
Index: 2.6.23-rc3-udp_limit/net/ipv4/udp.c
===
--- 2.6.23-rc3-udp_limit.orig/net/ipv4/udp.c
+++ 2.6.23-rc3-udp_limit/net/ipv4/udp.c
@@ -113,6 +113,10 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_sta
 struct hlist_head udp_hash[UDP_HTABLE_SIZE];
 DEFINE_RWLOCK(udp_hash_lock);
 
+atomic_t udp_memory_allocated;
+
+EXPORT_SYMBOL(udp_memory_allocated);
+
 static int udp_port_rover;
 
 static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head 
udptable[])

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC/PATCH 0/3] UDP memory usage accounting

2007-09-28 Thread Satoshi OSHIMA
Hi,

Thank you for your comment.


Evgeniy Polyakov wrote:
 Hi.

 On Fri, Sep 21, 2007 at 09:18:07PM +0900, Satoshi OSHIMA
([EMAIL PROTECTED]) wrote:
 This patch set tries to introduce memory usage accounting for
 UDP(currently ipv4 only).

 Currently, memory usage of UDP can be observed as the sum of
 usage of tx_queue and rx_queue. But I believe that the system
 wide accounting is useful under heavily loaded conditions.

 In the next step, I would like to add memory usage quota
 for UDP to avoid unlimited memory consumption problem
 under DDOS attack.

 Could you please describe such an attack in more detail?
 Each UDP socket has its queue length which can not be exceeded
 (roughly), no new sockets are created when remote side sends a packet
 (like after special steps in TCP), so where is possibility to eat all
 the mem?

For example, sk_buff is put on the slab and
slab can be acquired only from ZONE_NORMAL in i386.

In such case, from 300 to 500MB memory consumption will
be fatal. Users can easily open 1000 sockets per process
under default ulimit. If such sockets hold messages but
user processes don't receive them, almost all of the slab will
be occupied by sk_buff.


 This patch set is for 2.6.23-rc7.

 I seriously doubt you want to put udp specific hacks and zillions of
 atomic ops all around the code just to know exact number of bytes eaten
 for UDP.
 Please use udp specific code (like udp_sendmsg()) for proper accounting
 if you need that, but not hacks in generic ip code.

I couldn't find the way to account UDP memory consumption
in UDP layer.

In receive path, accounting can be done in UDP layer
because sk_buff is marked for UDP in UDP layer and it is
released in UDP layer.

In send path, sk_buff is acquired in IP layer and also
released in IP layer. Especially, there is a possibility
 of appending data to the preceding sk_buff in send
queue.

On the other hand, I agree that UDP specific code
in IP layer is not preferable. So I generalize UDP
specific code in IP layer in take 2.

Could you take a look at my take 2 patch set?

Satoshi Oshima

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC/PATCH 1/3] UDP memory usage accounting (take 2): fix send buffer check

2007-09-28 Thread Satoshi OSHIMA
This patch introduces sndbuf size check before

memory allocation for send buffer.

signed-off-by: Satoshi Oshima [EMAIL PROTECTED]

signed-off-by: Hideo Aoki [EMAIL PROTECTED]

Index: 2.6.23-rc7-udp_limit/net/ipv4/ip_output.c

===

--- 2.6.23-rc7-udp_limit.orig/net/ipv4/ip_output.c

+++ 2.6.23-rc7-udp_limit/net/ipv4/ip_output.c

@@ -1004,6 +1004,11 @@ alloc_new_skb:

 frag = skb_shinfo(skb)-frags[i];

 }

 } else if (i  MAX_SKB_FRAGS) {

+if (atomic_read(sk-sk_wmem_alloc) + PAGE_SIZE

+ 2 * sk-sk_sndbuf) {

+err = -ENOBUFS;

+goto error;

+}

 if (copy  PAGE_SIZE)

 copy = PAGE_SIZE;

 page = alloc_pages(sk-sk_allocation, 0);

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] mv643xx_eth: Check ETH_INT_CAUSE_STATE bit

2007-09-28 Thread Dale Farnsworth
Commit 468d09f8946d40228c56de26fe4874b2f98067ed masked the state
interrupt (bit 20 of the cause register). This results in Radstone's
PPC7D repeatedly re-entering the interrupt routine, locking up the
board. The following patch returns the required handling for this
interrupt. 

Signed-off-by: Martyn Welch [EMAIL PROTECTED]
Signed-off-by: Dale Farnsworth [EMAIL PROTECTED]

---
Jeff, this is a bug fix.

 drivers/net/mv643xx_eth.c |2 +-
 drivers/net/mv643xx_eth.h |4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
index 1799eee..1785d15 100644
--- a/drivers/net/mv643xx_eth.c
+++ b/drivers/net/mv643xx_eth.c
@@ -534,7 +534,7 @@ static irqreturn_t mv643xx_eth_int_handler(int irq, void 
*dev_id)
}
 
/* PHY status changed */
-   if (eth_int_cause_ext  ETH_INT_CAUSE_PHY) {
+   if (eth_int_cause_ext  (ETH_INT_CAUSE_PHY | ETH_INT_CAUSE_STATE)) {
struct ethtool_cmd cmd;
 
if (mii_link_ok(mp-mii)) {
diff --git a/drivers/net/mv643xx_eth.h b/drivers/net/mv643xx_eth.h
index 82f8c0c..565b966 100644
--- a/drivers/net/mv643xx_eth.h
+++ b/drivers/net/mv643xx_eth.h
@@ -64,7 +64,9 @@
 #define ETH_INT_CAUSE_TX_ERROR (ETH_TX_QUEUES_ENABLED  8)
 #define ETH_INT_CAUSE_TX   (ETH_INT_CAUSE_TX_DONE | ETH_INT_CAUSE_TX_ERROR)
 #define ETH_INT_CAUSE_PHY  0x0001
-#define ETH_INT_UNMASK_ALL_EXT (ETH_INT_CAUSE_TX | ETH_INT_CAUSE_PHY)
+#define ETH_INT_CAUSE_STATE0x0010
+#define ETH_INT_UNMASK_ALL_EXT (ETH_INT_CAUSE_TX | ETH_INT_CAUSE_PHY | \
+   ETH_INT_CAUSE_STATE)
 
 #define ETH_INT_MASK_ALL   0x
 #define ETH_INT_MASK_ALL_EXT   0x

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.23-rc8-mm2 - tcp_fastretrans_alert() WARNING

2007-09-28 Thread Cedric Le Goater
Hello ! 

Andrew Morton wrote:
 ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.23-rc8/2.6.23-rc8-mm2/

I just found that warning in my logs. It seems that it's been 
happening since rc7-mm1 at least. 

Thanks !

C.

WARNING: at /home/legoater/linux/2.6.23-rc8-mm2/net/ipv4/tcp_input.c:2314 
tcp_fastretrans_alert()

Call Trace:
 IRQ  [8040fdc3] tcp_ack+0xcd6/0x1894
 [80411c79] tcp_data_queue+0x5be/0xae7
 [80412b54] tcp_rcv_established+0x61f/0x6df
 [80254146] __lock_acquire+0x8a1/0xf1b
 [80419cfd] tcp_v4_do_rcv+0x3e/0x394
 [8041a66f] tcp_v4_rcv+0x61c/0x9a9
 [803ff1e3] ip_local_deliver+0x1da/0x2a4
 [803ffb4e] ip_rcv+0x583/0x5c9
 [8046d33f] packet_rcv_spkt+0x19a/0x1a8
 [803e081c] netif_receive_skb+0x2cf/0x2f5
 [88042505] :tg3:tg3_poll+0x65d/0x8a4
 [803e09e8] net_rx_action+0xb8/0x191
 [8023a927] __do_softirq+0x5f/0xe0
 [8020c98c] call_softirq+0x1c/0x28
 [8020e9c3] do_softirq+0x3b/0xb8
 [8023aa1e] irq_exit+0x4e/0x50
 [8020e7df] do_IRQ+0xbd/0xd7
 [80209cb9] mwait_idle+0x0/0x4d
 [8020bce6] ret_from_intr+0x0/0xf
 EOI  [80209cfc] mwait_idle+0x43/0x4d
 [802099fb] enter_idle+0x22/0x24
 [80209c4f] cpu_idle+0x9d/0xc0
 [80476a91] rest_init+0x55/0x57
 [80630815] start_kernel+0x2d6/0x2e2
 [80630134] _sinittext+0x134/0x13b
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Fwd: [PATCH#2 3/4] [PPC] Compile fix for 8xx CPM Ethernet driver

2007-09-28 Thread Jeff Garzik

Kumar Gala wrote:

Begin forwarded message:


From: Jochen Friedrich [EMAIL PROTECTED]
Date: September 24, 2007 12:15:35 PM CDT
To: [EMAIL PROTECTED]
Cc: [EMAIL PROTECTED], Marcelo Tosatti [EMAIL PROTECTED]
Subject: [PATCH#2 3/4] [PPC] Compile fix for 8xx CPM Ethernet driver


Jeff,

Please pick up for 2.6.23 if you don't mind.


Please send an apply-able patch...

Jeff, off to bed



-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.23-rc8: cannot make netconsole work

2007-09-28 Thread Matt Mackall
On Fri, Sep 28, 2007 at 01:27:55PM +0400, Andrey Borzenkov wrote:
 I finally decided to try netconsole in attempt to get some more information 
 why my system does not resume (but that is different story). But I cannot 
 make it work - it does load but I see no traffic flowing ever. This is 
 notebook with e100 driver and it works just fine for normal traffic. 
 netconsole is currently compiled as module, loading it I get:
 
 sudo modprobe netconsole netconsole=@/eth0,@/ 
 
 [ 2395.838094] netconsole: local port 6665
 [ 2395.838126] netconsole: interface eth0
 [ 2395.838132] netconsole: remote port 
 [ 2395.838140] netconsole: remote IP 0.0.0.0
 [ 2395.838150] netconsole: remote ethernet address ff:ff:ff:ff:ff:ff
 [ 2395.838175] netconsole: local IP 192.168.1.8
 [ 2395.843798] console [netcon0] enabled
 [ 2395.844722] netconsole: network logging started
 
 but I cannot see any packet flowing via eth0 nor can I catch anything on 
 remote side using netcat. This does not work when I add explicitly source and 
 target addresses either:
 
 pts/0}% sudo modprobe netconsole [EMAIL PROTECTED]/eth0,@192.168.1.5/
 {pts/0}% dmesg | tail
 [ 2427.128947] device eth0 left promiscuous mode
 [ 2427.129005] audit(1190970818.972:5): dev=eth0 prom=0 old_prom=256 
 auid=4294967295
 [ 2541.831721] netconsole: local port 6665
 [ 2541.831756] netconsole: local IP 192.168.1.8
 [ 2541.831763] netconsole: interface eth0
 [ 2541.831769] netconsole: remote port 
 [ 2541.831776] netconsole: remote IP 192.168.1.5
 [ 2541.831787] netconsole: remote ethernet address ff:ff:ff:ff:ff:ff
 [ 2541.838832] console [netcon0] enabled
 [ 2541.839964] netconsole: network logging started

What is your console log level set to? If the messages don't come out
on the local console, they won't get sent out the network either.
Fedora at least defaults to hiding most messages. Adding 'debug' to
your kernel command line will change that.

-- 
Mathematics is the supreme nostalgia of our time.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Upgrading 2.6.21.7-2.6.22.9 kills my network (sky2): sky2 eth0: rx error, status 0x402300 length 60

2007-09-28 Thread Krzysztof Oledzki



On Fri, 28 Sep 2007, Krzysztof Oledzki wrote:




On Fri, 28 Sep 2007, Krzysztof Oledzki wrote:


Hello,

After upgrading my kernel from 2.6.21.7 to 2.6.22.9 my 88E8053 no longer 
works:


Small update: 2.6.22.9 with sky2.c/sky2.h from 2.6.22.4 works without any 
problems.


Final update.

Reverting this patch: 
http://git.kernel.org/?p=linux/kernel/git/stable/linux-2.6.22.y.git;a=commitdiff_plain;h=8c07a8e30ba8a2e0831da4b134202598435f8358

solved my problem.

I also found this one:

http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=d6532232cd3de79c852685823a9c52f723816d0a

Could it go to a next -stable ASAP, please? It seems that 
2.6.22.5-2.6.22.9 kernels have broken sky2 if used with vlans. :( Such 
regression in a -stable kernel isn't nice. :(


Best regards,

Krzysztof Olędzki

Re: [RFC/PATCH 2/3] UDP memory usage accounting: accounting unit and variable

2007-09-28 Thread Satoshi OSHIMA
Hi,

Thank you for your comment.

Andi Kleen wrote:
 Satoshi OSHIMA [EMAIL PROTECTED] writes:

 This patch introduces global variable for UDP memory accounting.
 The unit is page.

 The global variable doesn't seem to be very MP scalable, especially
 if you change it for each packet. This will be a very hot cache line,
 in the worst case bouncing around a large machine.

I understand what you pointed out. But I think the accounting
method I'm proposing is very similar to TCP accounting and per
socket accounting.
What do you think of it?


 Possible alternatives:
 - Per CPU variables

I'm afraid that sockets and socket buffers are handled on
various CPUs. I mean that socket creation might be done
on CPU-A but socket receiving might be done on CPU-B.

And per CPU variables must be counted up when socket
cap is checked. I'm afraid that per CPU variables are
also costly enough.


 - You only change the global on socket creation time (by pre
allocating a large
 amount) or when the system comes under memory pressure.
 - Batching of the global updates for multiple packets [that's a variant
 of the previous one, might be still too costly though]

 Also for such variables it's usually good to cache line pad them on SMP
 to avoid false sharing with something else.

I believe that memory usage accounting should be done accurately.
Currently I can't see how we could get accurate memory
accounting only when the system is under memory pressure.

But I revised the patch to avoid some atomic operations.

If I could find the good way to avoid atomic operation more,
I will add it.

Satoshi Oshima
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PKT_SCHED]: Add stateless NAT

2007-09-28 Thread Patrick McHardy
David Miller wrote:
 From: jamal [EMAIL PROTECTED]
 Date: Thu, 27 Sep 2007 08:39:45 -0400
 
+config NET_ACT_NAT
+tristate Stateless NAT
+depends on NET_CLS_ACT
+select NETFILTER

I am gonna have to agree with Evgeniy on this Herbert;-
The rewards are it will improve performance for people who dont need
netfilter.
 
 
 I agree that we should move the functions out.   However...
 
 You have to realize that this logic is a complete crock
 of shit for %99. of Linux users out there
 who get and only use distribution compiled kernels which are
 going to enable everything anyways.
 
 So we better make sure there are zero performance implications at some
 point just for compiling netfilter into the tree.  I know that isn't
 the case currently, but that means that we aren't helping out the
 majority of Linux users and are thus only adding these optimizations
 for such a small sliver of users and that is totally pointless and
 sucks.


The hooks themselves actually shouldn't be that much of a problem
since without any registered users they come down to a simple
list_empty check. The bigger problem is probably the fact that
the okfn is usually declared inline, but we also take its address
for nf_hook_slow, so at least with forced inlining, we end up
with one inlined and one out-of-line version.

Looking at ip_input.o as example (everything without forced inlining):

   textdata bss dec hex filename
   2076   8   02084 824 net/ipv4/ip_input.o
   3483   8   03491 da3 net/ipv4/ip_input.o


So we have roughly 1.75 more code for the two hooks. For reference,
without using the function pointer it increases a lot less:

   textdata bss dec hex filename
   2319   8   02327 917 net/ipv4/ip_input.o

similar with not inlining ip_rcv_finish() and ip_local_deliver_finish():

   textdata bss dec hex filename
   2282   8   02290 8f2 net/ipv4/ip_input.o

Any ideas how to do this better? Unfortunately we need the function
pointers for reinjecting queued packets ..
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] sky2: fix transmit state on resume

2007-09-28 Thread Stephen Hemminger
After resume, driver has reset the chip so the current state
of transmit checksum offload state machine and DMA state machine
will be undefined.

The fix is to set the state so that first Tx will set MSS and offset
values.

Signed-off-by: Stephen Hemminger [EMAIL PROTECTED]

--- a/drivers/net/sky2.c2007-09-28 09:21:26.0 -0700
+++ b/drivers/net/sky2.c2007-09-28 09:21:37.0 -0700
@@ -831,6 +831,20 @@ static inline struct sky2_tx_le *get_tx_
return le;
 }
 
+static void tx_init(struct sky2_port *sky2)
+{
+   struct sky2_tx_le *le;
+
+   sky2-tx_prod = sky2-tx_cons = 0;
+   sky2-tx_tcpsum = 0;
+   sky2-tx_last_mss = 0;
+
+   le = get_tx_le(sky2);
+   le-addr = 0;
+   le-opcode = OP_ADDR64 | HW_OWNER;
+   sky2-tx_addr64 = 0;
+}
+
 static inline struct tx_ring_info *tx_le_re(struct sky2_port *sky2,
struct sky2_tx_le *le)
 {
@@ -1244,7 +1258,8 @@ static int sky2_up(struct net_device *de
GFP_KERNEL);
if (!sky2-tx_ring)
goto err_out;
-   sky2-tx_prod = sky2-tx_cons = 0;
+
+   tx_init(sky2);
 
sky2-rx_le = pci_alloc_consistent(hw-pdev, RX_LE_BYTES,
   sky2-rx_le_map);

-- 
Stephen Hemminger [EMAIL PROTECTED]

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: e1000 tcp checksum incorrect, x86 64b

2007-09-28 Thread Rick Jones

Herbert Xu wrote:

Jon Smirl [EMAIL PROTECTED] wrote:


App is writing seven bytes to the socket. Socket write timeout expires
and the seven bytes are sent. The checksum is not getting inserted
into the packet. It is set to a constant 0x8389 instead of the right
value.  App is gmpc 0.15.4.95, Revision: 6794

Attached Wireshark packet trace show the problem. e1000 is 192.168.1.4
64bit, Q6600. Dell Dimension 9200



Wireshark is broken.  It needs to know TP_STATUS_CSUMNOTREADY
means that the checksum is partial and will only be completed
when the hardware sends the packet out.

Alternatively disable checksum offload with ethtool.


Or take the packet traces outboard of the NIC somewhere/somehow.

What problem(s) led to your taking the packet trace in the first place?

rick jones
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PKT_SCHED]: Add stateless NAT

2007-09-28 Thread Patrick McHardy
David Miller wrote:
 I still think the nf_*() prefixes should all go and the extern
 prototypes should go into an independant header file.
 
 These are not netfilter routines, they are INET helpers.


Agreed. Evgeniy, can you send a new patch for this?

 And we should make similar treatment for all of the ipv6
 packet parser helper functions that ipv6 netfilter needs.


I'll look into that.

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[IPV6] Fix ICMPv6 redirect handling with target multicast address

2007-09-28 Thread Brian Haley
When the ICMPv6 Target address is multicast, Linux processes the 
redirect instead of dropping it.  The problem is in this code in 
ndisc_redirect_rcv():


if (ipv6_addr_equal(dest, target)) {
on_link = 1;
} else if (!(ipv6_addr_type(target)  IPV6_ADDR_LINKLOCAL)) {
ND_PRINTK2(KERN_WARNING
   ICMPv6 Redirect: target address is not 
link-local.\n);

return;
}

This second check will succeed if the Target address is, for example, 
FF02::1 because it has link-local scope.  Instead, it should be checking 
if it's a unicast link-local address, as stated in RFC 2461/4861 Section 
8.1:


  - The ICMP Target Address is either a link-local address (when
redirected to a router) or the same as the ICMP Destination
Address (when redirected to the on-link destination).

I know this doesn't explicitly say unicast link-local address, but it's 
implied.


This bug is preventing Linux kernels from achieving IPv6 Logo Phase II 
certification because of a recent error that was found in the TAHI test 
suite - Neighbor Discovery suite test 206 (v6LC.2.3.6_G) had the 
multicast address in the Destination field instead of Target field, so 
we were passing the test.  This won't be the case anymore.


The patch below fixes this problem, and also fixes ndisc_send_redirect() 
to not send an invalid redirect with a multicast address in the Target 
field.  I re-ran the TAHI Neighbor Discovery section to make sure Linux 
passes all 245 tests now.


-Brian


Signed-off-by: Brian Haley [EMAIL PROTECTED]

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 31b3f1b..4f47d29 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -368,6 +368,11 @@ static inline int ipv6_prefix_equal(const struct in6_addr *a1,
    prefixlen);
 }
 
+static inline int ipv6_addr_linklocal(const struct in6_addr *a)
+{
+	return ((a-s6_addr32[0]  htonl(0xFFC0)) == htonl(0xFE80));
+}
+
 static inline int ipv6_addr_any(const struct in6_addr *a)
 {
 	return ((a-s6_addr32[0] | a-s6_addr32[1] | 
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 74c4d8d..8f953a7 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1267,7 +1267,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb)
 
 	if (ipv6_addr_equal(dest, target)) {
 		on_link = 1;
-	} else if (!(ipv6_addr_type(target)  IPV6_ADDR_LINKLOCAL)) {
+	} else if (!ipv6_addr_linklocal(target)) {
 		ND_PRINTK2(KERN_WARNING
 			   ICMPv6 Redirect: target address is not link-local.\n);
 		return;
@@ -1343,7 +1343,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
 	}
 
 	if (!ipv6_addr_equal(ipv6_hdr(skb)-daddr, target) 
-	!(ipv6_addr_type(target)  IPV6_ADDR_LINKLOCAL)) {
+	!ipv6_addr_linklocal(target)) {
 		ND_PRINTK2(KERN_WARNING
 			ICMPv6 Redirect: target address is not link-local.\n);
 		return;


Re: [PATCH] net: Add network namespace clone unshare support.

2007-09-28 Thread Cedric Le Goater
Andrew Morton wrote:
 On Fri, 28 Sep 2007 11:12:13 +0200 Cedric Le Goater [EMAIL PROTECTED] wrote:
 
 Cedric made a good point that we will have conflicts of code
 being added to the same place in nsproxy.c and the like.  So
 I copied Andrew to give him a heads up.
 here's a suggestion,

 we could keep the net namespace unshare patch out of david's tree,
 let andrew merge and release a new -mm and, then, send the net namespace 
 unshare patch to andrew. that should keep nsproxy out of the andrew's 
 merge challenge. But david's tree will miss the unshare part for a while.
 
 This patch only generates two rejects against the current -mm poop pile.
 That's insignificant.  We don't need to do anything special to merge a
 little patch like this one.

Thanks Andrew.

C.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: e1000 tcp checksum incorrect, x86 64b

2007-09-28 Thread Jon Smirl
On 9/28/07, Rick Jones [EMAIL PROTECTED] wrote:
 Herbert Xu wrote:
  Jon Smirl [EMAIL PROTECTED] wrote:
 
 App is writing seven bytes to the socket. Socket write timeout expires
 and the seven bytes are sent. The checksum is not getting inserted
 into the packet. It is set to a constant 0x8389 instead of the right
 value.  App is gmpc 0.15.4.95, Revision: 6794
 
 Attached Wireshark packet trace show the problem. e1000 is 192.168.1.4
 64bit, Q6600. Dell Dimension 9200
 
 
  Wireshark is broken.  It needs to know TP_STATUS_CSUMNOTREADY
  means that the checksum is partial and will only be completed
  when the hardware sends the packet out.
 
  Alternatively disable checksum offload with ethtool.

 Or take the packet traces outboard of the NIC somewhere/somehow.

 What problem(s) led to your taking the packet trace in the first place?

I was working on the Ethernet driver for another embedded system
attached to the box. I noticed these errors going to my NSLU2 which I
thought was working ok. But now I know they are not real errors.

Ethernet driver for MPC5200 in embedded system still has some
problems. It is being discussed on linuxppc-embedded.

-- 
Jon Smirl
[EMAIL PROTECTED]
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] sb1250-mac: Driver model phylib update

2007-09-28 Thread Maciej W. Rozycki
On Mon, 24 Sep 2007, Andrew Morton wrote:

   Well, this is against Jeff's netdev-2.6 tree which hopefully is not as 
  crufty as Linus's old mainline; if it is not possible to queue this change 
  for 2.6.25 or suchlike, then I will try to resubmit later.
 
 Most of Jeff's netdev tree got dumped into Dave's net-2.6.24 tree.  That's
 the one you want to be raising patches against for the next few weeks.

 OK, thanks for clarification.  Then both patches already submitted:

patch-netdev-2.6.23-rc6-20070920-sb1250-mac-typedef-9
patch-netdev-2.6.23-rc6-20070920-sb1250-mac-29

apply cleanly to net-2.6.24 one on top of the other in this order.

 I can resubmit them -- where?  netdev?  As I say I am fine with 2.6.25 as 
the target.

  Maciej
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Upgradeing 2.6.21.7-2.6.22.9 kill my network (sky2): sky2 eth0: rx error, status 0x402300 length 60

2007-09-28 Thread Krzysztof Oledzki



On Fri, 28 Sep 2007, Krzysztof Oledzki wrote:


Hello,

After upgrading my kernel from 2.6.21.7 to 2.6.22.9 my 88E8053 no longer 
works:


Small update: 2.6.22.9 with sky2.c/sky2.h from 2.4.22.4 works without any 
problems.


Best regards,


Krzysztof Olędzki

Re: [PATCH] net: Add network namespace clone unshare support.

2007-09-28 Thread Andrew Morton
On Fri, 28 Sep 2007 11:12:13 +0200 Cedric Le Goater [EMAIL PROTECTED] wrote:

  Cedric made a good point that we will have conflicts of code
  being added to the same place in nsproxy.c and the like.  So
  I copied Andrew to give him a heads up.
 
 here's a suggestion,
 
 we could keep the net namespace unshare patch out of david's tree,
 let andrew merge and release a new -mm and, then, send the net namespace 
 unshare patch to andrew. that should keep nsproxy out of the andrew's 
 merge challenge. But david's tree will miss the unshare part for a while.

This patch only generates two rejects against the current -mm poop pile.
That's insignificant.  We don't need to do anything special to merge a
little patch like this one.

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/7] CAN: Add virtual CAN netdevice driver

2007-09-28 Thread Oliver Hartkopp
Eric W. Biederman wrote:

 Currently IFF_LOOPBACK set in dev->flags means we are dealing
 with drivers/net/loopback.c. 
   

This is a very general view, don't you think? The one is an interface
flag and the other one is an interface itself. This looks like a risky
mixture, when there is no clean separation.

 In other networking layers loopback functionality (i.e. for broadcast)
 is never expected to be provided by the drivers and is instead 
 always provided by the networking layer.  Keeping the drivers
 simpler.  Further you already have this functionality in the
 generic CAN layer for doing loopback without driver support.
   

ACK  Yes.

 So at a first glance the CAN usage of IFF_LOOPBACK looks completely
 broken, and likely to confuse other networking layers if they see
 a CAN device.  Say if someone attempts to run IP over CAN or
 something like that.
   

The CAN protocol family is some kind of a closed ecosystem with a
complete different addressing scheme that uses the bare networking
functionality of the Linux Kernel as well as DECNET or ARCNET. You would
never be able to run the IP-stack on a CAN netdev (ARPHRD_CAN,
ETH_P_CAN) due to several technical differences in addressing, etc.

 Do you think you can remove this incompatible usage of IFF_LOOPBACK
 from the can code?
   

Yes. We might pick another name for it (see below).

 If I have read your documentation properly the only reason you are
 doing this is so that the timing of frames to cansniffer more
 accurately reflects when the frame hits the wire.  If CAN runs over a
 very slow medium I guess I can see where that can be a concern.

It's not really a (more accurate) timing problem but a media access
issue that occurs on the CAN bus itself:

The CAN bus is, in contrast to Ethernet, a CSMA/CA (collision
*avoidance*) medium with an arbitration that's controlled with the so
called CAN-Identifier. The lower the CAN-Id value the higher is its
priority. E.g. a CAN frame with an CAN-Id of 0x100 can squeeze out a CAN
frame with a CAN-Id of 0x400 on the bus, so that the CAN frame with the
CAN-Id of 0x400 is to be sent again (_automatically_ by the CAN
controller chip).

An example:
You want to send CAN-Id 0x700 on the bus, and put all the data into your
local CAN controller chip and start the transmission. When your local
controller wants to send its frame it loses its arbitration due to a
reception of a CAN-Id 0x100 and generates a RX-interrupt for this
received frame. After that your local controller tries to resend his
frame but it loses its arbitration again due to a reception of a
CAN-Id 0x400 and generates a RX-interrupt for this received frame.
Finally your local controller tries to resend his frame, has success,
and generates a TX-interrupt for his successfully sent frame.

When you use the network layer loopback functionality the other
applications on your local host would see CAN-Id 0x700, CAN-Id 0x100,
CAN-Id 0x400. When you use the loopback on driver level you would see
the _correct_ message order CAN-Id 0x100, CAN-Id 0x400, CAN-Id 0x700.

So it's not an issue of having better timings but to reproduce the
*correct message order* from the CAN bus. One year ago this problem has
originally been pointed out by Michael Schulze from the University of
Magdeburg as having a correct message order created by the CSMA/CA
treatment is a vital requirement for CAN bus users. As you might see now
the CAN netdriver has to offer additional functionalities due to the
CSMA/CA treatment in opposite to the 'standard' CSMA/CD behaviour you
know from Ethernet netdrivers. And this arbitration information of the
CAN controller is only available on driver level. It is therefore no
question IF the CAN netdriver supports the CSMA/CA treatment but HOW to
provide an interface for this functionality on a basis of a standard
netdriver (which simply only sends and receives frames).

As the CAN netdrivers (as described above) are only available and used
by the PF_CAN core, the use of IFF_LOOPBACK looked like a reasonable
solution to distinguish whether the CAN netdriver is capable to do the
loopback (e.g. due to the ability of the controller to generate
TX-interrupts) or not. The usage of IFF_LOOPBACK in CAN netdrivers
didn't affect or confuse the rest of the Linux networking system up to
now. Btw. if you state that IFF_LOOPBACK means for a netdriver, that
"all packets from a device will come right back to the current machine,
and go nowhere else.", we should think about a new IFF_-flag here.

I don't have any concerns creating a new IFF_-flag for this loopback
approved by CSMA/CA media access; I just have no idea for a really good
name for it. But maybe the use of IFF_LOOPBACK for CAN netdrivers
(ARPHRD_CAN) is also ok for you now?!?

Oliver






-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] Zero-length write() does not generate a datagram on connected socket

2007-09-28 Thread Michael Kerrisk
On 9/28/07, Stephen Hemminger [EMAIL PROTECTED] wrote:
 On Thu, 27 Sep 2007 13:53:34 -0700 (PDT)
 David Miller [EMAIL PROTECTED] wrote:

  From: Stephen Hemminger [EMAIL PROTECTED]
  Date: Mon, 24 Sep 2007 15:34:35 -0700
 
   The bug http://bugzilla.kernel.org/show_bug.cgi?id=5731
   describes an issue where write() can't be used to generate a zero-length
   datagram (but send, and sendto do work).
  
   I think the following is needed:
  
   --- a/net/socket.c  2007-08-20 09:54:28.0 -0700
   +++ b/net/socket.c  2007-09-24 15:31:25.0 -0700
   @@ -777,8 +777,11 @@ static ssize_t sock_aio_write(struct kio
   if (pos != 0)
   return -ESPIPE;
  
   -   if (iocb-ki_left == 0) /* Match SYS5 behaviour */
   -   return 0;
   +   if (unlikely(iocb-ki_left == 0)) {
   +   struct socket *sock = iocb-ki_filp-private_data;
   +   if (sock-type == SOCK_STREAM)
   +   return 0;
   +   }
  
   x = alloc_sock_iocb(iocb, siocb);
   if (!x)
 
  We should simply remove the check completely.
 
  There is no need to add special code for different types of protocols
  and sockets.
 
  As is hinted in the bugzilla, the exact same thing can happen with a
  suitably constructed sendto() or sendmsg() call.  write() on a socket
  is a sendmsg() with a NULL msg_control and a single entry iovec, plain
  and simple.
 
  It's how BSD and many other systems behave, and I double checked
  Stevens' Volume 2 just to make sure.
 
  So I'm going to check in the following to fix this bugzilla.  There is
  a similarly ugly test for len==0 in sys_read() on sockets.  If someone
  would do some research on the validity of that thing I'd really
  appreciate it :-)

 Read of zero length should be a no-op for SOCK_STREAM but
 for SOCK_DATAGRAM or SOCK_SEQPACKET it might be useful as a
 remote wait for event.

Hmm -- I hadn't checked the behavior for zero-length read() on other
systems.  I will try to do that soonish (probably only Monday or so).

Cheers,

Michael
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PKT_SCHED]: Add stateless NAT

2007-09-28 Thread Evgeniy Polyakov
On Thu, Sep 27, 2007 at 12:52:39PM -0700, David Miller ([EMAIL PROTECTED]) 
wrote:
 From: Patrick McHardy [EMAIL PROTECTED]
 Date: Thu, 27 Sep 2007 15:39:34 +0200
 
  Evgeniy Polyakov wrote:
   On Thu, Sep 27, 2007 at 09:20:37PM +0800, Herbert Xu ([EMAIL PROTECTED]) 
   wrote:
   
  How about putting it in net/core/utils.c?
   
   
   I knew, that was a bad idea to try to fix netfilter dependency :)
   
   diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
  
  
  This looks good to me.
 
 I still think the nf_*() prefixes should all go and the extern
 prototypes should go into an independent header file.
 
 These are not netfilter routines, they are INET helpers.
 
 And we should make similar treatment for all of the ipv6
 packet parser helper functions that ipv6 netfilter needs.

Should netfilter still have own nf_ prefixed functions which will just
call the same ones without prefix from inet header?

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ax88796: add 93cx6 eeprom support

2007-09-28 Thread Magnus Damm
Hi Andrew,

Thanks for picking up the patch.

On 9/28/07, Andrew Morton [EMAIL PROTECTED] wrote:
 On Thu, 27 Sep 2007 19:51:19 +0900
 Magnus Damm [EMAIL PROTECTED] wrote:

  ax88796: add 93cx6 eeprom support
 
  This patch hooks up the 93cx6 eeprom code to the ax88796 driver and modifies
  the ax88796 driver to read out the mac address from the eeprom. We need
  this for the ax88796 on certain SuperH boards. The pin configuration used
  to connect the eeprom to the ax88796 on these boards is the same as pointed
  out by the ax88796 datasheet, so we can probably reuse this code for 
  multiple
  platforms in the future.

 I'm showing a minor reject between this and Francois's git-r8169.patch.

[snip]
Oh, sorry about that. I should have built the patch on top of -mm instead.

 You both made the same change to eeprom_93cx6.h.  That all sounds good but
 it would be comforting if you could review each other's work, please...

I thought I was more or less the only user, but thanks for pointing that out.

The eeprom code in the r8169 driver from
2.6.23-rc8-mm2/git-r8169.patch looks fine, but I don't understand the
point of adding the size member to the eeprom struct in
eeprom_93cx6.h. Especially since no code change is made in
eeprom_93cx6.c.

Thanks,

/ magnus
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: /proc/net/ bad hard links count [Was: 2.6.23-rc8-mm2]

2007-09-28 Thread Eric W. Biederman
Jiri Slaby [EMAIL PROTECTED] writes:

 On 09/27/2007 11:22 AM, Andrew Morton wrote:

 ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.23-rc8/2.6.23-rc8-mm2/

Yep.

 # find /proc >/dev/null
 find: WARNING: Hard link count is wrong for /proc/net: this may be a bug in 
 your
 filesystem driver.  Automatically turning on find's -noleaf option.  Earlier
 results may have failed to include directories that should have been searched.
 # stat net
   File: `net'
   Size: 0   Blocks: 0  IO Block: 1024   directory
 Device: 3h/3d   Inode: 4026531864  Links: 2
 Access: (0555/dr-xr-xr-x)  Uid: (0/root)   Gid: (0/root)
 Access: 2007-09-28 18:21:24.651209759 +0200
 Modify: 2007-09-28 18:21:24.651209759 +0200
 Change: 2007-09-28 18:21:24.651209759 +0200
 # stat net/
   File: `net/'
   Size: 0   Blocks: 0  IO Block: 1024   directory
 Device: 3h/3d   Inode: 4026531909  Links: 4
 Access: (0555/dr-xr-xr-x)  Uid: (0/root)   Gid: (0/root)
 Access: 2007-09-28 18:26:48.813048220 +0200
 Modify: 2007-09-28 18:26:48.813048220 +0200
 Change: 2007-09-28 18:26:48.813048220 +0200

 hmm, this is some kind of weirdness :)

Yes.

I can explain it. For the network namespace stuff we need special handling
of /proc/net so that depending on the network namespace we are resolving
against you see a different behavior.  So you actually are observing
two different directories, one being a magic invisible symlink to the
other.

Currently I am resolving against current (which has a number of
limitations) and the weird ugly effect you are currently seeing.

So it looks like I need to either make /proc/net a symlink to
/proc/self/net or make the network namespace something that we capture
at mount time of /proc.

This was my "don't get hung up on this implementation detail" version.
Thanks for pointing out it has user visible problems.  I will see
what I can do to resolve this.

Eric
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/7] CAN: Add virtual CAN netdevice driver

2007-09-28 Thread Eric W. Biederman
Oliver Hartkopp [EMAIL PROTECTED] writes:

 Eric W. Biederman wrote:

 Currently IFF_LOOPBACK set in dev->flags means we are dealing
 with drivers/net/loopback.c. 
   

 This is a very general view, don't you think? The one is an interface
 flag and the other one is an interface itself. This looks like a risky
 mixture, when there is no clean separation.

The better definition followed that IFF_LOOPBACK means all packets
will loopback.  Currently we only have one such device in the tree.
And the CAN devices do not have that property.

 So at a first glance the CAN usage of IFF_LOOPBACK looks completely
 broken, and likely to confuse other networking layers if they see
 a CAN device.  Say if someone attempts to run IP over CAN or
 something like that.
   

 The CAN protocol family is some kind of a closed ecosystem with a
 complete different addressing scheme that uses the bare networking
 functionality of the Linux Kernel as well as DECNET or ARCNET. You would
 never been able to run the IP-stack on a CAN netdev (ARPHDR_CAN,
 ETH_P_CAN) due to several technical differences in addressing, etc.

However when register_netdev is called the netdev_notifier chain is
called with NETDEV_REGISTER.

So then we enter code paths such as net/ipv4/inetdev_event() and
process the network device.  There is some small amount of treatment
given to devices that have IFF_LOOPBACK set.

The core point being that CAN devices as currently constructed are not
in a closed ecosystem.  Other networking layers see them even if they
cannot use them properly.

I don't know what all of the implications are but I do know we
need to be careful.


 Do you think you can remove this incompatible usage of IFF_LOOPBACK
 from the can code?
   

 Yes. We might pick another name for it (see below).

Thanks.

 If I have read your documentation properly the only reason you are
 doing this is so that the timing of frames to cansniffer more
 accurately reflects when the frame hits the wire.  If CAN runs over a
 very slow medium I guess I can see where that can be a concern.


 So it's not an issue of having better timings but to reproduce the
 *correct message order* from the CAN bus. One year ago this problem has
 originally been pointed out by Michael Schulze from the University of
 Magdeburg as having a correct message order created by the CSMA/CA
 treatment is a vital requirement for CAN bus users. As you might see now
 the CAN netdriver has to offer additional functionalities due to the
 CSMA/CA treatment in opposite to the 'standard' CSMA/CD behaviour you
 know from Ethernet netdrivers. And this arbitration information of the
 CAN controller is only available on driver level. It is therefore no
 question IF the CAN netdriver supports the CSMA/CA treatment but HOW to
 provide an interface for this functionality on a basis of a standard
 netdriver (which simply only sends and receives frames).

Ok.  So the difference here is that CAN devices provide ordering on
the wire between their transmitted packets and their received packets,
and you need an additional hook so you can properly observe this case.

Hmm. My gut feel says that we just want a function your drivers can
call post transmit that will call dev_queue_xmit_nit if someone else
is watching.  Although calling netif_rx seems to work but at least
in the general case with routing I would be concerned that would get
you into packets that would get routed out the incoming interface
and cause a loop.

 As the CAN netdrivers (as described above) are only available and used
 by the PF_CAN core, the use of IFF_LOOPBACK looked like reasonable
 solution to distinguish whether the CAN netdriver is capable to do the
 loopback (e.g. due to the ability of the controller to generate
 TX-interrupts) or not. The usage of IFF_LOOPBACK in CAN netdrivers
 didn't affect or confuse the rest of the Linux networking system up to
 now. Btw. if you state that IFF_LOOPBACK means for a netdriver, that
 all packets from a device will come right back to the current machine,
 and go nowhere else., we should think about a new IFF_-flag here.

Yes. IFF_LOOPBACK for a netdriver means that all packets from a device
will come right back to the current machine.   I stated that in a
later message, as that seems a clearer way to express what it means.

Further I still don't see any mechanism that isolates CAN netdrivers
from the other protocol layers.

What makes this problem a little stronger is that I have been
simplifying some of the tests in some of the other networking layers
from being "if (dev == loopback_dev)" to "if (dev->flags &
IFF_LOOPBACK)".  So there are fewer special cases I need to deal with.

I believe that if you now use your current CAN patches, unless I have
misread something, your CAN devices will now show up with ip 127.0.0.1.

 I don't have any concerns creating a new IFF_-flag for this loopback
 approved by CSMA/CA media access i just have no idea for a really good
 name for it. But maybe the use of IFF_LOOPBACK for CAN 

MSI interrupts and disable_irq

2007-09-28 Thread Ayaz Abdulla
I am trying to track down a forcedeth driver issue described by bug 9047 
in bugzilla (2.6.23-rc7-git1 forcedeth w/ MCP55 oops under heavy load). 
I added a patch to synchronize the timer handlers so that one handler 
doesn't accidently enable the IRQ while another timer handler is running 
(see attachment 'Add timer lock' in bug report) and for other processing 
protection.


However, the system still had an Oops. So I added a lock around the 
nv_rx_process_optimized() and the Oops has not happened (see attachment 
'New patch for locking' in bug report). This would imply a 
synchronization issue. However, the only callers of that function are 
the IRQ handler and the timer handlers (in non-NAPI case). The timer 
handlers  use disable_irq so that the IRQ handler does not contend with 
them. It looks as if disable_irq is not working properly.


This issue repros only with MSI interrupt and not legacy INTx 
interrupts. Any ideas?


Thanks,
Ayaz
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] sky2: fix VLAN receive processing

2007-09-28 Thread Stephen Hemminger
The length check for truncated frames was not correctly handling
the case where VLAN acceleration had already read the tag.
Also, the Yukon EX has some features that use high bit of status
as security tag.

Signed-off-by: Pierre-Yves Ritschard [EMAIL PROTECTED]
Signed-off-by: Stephen Hemminger [EMAIL PROTECTED]

--- a/drivers/net/sky2.c2007-09-28 09:13:38.0 -0700
+++ b/drivers/net/sky2.c2007-09-28 09:21:26.0 -0700
@@ -2049,6 +2049,7 @@ static struct sk_buff *sky2_receive(stru
struct sky2_port *sky2 = netdev_priv(dev);
struct rx_ring_info *re = sky2-rx_ring + sky2-rx_next;
struct sk_buff *skb = NULL;
+   u16 count;
 
if (unlikely(netif_msg_rx_status(sky2)))
printk(KERN_DEBUG PFX %s: rx slot %u status 0x%x len %d\n,
@@ -2063,7 +2064,13 @@ static struct sk_buff *sky2_receive(stru
if (!(status  GMR_FS_RX_OK))
goto resubmit;
 
-   if (status  16 != length)
+   count = (status  GMR_FS_LEN)  16;
+#ifdef SKY2_VLAN_TAG_USED
+   /* Account for vlan tag */
+   if (sky2-vlgrp  (status  GMR_FS_VLAN))
+   count -= VLAN_HLEN;
+#endif
+   if (count != length)
goto len_mismatch;
 
if (length  copybreak)

-- 
Stephen Hemminger [EMAIL PROTECTED]

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] sky2: reduce impact of watchdog timer

2007-09-28 Thread Stephen Hemminger
This is the 2.6.22 version of a regression fix that is already
in 2.6.23.  Change the watchdog timer from 10 per second all the time,
to 1 per second and only if interface is up.

Signed-off-by: Stephen Hemminger [EMAIL PROTECTED]


--- a/drivers/net/sky2.c2007-09-17 10:39:47.0 -0700
+++ b/drivers/net/sky2.c2007-09-28 09:13:38.0 -0700
@@ -96,10 +96,6 @@ static int disable_msi = 0;
 module_param(disable_msi, int, 0);
 MODULE_PARM_DESC(disable_msi, Disable Message Signaled Interrupt (MSI));
 
-static int idle_timeout = 100;
-module_param(idle_timeout, int, 0);
-MODULE_PARM_DESC(idle_timeout, Watchdog timer for lost interrupts (ms));
-
 static const struct pci_device_id sky2_id_table[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9000) }, /* SK-9Sxx */
{ PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9E00) }, /* SK-9Exx */
@@ -1693,6 +1689,8 @@ static void sky2_link_up(struct sky2_por
 
netif_carrier_on(sky2-netdev);
 
+   mod_timer(hw-watchdog_timer, jiffies + 1);
+
/* Turn on link LED */
sky2_write8(hw, SK_REG(port, LNK_LED_REG),
LINKLED_ON | LINKLED_BLINK_OFF | LINKLED_LINKSYNC_OFF);
@@ -2384,25 +2382,25 @@ static void sky2_le_error(struct sky2_hw
sky2_write32(hw, Q_ADDR(q, Q_CSR), BMU_CLR_IRQ_CHK);
 }
 
-/* If idle then force a fake soft NAPI poll once a second
- * to work around cases where sharing an edge triggered interrupt.
- */
-static inline void sky2_idle_start(struct sky2_hw *hw)
-{
-   if (idle_timeout  0)
-   mod_timer(hw-idle_timer,
- jiffies + msecs_to_jiffies(idle_timeout));
-}
-
-static void sky2_idle(unsigned long arg)
+/* Force a fake soft NAPI poll to handle lost IRQ's */
+static void sky2_watchdog(unsigned long arg)
 {
struct sky2_hw *hw = (struct sky2_hw *) arg;
struct net_device *dev = hw-dev[0];
+   int i, active = 0;
 
if (__netif_rx_schedule_prep(dev))
__netif_rx_schedule(dev);
 
-   mod_timer(hw-idle_timer, jiffies + msecs_to_jiffies(idle_timeout));
+   for (i = 0; i  hw-ports; i++) {
+   dev = hw-dev[i];
+   if (!netif_running(dev))
+   continue;
+   ++active;
+   }
+
+   if (active)
+   mod_timer(hw-watchdog_timer, round_jiffies(jiffies + HZ));
 }
 
 /* Hardware/software error handling */
@@ -2692,8 +2690,6 @@ static void sky2_restart(struct work_str
 
dev_dbg(hw-pdev-dev, restarting\n);
 
-   del_timer_sync(hw-idle_timer);
-
rtnl_lock();
sky2_write32(hw, B0_IMSK, 0);
sky2_read32(hw, B0_IMSK);
@@ -2722,8 +2718,6 @@ static void sky2_restart(struct work_str
}
}
 
-   sky2_idle_start(hw);
-
rtnl_unlock();
 }
 
@@ -3713,11 +3707,9 @@ static int __devinit sky2_probe(struct p
sky2_show_addr(dev1);
}
 
-   setup_timer(hw-idle_timer, sky2_idle, (unsigned long) hw);
+   setup_timer(hw-watchdog_timer, sky2_watchdog, (unsigned long) hw);
INIT_WORK(hw-restart_work, sky2_restart);
 
-   sky2_idle_start(hw);
-
pci_set_drvdata(pdev, hw);
 
return 0;
@@ -3752,7 +3744,7 @@ static void __devexit sky2_remove(struct
if (!hw)
return;
 
-   del_timer_sync(hw-idle_timer);
+   del_timer_sync(hw-watchdog_timer);
 
flush_scheduled_work();
 
@@ -3796,7 +3788,7 @@ static int sky2_suspend(struct pci_dev *
if (!hw)
return 0;
 
-   del_timer_sync(hw-idle_timer);
+   del_timer_sync(hw-watchdog_timer);
netif_poll_disable(hw-dev[0]);
 
for (i = 0; i  hw-ports; i++) {
@@ -3862,7 +3854,7 @@ static int sky2_resume(struct pci_dev *p
}
 
netif_poll_enable(hw-dev[0]);
-   sky2_idle_start(hw);
+
return 0;
 out:
dev_err(pdev-dev, resume failed (%d)\n, err);
@@ -3879,7 +3871,6 @@ static void sky2_shutdown(struct pci_dev
if (!hw)
return;
 
-   del_timer_sync(hw-idle_timer);
netif_poll_disable(hw-dev[0]);
 
for (i = 0; i  hw-ports; i++) {
--- a/drivers/net/sky2.h2007-08-21 10:59:11.0 -0700
+++ b/drivers/net/sky2.h2007-09-28 09:13:15.0 -0700
@@ -1921,7 +1921,7 @@ struct sky2_hw {
u32  st_idx;
dma_addr_t   st_dma;
 
-   struct timer_listidle_timer;
+   struct timer_listwatchdog_timer;
struct work_struct   restart_work;
int  msi;
wait_queue_head_tmsi_wait;

-- 
Stephen Hemminger [EMAIL PROTECTED]

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/3] sky2: patches for 2.6.22.y

2007-09-28 Thread Stephen Hemminger
Fixes for power regression, VLAN and resume problems.
These are all in 2.6.23.

-- 
Stephen Hemminger [EMAIL PROTECTED]

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] net: Add network namespace clone unshare support.

2007-09-28 Thread Cedric Le Goater
Eric W. Biederman wrote:
 David Miller [EMAIL PROTECTED] writes:
 
 Eric, pick an appropriate new non-conflicting number NOW.
 
 Done.  My apologies for the confusion.  I thought the
 way Cedric and the IBM guys were testing someone would have
 shouted at me long before now.

 This adds unnecessary extra work for Andrew Morton, which he has
 enough of already.
 
 Cedric made a good point that we will have conflicts of code
 being added to the same place in nsproxy.c and the like.  So
 I copied Andrew to give him a heads up.

here's a suggestion,

we could keep the net namespace unshare patch out of david's tree,
let andrew merge and release a new -mm and, then, send the net namespace 
unshare patch to andrew. that should keep nsproxy out of the andrew's 
merge challenge. But david's tree will miss the unshare part for a while.

As for the clone flags, the values *must not* conflict but the patches 
probably will.

C.

 I will gladly do what I can, to help.  Working against 3 trees
 development at the moment is a bit of a development challenge.
 
 Eric
 ___
 Containers mailing list
 [EMAIL PROTECTED]
 https://lists.linux-foundation.org/mailman/listinfo/containers
 

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PKT_SCHED]: Add stateless NAT

2007-09-28 Thread Evgeniy Polyakov
On Fri, Sep 28, 2007 at 12:19:19PM +0400, Evgeniy Polyakov ([EMAIL PROTECTED]) 
wrote:
  I still think the nf_*() prefixes should all go and the extern
  prototypes should go into an independent header file.
  
  These are not netfilter routines, they are INET helpers.
  
  And we should make similar treatment for all of the ipv6
  packet parser helper functions that ipv6 netfilter needs.
 
 Should netfilter still have own nf_ prefixed functions which will just
 call the same ones without prefix from inet header?

Kind of.

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 1dd075e..befdb82 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -291,9 +291,7 @@ extern int skb_make_writable(struct sk_buff **pskb, 
unsigned int writable_len);
 
 static inline void nf_csum_replace4(__sum16 *sum, __be32 from, __be32 to)
 {
-   __be32 diff[] = { ~from, to };
-
-   *sum = csum_fold(csum_partial((char *)diff, sizeof(diff), 
~csum_unfold(*sum)));
+   csum_replace(sum, from, to);
 }
 
 static inline void nf_csum_replace2(__sum16 *sum, __be16 from, __be16 to)
@@ -301,13 +299,17 @@ static inline void nf_csum_replace2(__sum16 *sum, __be16 
from, __be16 to)
nf_csum_replace4(sum, (__force __be32)from, (__force __be32)to);
 }
 
-extern void nf_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
- __be32 from, __be32 to, int pseudohdr);
+static inline void nf_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
+   __be32 from, __be32 to, int pseudohdr)
+{
+   proto_csum_replace(sum, skb, (__force __be32)from,
+   (__force __be32)to, pseudohdr);
+}
 
 static inline void nf_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb,
  __be16 from, __be16 to, int pseudohdr)
 {
-   nf_proto_csum_replace4(sum, skb, (__force __be32)from,
+   proto_csum_replace(sum, skb, (__force __be32)from,
(__force __be32)to, pseudohdr);
 }
 
diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 66ca8e3..e0561ea 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -72,8 +72,11 @@ enum nf_ip6_hook_priorities {
 
 #ifdef CONFIG_NETFILTER
 extern int ip6_route_me_harder(struct sk_buff *skb);
-extern __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
-   unsigned int dataoff, u_int8_t protocol);
+static inline __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
+   unsigned int dataoff, u_int8_t protocol)
+{
+   return ip6_checksum(skb, hook, dataoff, protocol);
+}
 
 extern int ipv6_netfilter_init(void);
 extern void ipv6_netfilter_fini(void);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a656cec..95ad5af 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1788,5 +1788,11 @@ static inline void skb_forward_csum(struct sk_buff *skb)
skb-ip_summed = CHECKSUM_NONE;
 }
 
+extern void proto_csum_replace(__sum16 *sum, struct sk_buff *skb,
+ __be32 from, __be32 to, int pseudohdr);
+
+extern __sum16 ip6_checksum(struct sk_buff *skb, unsigned int hook,
+unsigned int dataoff, u_int8_t protocol);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SKBUFF_H */
diff --git a/include/net/checksum.h b/include/net/checksum.h
index 1242461..8602189 100644
--- a/include/net/checksum.h
+++ b/include/net/checksum.h
@@ -93,4 +93,12 @@ static inline __wsum csum_unfold(__sum16 n)
 }
 
 #define CSUM_MANGLED_0 ((__force __sum16)0x)
+
+static inline void csum_replace(__sum16 *sum, __be32 from, __be32 to)
+{
+   __be32 diff[] = { ~from, to };
+
+   *sum = csum_fold(csum_partial((char *)diff, sizeof(diff), 
~csum_unfold(*sum)));
+}
+
 #endif
diff --git a/net/core/utils.c b/net/core/utils.c
index 0bf17da..17576c8 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -293,3 +293,54 @@ out:
 }
 
 EXPORT_SYMBOL(in6_pton);
+
+void proto_csum_replace(__sum16 *sum, struct sk_buff *skb,
+   __be32 from, __be32 to, int pseudohdr)
+{
+   __be32 diff[] = { ~from, to };
+   if (skb-ip_summed != CHECKSUM_PARTIAL) {
+   *sum = csum_fold(csum_partial(diff, sizeof(diff),
+   ~csum_unfold(*sum)));
+   if (skb-ip_summed == CHECKSUM_COMPLETE  pseudohdr)
+   skb-csum = ~csum_partial(diff, sizeof(diff),
+   ~skb-csum);
+   } else if (pseudohdr)
+   *sum = ~csum_fold(csum_partial(diff, sizeof(diff),
+   csum_unfold(*sum)));
+}
+EXPORT_SYMBOL(proto_csum_replace);
+
+__sum16 ip6_checksum(struct sk_buff *skb, unsigned int hook,
+unsigned int dataoff, u_int8_t protocol)
+{
+   

Re: [RFC/PATCH 3/3] UDP memory usage accounting (take 2): measurement

2007-09-28 Thread Evgeniy Polyakov
On Fri, Sep 28, 2007 at 10:41:31PM +0900, Satoshi OSHIMA ([EMAIL PROTECTED]) 
wrote:
 This patch introduces memory usage measurement for UDP.
 
 These 3 points were updated.
 
 - UDP specific codes in IP layer were removed.
 
 - atomic_sub() in a loop was removed
 
 - accounting during socket destruction

Another approach is to account only at the highest UDP layer and having
datagram skb destructor just like it is done in TCP, but this approach
is also reasonable.

I already said that patches 1 and 3 have broken indentation; please fix that.

A hint: when you are about to submit something network related for inclusion,
and strongly believe it is ready, it can be a not that bad idea to add 
David Miller [EMAIL PROTECTED] to the copy list; he can complain about
backlog and so on, but will read your mail twice :) but do not tell anyone.

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.23-rc8 network problem. Mem leak? ip1000a?

2007-09-28 Thread Andrew Morton
On 27 Sep 2007 22:06:17 -0400 [EMAIL PROTECTED] wrote:

 Uniprocessor Athlon 64, 64-bit kernel, 2G ECC RAM,
 2.6.23-rc8 + linuxpps (5.0.0) + ip1000a driver.
 (patch from http://marc.info/?l=linux-netdev&m=118980588419882)
 
 After a few hours of operation, ntp loses the ability to send packets.
 sendto() returns -EAGAIN to everything, including the 24-byte UDP packet
 that is a response to ntpq.
 
 ...

 Killing and restarting ntpd gets it running again for a few hours.
 Here's after about two hours of successful operation.  (I'll try to
 remember to run slabinfo before killing ntpd next time.)

ntpd.  Sounds like pps leaking to me.

 
 Can anyone offer some diagnosis advice?
 

CONFIG_DEBUG_SLAB_LEAK?
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/7] CAN: Add PF_CAN core module

2007-09-28 Thread Thomas Gleixner
On Tue, 2007-09-25 at 14:09 -0700, David Miller wrote:
   Then please make all exported symbols marked EXPORT_SYMBOL_GPL to make
   sure that the other CAN protocol can not reuse your infrastructure.
  
  We don't want to force other CAN protocol implementations to be GPL
  also.  AFAIR from discussions on LKML, it was mostly agreed upon that
  this decision is up to the authors of code.
 
 To a certain extent, yes.
 
 However, the core issue is whether anyone who uses the symbol
 is creating a derivative work.  If it is pretty clear that this
 is the case, you really should mark the exported symbols GPL.
 
 In my opinion, in this case it is pretty clear that any use of
 these new symbols would be a derivative work and therefore they
 all should be marked GPL.

Hmm, the code in question is dual licensed. So it's not that clear to
me. 

But it's a legal grey area anyway if somebody loads non GPL code into
the kernel, though IANAL and we can spend years on this discussion.

I'm not inclined either way and we really should not make this a
religious question whether that code goes in or not, especially not when
we granted the mac80211 to export everything w/o _GPL suffix not too
long ago.

Thanks,

tglx


-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [stable] Upgrading 2.6.21.7-2.6.22.9 kills my network (sky2): sky2 eth0: rx error, status 0x402300 length 60

2007-09-28 Thread Greg KH
On Fri, Sep 28, 2007 at 01:11:27PM +0200, Krzysztof Oledzki wrote:


 On Fri, 28 Sep 2007, Krzysztof Oledzki wrote:



 On Fri, 28 Sep 2007, Krzysztof Oledzki wrote:

 Hello,
 After upgrading my kernel from 2.6.21.7 to 2.6.22.9 my 88E8053 no longer 
 works:

 Small update: 2.6.22.9 with sky2.c/sky2.h from 2.4.22.4 works without any 
 problems.

 Final update.

 Reverting this patch: 
 http://git.kernel.org/?p=linux/kernel/git/stable/linux-2.6.22.y.git;a=commitdiff_plain;h=8c07a8e30ba8a2e0831da4b134202598435f8358
 solved my problem.

 I also found this one:

 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=d6532232cd3de79c852685823a9c52f723816d0a

 Could it go to a next -stable ASAP, please? It seems that 2.6.22.5-2.6.22.9 
 kernels have broken sky2 if used with vlans. :( Such regression in a 
 -stable kernel isn't nice. :(

So should we just apply the second patch?  I'll let Stephen tell us what
we should do :)

thanks,

greg k-h
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: UDP Out 0f Sequence

2007-09-28 Thread Randy Macleod

Rick Jones wrote:

Majumder, Rajib wrote:

Let's say we have 2 uniprocessor hosts connected back to back. Is
there any possibility of an out-of-order scenario on recv? 


Your application should be written on the assumption that it is 
possible, regardless of the specifics of the hosts involved, however 
unlikely they may be to reorder traffic.


  Is this same for all kernel (linux/solaris)?

Your application should be written on the assumption that it is possible, 
regardless of the specifics of the OSes involved, however unlikely they 
may be to reorder traffic.


Or you should use a different protocol stack that guarantees order.
Have you considered TCP, DCCP or even TIPC?
The solution depends on your application requirements and your
project's tolerance for risk.

// Randy

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


/proc/net/ bad hard links count [Was: 2.6.23-rc8-mm2]

2007-09-28 Thread Jiri Slaby
On 09/27/2007 11:22 AM, Andrew Morton wrote:
 ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.23-rc8/2.6.23-rc8-mm2/

# find /proc > /dev/null
find: WARNING: Hard link count is wrong for /proc/net: this may be a bug in your
filesystem driver.  Automatically turning on find's -noleaf option.  Earlier
results may have failed to include directories that should have been searched.
# stat net
  File: `net'
  Size: 0   Blocks: 0  IO Block: 1024   directory
Device: 3h/3d   Inode: 4026531864  Links: 2
Access: (0555/dr-xr-xr-x)  Uid: (0/root)   Gid: (0/root)
Access: 2007-09-28 18:21:24.651209759 +0200
Modify: 2007-09-28 18:21:24.651209759 +0200
Change: 2007-09-28 18:21:24.651209759 +0200
# stat net/
  File: `net/'
  Size: 0   Blocks: 0  IO Block: 1024   directory
Device: 3h/3d   Inode: 4026531909  Links: 4
Access: (0555/dr-xr-xr-x)  Uid: (0/root)   Gid: (0/root)
Access: 2007-09-28 18:26:48.813048220 +0200
Modify: 2007-09-28 18:26:48.813048220 +0200
Change: 2007-09-28 18:26:48.813048220 +0200

hmm, this is some kind of weirdness :)

regards,
-- 
Jiri Slaby ([EMAIL PROTECTED])
Faculty of Informatics, Masaryk University

-- 
Jiri Slaby ([EMAIL PROTECTED])
Faculty of Informatics, Masaryk University
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Fwd: [PATCH#2 3/4] [PPC] Compile fix for 8xx CPM Ehernet driver

2007-09-28 Thread Kumar Gala

Begin forwarded message:


From: Jochen Friedrich [EMAIL PROTECTED]
Date: September 24, 2007 12:15:35 PM CDT
To: [EMAIL PROTECTED]
Cc: [EMAIL PROTECTED], Marcelo Tosatti [EMAIL PROTECTED]
Subject: [PATCH#2 3/4] [PPC] Compile fix for 8xx CPM Ehernet driver


Jeff,

Please pick up for 2.6.23 if you don't mind.

- k




Add #include <asm/cacheflush.h> for flush_dcache_range
to make the driver compile again.

 CC  arch/ppc/8xx_io/enet.o
arch/ppc/8xx_io/enet.c: In function 'scc_enet_start_xmit':
arch/ppc/8xx_io/enet.c:240: error: implicit declaration of function
'flush_dcache_range'
make[1]: *** [arch/ppc/8xx_io/enet.o] Error 1
make: *** [arch/ppc/8xx_io] Error 2

Signed-off-by: Jochen Friedrich [EMAIL PROTECTED]
---
arch/ppc/8xx_io/enet.c |1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/arch/ppc/8xx_io/enet.c b/arch/ppc/8xx_io/enet.c
index 703d47e..eace3bc 100644
--- a/arch/ppc/8xx_io/enet.c
+++ b/arch/ppc/8xx_io/enet.c
@@ -44,6 +44,7 @@
 #include asm/mpc8xx.h
 #include asm/uaccess.h
 #include asm/commproc.h
+#include asm/cacheflush.h

 /*
  * Theory of Operation



-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH]: Second preliminary release of Sun Neptune driver

2007-09-28 Thread Matheos Worku

David Miller wrote:

Ok, this is rev2, changes:

1) Jumbo MTU support is now present.  I got stuck on this for
   a while because I didn't realize that resetting the RX
   XMAC would reset the XMAC_MAX register.  I thought that
   was a TX XMAC resource, ho hum...

   Fix this by re-initializing the TX MAC after resetting
   the RX MAC.

2) Implementing support for jumbograms required a re-examination of
   how TX queue handling was done.

   Neptune can report the TX Head at a location which is in the middle
   of a packet's group of descriptors for a multi segment packet.
   Working around this would just overly complicate the code.

   We don't need to use the TX Head register.  Instead, track the
   pkt_cnt field of the TX_CS register.  The difference since the
   last reading is the number of full TX frames we can reclaim from
   the ring.

   This is not only simpler, it allows us to only need one MMIO
   access (for TX_CS) during a reclaim run instead of two (TX_CS
   and TX_RING_HDL).

   A side note, I'm purposefully not using TX mailbox support.  It's
   broken in Neptune.  If you get the registers in the DMA mailbox, MK
   and MMK are always both set in the TX_CS register.  So even if you
   write the MK and MB bits back to TX_CS to clear the interrupt, the
   MMK propagates to MK and you thus get another interrupt.  This
   basically makes TX DMA mailbox support useless.

4) All register write accessors are now of the form xxx(reg, val)
   instead of xxx(val, reg)

5) Kill PCI_DEVICE_ID_SUN_NEPTUNE define, unneeded.

Performance isn't the best, but the driver is reasonably solid.  After
I flesh out the remaining features that need to be implemented I'll
take a closer look at that.

Enjoy.

commit 684a7c25fff607dc647f065761bb381f28bddbdb
Author: David S. Miller [EMAIL PROTECTED]
Date:   Thu Sep 27 21:43:57 2007 -0700

[NIU]: Add Sun Neptune ethernet driver.

Signed-off-by: David S. Miller [EMAIL PROTECTED]


diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 467532c..3c94c8f 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2601,6 +2601,13 @@ config NETXEN_NIC
help
  This enables the support for NetXen's Gigabit Ethernet card.
 
+config NIU

+   tristate Sun Neptune 10Gbit Ethernet support
+   depends on PCI
+   help
+ This enables support for cards based upon Sun's
+ Neptune chipset.
+
 config PASEMI_MAC
tristate PA Semi 1/10Gbit MAC
depends on PPC64  PCI
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 6220c50..cce379b 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -242,3 +242,4 @@ obj-$(CONFIG_NETCONSOLE) += netconsole.o
 obj-$(CONFIG_FS_ENET) += fs_enet/
 
 obj-$(CONFIG_NETXEN_NIC) += netxen/

+obj-$(CONFIG_NIU) += niu.o
diff --git a/drivers/net/niu.c b/drivers/net/niu.c
new file mode 100644
index 000..5b89559
--- /dev/null
+++ b/drivers/net/niu.c
  


Dave,
Couple of comments on Jumbo support

TX Side:
Can we rule out fragment size > MAX_TX_DESC_LEN? If that is not the case, 
then the frags may need the same tx post and reclaim logic as the skb->data


RX Side
   Since the MAC is set not to strip FCS bytes, the last page could 
contain just 1 - 4 bytes of FCS. Not only does this waste the page, but it may 
result in a buggy RX process where skb->len is wrong and a page with junk is 
passed to the stack. In our out-of-tree driver, we check this condition 
and repost the last page back to the rbr, if it contains just the FCS bytes.


Regards,
Matheos

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.23-rc8: cannot make netconsole work

2007-09-28 Thread Andrey Borzenkov
On Friday 28 September 2007, Matt Mackall wrote:
 On Fri, Sep 28, 2007 at 01:27:55PM +0400, Andrey Borzenkov wrote:
[...]
  sudo modprobe netconsole netconsole=@/eth0,@/
 
[...]
 What is your console log level set to? If the messages don't come out
 on the local console, they won't get sent out the network either.
 Fedora at least defaults to hiding most messages. Adding 'debug' to
 your kernel command line will change that.


Well, that was one thing; but for whatever reason it would refuse to work 
without explicit MAC of recipient. Adding it made it (but it was of no help 
to my original issue ... well, that is another story ...)

Thank you

-andrey


signature.asc
Description: This is a digitally signed message part.


[RFC/PATCH 0/3] UDP memory usage accounting(take 2)

2007-09-28 Thread Satoshi OSHIMA
This patch set tries to introduce memory usage accounting for 

UDP(currently ipv4 only).


3 points are improved along with some feedback.


(a) to improve scalability, avoiding atomic_*()s as much as

possible

(b) avoiding UDP specific code in IP layer

(c) supporting socket destruction accounting


To implement (b), there is a side effect which affects

accounting on TCP sockets. If you find a good solution

to avoid this side effect, please let me know.


Unfortunately, I don't have any NIC with UFO.

So this patch set is not tested with UFO supported

device.


This patch set is for 2.6.23-rc8.


I appreciate your comment/test/feedback.


Satoshi Oshima

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [stable] Upgrading 2.6.21.7-2.6.22.9 kills my network (sky2): sky2 eth0: rx error, status 0x402300 length 60

2007-09-28 Thread Krzysztof Oledzki



On Fri, 28 Sep 2007, Greg KH wrote:


On Fri, Sep 28, 2007 at 01:11:27PM +0200, Krzysztof Oledzki wrote:



On Fri, 28 Sep 2007, Krzysztof Oledzki wrote:




On Fri, 28 Sep 2007, Krzysztof Oledzki wrote:


Hello,
After upgrading my kernel from 2.6.21.7 to 2.6.22.9 my 88E8053 no longer
works:


Small update: 2.6.22.9 with sky2.c/sky2.h from 2.4.22.4 works without any
problems.


Final update.

Reverting this patch:
http://git.kernel.org/?p=linux/kernel/git/stable/linux-2.6.22.y.git;a=commitdiff_plain;h=8c07a8e30ba8a2e0831da4b134202598435f8358
solved my problem.

I also found this one:

http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff_plain;h=d6532232cd3de79c852685823a9c52f723816d0a

Could it go to a next -stable ASAP, please? It seems that 2.6.22.5-2.6.22.9
kernels have broken sky2 if used with vlans. :( Such regression in a
-stable kernel isn't nice. :(


So should we just apply the second patch?  I'll let Stephen tell us what
we should do :)



Second patch works for me, so IMHO yes. Forgot to mention that earlier, 
sorry. Of course this should be the maintainer's decision, this is only my 
vote. :)

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] sb1250-mac: Driver model phylib update

2007-09-28 Thread Andrew Morton
On Fri, 28 Sep 2007 17:23:00 +0100 (BST) Maciej W. Rozycki [EMAIL 
PROTECTED] wrote:

 On Mon, 24 Sep 2007, Andrew Morton wrote:
 
Well, this is against Jeff's netdev-2.6 tree which hopefully is not as 
   crufty as Linus's old mainline; if it is not possible to queue this 
   change 
   for 2.6.25 or suchlike, then I will try to resubmit later.
  
  Most of Jeff's netdev tree got dumped into Dave's net-2.6.24 tree.  That's
  the one you want to be raising patches against for the next few weeks.
 
  OK, thanks for clarification.  Then both patches already submitted:
 
 patch-netdev-2.6.23-rc6-20070920-sb1250-mac-typedef-9
 patch-netdev-2.6.23-rc6-20070920-sb1250-mac-29
 
 apply cleanly to net-2.6.24 one on top of the other in this order.

checks the netdev archives

hm, I found a patch at the end of an email trail which is datestamped Sep
20 here which appears to match the first one you mentioned, but I'm having
trouble working out what patch subject your sb1250-mac maps onto.

This is why I make the patch filenames map directly from the patch titles,
so I end up with files like
optimize-x86-page-faults-like-all-other-achitectures-and-kill-notifier-cruft.patch.
 Verbose, but it reduces confusion and mistakes.

  I can resubmit them

That's always a good choice.  Patches which are dangling at the end of an email
discussion often don't get merged: it is unclear to the receiving party that
the discussion has terminated, and I'm never terribly confident in the testing
level of a patch which obviously got modified two minutes before it was sent.

 -- where?  netdev?  As I say I am fine with 2.6.25 as 
 the target.

jeff, netdev, me?  
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] sky2: fix VLAN receive processing

2007-09-28 Thread Krzysztof Oledzki



On Fri, 28 Sep 2007, Stephen Hemminger wrote:


The length check for truncated frames was not correctly handling
the case where VLAN acceleration had already read the tag.
Also, the Yukon EX has some features that use high bit of status
as security tag.



Thank you.



Best regards

Krzysztof Oledzki
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/7] CAN: Add virtual CAN netdevice driver

2007-09-28 Thread Oliver Hartkopp
Eric W. Biederman wrote:
 Oliver Hartkopp [EMAIL PROTECTED] writes:  
   
 The CAN protocol family is some kind of a closed ecosystem with a
 complete different addressing scheme that uses the bare networking
 functionality of the Linux Kernel as well as DECNET or ARCNET. You would
 never be able to run the IP-stack on a CAN netdev (ARPHRD_CAN,
 ETH_P_CAN) due to several technical differences in addressing, etc.
 

 However when register_netdev is the netdev_notifier chain is called
 with NETDEV_REGISTER

 So then we enter code paths such as net/ipv4/inetdev_event() and
 process the network device.  There is some small amount of treatment
 given to devices that have IFF_LOOPBACK set.
   

Yes. That's a good point. In the IPv4 NETDEV_REGISTER case the treatment
ends at the point, when (dev->mtu < 68) is checked as the CAN MTU is 16.
But we also had a problem with the IPv6 NETDEV_REGISTER in addr_conf
(fixed in 2.6.21) that complained about an interface with a too small MTU.

 The core point being that CAN devices as currently constructed are not
 in a closed ecosystem.  Other networking layers see them even if they
 cannot use them properly.

 I don't know what all of the implications are but I do know we
 need to be careful.
   

ACK.


 Hmm. My gut feel says that we just want function your drivers can
 call post transmit that will call dev_queue_xmit_nit if someone else
 is watching.  Although calling netif_rx seems to work but at least
 in the general case with routing I would be concerned that would get
 you into packets that would get routed out the incoming interface
 and cause a loop.
   

All incoming (and even the 'echoed') CAN frames are processed in
can_rcv() to check the various CAN filters the users may have registered
through their different open AF_CAN sockets. This is a one way receive
path, that leads simply to a kfree_skb() if there's no subscriber for
the received frame. So there is no way to send CAN frames to a CAN
interface when it is not explicitly triggered from the user. CAN has no
routing by design. After looking into dev_queue_xmit_nit() it looks like
netif_rx() was a good choice.


 Further I still don't see any mechanism that isolates CAN netdrivers
 from the other protocol layers.

 What makes this problem a little stronger is that I have been
 simplifying some of the tests in some of the other networking layers
 from being if (dev == loopback_dev) to if (dev->flags &
 IFF_LOOPBACK) So there are fewer special cases I need to deal with.

 I believe that if you now use your current CAN patches unless I have
 misread something your can devices will now show up with ip 127.0.0.1
   

No they don't due to the described mtu size verification. BUT i now get
an idea, why IFF_LOOPBACK was not that good approach ;-)

   
 I don't have any concerns creating a new IFF_-flag for this loopback
 approved by CSMA/CA media access i just have no idea for a really good
 name for it. But maybe the use of IFF_LOOPBACK for CAN netdrivers
 (ARPHRD_CAN) is also ok for you now?!?
 

 Serial devices tend to call this echo or local echo, so how about
 IFF_ECHO.
   

Excellent suggestion! If there are no remarks from other people, we can
change this in our next posting and add a new IFF_ECHO to if.h. Indeed
it's worth to look on the currently defined CAN RAW sockopts, the CAN
source and the docs to globally replace the 'loopback' with 'echo'. The
'local echo' hits the point really much better than to describe the
'loopback'-mechanic that's behind.

Thanks very much for your feedback & your time to review our stuff!

Oliver

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.23-rc8-mm2 - tcp_fastretrans_alert() WARNING

2007-09-28 Thread Ilpo Järvinen
On Fri, 28 Sep 2007, Cedric Le Goater wrote:

 Hello ! 
 
 Andrew Morton wrote:
  ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.23-rc8/2.6.23-rc8-mm2/
 
 I just found that warning in my logs. It seems that it's been 
 happening since rc7-mm1 at least. 
 
 Thanks !
 
 C.
 
 WARNING: at /home/legoater/linux/2.6.23-rc8-mm2/net/ipv4/tcp_input.c:2314 
 tcp_fastretrans_alert()

 Call Trace:
  IRQ  [8040fdc3] tcp_ack+0xcd6/0x1894
 ...snip...

...Thanks for the report, I'll have a look at what could still break 
fackets_out...

-- 
 i.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [stable] [PATCH 2/3] sky2: fix VLAN receive processing

2007-09-28 Thread Chris Wright
* Stephen Hemminger ([EMAIL PROTECTED]) wrote:
 The length check for truncated frames was not correctly handling
 the case where VLAN acceleration had already read the tag.
 Also, the Yukon EX has some features that use high bit of status
 as security tag.

Did you leave out the GMR_FS_LEN change on purpose?  AFAICT, w/out
that you miss the Yukon EX high bit usage.  The upstream patch applies,
can we simply use that one (below rediffed for stable)?

thanks,
-chris
--

From d6532232cd3de79c852685823a9c52f723816d0a Mon Sep 17 00:00:00 2001
From: Stephen Hemminger [EMAIL PROTECTED]
Date: Wed, 19 Sep 2007 15:36:42 -0700
Subject: sky2: fix VLAN receive processing (resend)

The length check for truncated frames was not correctly handling
the case where VLAN acceleration had already read the tag.
Also, the Yukon EX has some features that use high bit of status
as security tag.

Signed-off-by: Pierre-Yves Ritschard [EMAIL PROTECTED]
Signed-off-by: Stephen Hemminger [EMAIL PROTECTED]
Signed-off-by: Jeff Garzik [EMAIL PROTECTED]
---
 drivers/net/sky2.c |   14 +-
 drivers/net/sky2.h |2 +-
 2 files changed, 14 insertions(+), 2 deletions(-)

--- linux-2.6.22.9.orig/drivers/net/sky2.c
+++ linux-2.6.22.9/drivers/net/sky2.c
@@ -2049,6 +2049,13 @@ static struct sk_buff *sky2_receive(stru
struct sky2_port *sky2 = netdev_priv(dev);
struct rx_ring_info *re = sky2-rx_ring + sky2-rx_next;
struct sk_buff *skb = NULL;
+   u16 count = (status  GMR_FS_LEN)  16;
+
+#ifdef SKY2_VLAN_TAG_USED
+   /* Account for vlan tag */
+   if (sky2-vlgrp  (status  GMR_FS_VLAN))
+   count -= VLAN_HLEN;
+#endif
 
if (unlikely(netif_msg_rx_status(sky2)))
printk(KERN_DEBUG PFX %s: rx slot %u status 0x%x len %d\n,
@@ -2063,7 +2070,8 @@ static struct sk_buff *sky2_receive(stru
if (!(status  GMR_FS_RX_OK))
goto resubmit;
 
-   if (status  16 != length)
+   /* if length reported by DMA does not match PHY, packet was truncated */
+   if (length != count)
goto len_mismatch;
 
if (length  copybreak)
@@ -2079,6 +2087,10 @@ len_mismatch:
/* Truncation of overlength packets
   causes PHY length to not match MAC length */
++sky2-net_stats.rx_length_errors;
+   if (netif_msg_rx_err(sky2)  net_ratelimit())
+   pr_info(PFX %s: rx length mismatch: length %d status %#x\n,
+   dev-name, length, status);
+   goto resubmit;
 
 error:
++sky2-net_stats.rx_errors;
--- linux-2.6.22.9.orig/drivers/net/sky2.h
+++ linux-2.6.22.9/drivers/net/sky2.h
@@ -1579,7 +1579,7 @@ enum {
 
 /* Receive Frame Status Encoding */
 enum {
-   GMR_FS_LEN  = 0x16, /* Bit 31..16:Rx Frame Length */
+   GMR_FS_LEN  = 0x7fff16, /* Bit 30..16:Rx Frame Length */
GMR_FS_VLAN = 113, /* VLAN Packet */
GMR_FS_JABBER   = 112, /* Jabber Packet */
GMR_FS_UN_SIZE  = 111, /* Undersize Packet */
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [stable] [PATCH 2/3] sky2: fix VLAN receive processing

2007-09-28 Thread Stephen Hemminger
On Fri, 28 Sep 2007 12:20:44 -0700
Chris Wright [EMAIL PROTECTED] wrote:

 * Stephen Hemminger ([EMAIL PROTECTED]) wrote:
  The length check for truncated frames was not correctly handling
  the case where VLAN acceleration had already read the tag.
  Also, the Yukon EX has some features that use high bit of status
  as security tag.
 
 Did you leave out the GMR_FS_LEN change on purpose?  AFAICT, w/out
 that you miss the Yukon EX high bit usage.  The upstream patch applies,
 can we simply use that one (below rediffed for stable)?
 
 thanks,
 -chris


I left it out on purpose because 2.6.22 doesn't support Yukon EX.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ofa-general] [PATCH v3] iw_cxgb3: Supportiwarp-onlyinterfacesto avoid 4-tuple conflicts.

2007-09-28 Thread Steve Wise



Kanevsky, Arkady wrote:

Sean,
IB aside,
it looks like an ULP which is capable of being both RDMA aware and RDMA
not-aware,
like iSER and iSCSI, NFS-RDMA and NFS, SDP and sockets, 
will be treated as two separate ULPs.

Each has its own IP address, since there is a different IP address for
iWARP
port and regular Ethernet port. So it falls on the users of ULPs to
handle it
via DNS or some other services.
Is this acceptable to users? I doubt it.

Recall that ULPs are going in opposite directions by having a different
port number for RDMA aware and RDMA unaware versions of the ULP.
This way, ULP connection manager handles RDMA-ness under the covers,
while users plug an IP address for a server to connect to.
Thanks,


Arkady, I'm confused about how this proposed design changes the behavior 
of the ULPs that run on TCP and iWARP.  I don't see much difference from 
the point of view of the ULPs.


The NFS-RDMA server, for example, will not need to change since it binds 
to address 0.0.0.0 which will translate into a bind/listen on the 
specific iwarp address for each iwarp device on the rdma side, and 
address 0.0.0.0 for the TCP side.


Am I missing your point?

The real pain, IMO, with this solution is that it FORCES the admins to 
use 2 subnets when 1 is sufficient if the net maintainers would unify 
the port space...


Steve.





Arkady Kanevsky   email: [EMAIL PROTECTED]
Network Appliance Inc.   phone: 781-768-5395
1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195
Waltham, MA 02451   central phone: 781-768-5300
 


-Original Message-
From: Sean Hefty [mailto:[EMAIL PROTECTED] 
Sent: Thursday, September 27, 2007 3:12 PM

To: Kanevsky, Arkady; Sean Hefty; Steve Wise
Cc: netdev@vger.kernel.org; [EMAIL PROTECTED]; 
[EMAIL PROTECTED]; [EMAIL PROTECTED]
Subject: RE: [ofa-general] [PATCH v3] iw_cxgb3: 
Supportiwarp-onlyinterfacesto avoid 4-tuple conflicts.


What is the model on how client connects, say for iSCSI, when client 
and server both support, iWARP and 10GbE or 1GbE, and would like to 
setup most performant connection for ULP?
For the most performance connection, the ULP would use IB, 
and all these problems go away.  :)


This proposal is for each iwarp interface to have its own IP 
address.  Clients would need an iwarp usable address of the 
server and would connect using rdma_connect().  If that call 
(or rdma_resolve_addr/route) fails, the client could try 
connecting using sockets, aoi, or some other interface.  I 
don't see that Steve's proposal changes anything from the 
client's perspective.


- Sean
___
general mailing list
[EMAIL PROTECTED]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit 
http://openib.org/mailman/listinfo/openib-general



-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


2.6.23-rc[68]-mm: network hangs

2007-09-28 Thread Laurent Riffard
Hi,

From time to time, I experience some complete network hangs:

Suddenly, all network connections become unresponsive. Even ping
127.0.0.1 doesn't work. SysRq-w does not show any blocked processes.

When such a hang happens, I have to reboot (shutdown does work).

This is not easily reproducible: it happens several minutes after 
boot (could be 45 minutes or 2 hours). I do not use heavy networking 
apps (like P2P). My typical usage is a Gnome desktop with  browser, 
mailer, IM, video or audio streaming. 

I have a single PC connected to a DSL router via ethernet (so no LAN 
with NFS or CIFS).

This happens with 2.6.23-rc8-mm2 and 2.6.23-rc6-mm1. I can't remember 
when I first saw this problem. Maybe 2 months ago.

I attached the output of strace ping 127.0.0.1. How can I collect 
some more data when this problem happens ?

~~
laurent
execve(/bin/ping, [ping, 127.0.0.1], [/* 42 vars */]) = 0
brk(0)  = 0x93ac000
access(/etc/ld.so.nohwcap, F_OK)  = -1 ENOENT (No such file or directory)
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0xb7f9b000
access(/etc/ld.so.preload, R_OK)  = -1 ENOENT (No such file or directory)
open(/etc/ld.so.cache, O_RDONLY)  = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=57464, ...}) = 0
mmap2(NULL, 57464, PROT_READ, MAP_PRIVATE, 3, 0) = 0xb7f8c000
close(3)= 0
access(/etc/ld.so.nohwcap, F_OK)  = -1 ENOENT (No such file or directory)
open(/lib/tls/i686/cmov/libresolv.so.2, O_RDONLY) = 3
read(3, \177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\0!\0\000..., 512) = 
512
fstat64(3, {st_mode=S_IFREG|0644, st_size=67408, ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0xb7f8b000
mmap2(NULL, 75976, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 
0xb7f78000
mmap2(0xb7f87000, 8192, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0xf) = 0xb7f87000
mmap2(0xb7f89000, 6344, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0xb7f89000
close(3)= 0
access(/etc/ld.so.nohwcap, F_OK)  = -1 ENOENT (No such file or directory)
open(/lib/tls/i686/cmov/libc.so.6, O_RDONLY) = 3
read(3, \177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\0`\1\000..., 512) = 
512
fstat64(3, {st_mode=S_IFREG|0644, st_size=1307104, ...}) = 0
mmap2(NULL, 1312164, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 
0xb7e37000
mmap2(0xb7f72000, 12288, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x13b) = 0xb7f72000
mmap2(0xb7f75000, 9636, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0xb7f75000
close(3)= 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0xb7e36000
set_thread_area({entry_number:-1 - 6, base_addr:0xb7e366c0, limit:1048575, 
seg_32bit:1, contents:0, read_exec_only:0, limit_in_pages:1, seg_not_present:0, 
useable:1}) = 0
mprotect(0xb7f72000, 4096, PROT_READ)   = 0
munmap(0xb7f8c000, 57464)   = 0
socket(PF_INET, SOCK_RAW, IPPROTO_ICMP) = 3
getuid32()  = 0
setuid32(0) = 0
socket(PF_INET, SOCK_DGRAM, IPPROTO_IP) = 4
connect(4, {sa_family=AF_INET, sin_port=htons(1025), 
sin_addr=inet_addr(127.0.0.1)}, 16) = 0
getsockname(4, {sa_family=AF_INET, sin_port=htons(44103), 
sin_addr=inet_addr(127.0.0.1)}, [16]) = 0
close(4)= 0
setsockopt(3, SOL_RAW, ICMP_FILTER, 
~(ICMP_ECHOREPLY|ICMP_DEST_UNREACH|ICMP_SOURCE_QUENCH|ICMP_REDIRECT|ICMP_TIME_EXCEEDED|ICMP_PARAMETERPROB),
 4) = 0
setsockopt(3, SOL_IP, IP_RECVERR, [1], 4) = 0
setsockopt(3, SOL_SOCKET, SO_SNDBUF, [324], 4) = 0
setsockopt(3, SOL_SOCKET, SO_RCVBUF, [65536], 4) = 0
getsockopt(3, SOL_SOCKET, SO_RCVBUF, [131072], [4]) = 0
brk(0)  = 0x93ac000
brk(0x93cd000)  = 0x93cd000
fstat64(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 0), ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0xb7f9a000
write(1, PING 127.0.0.1 (127.0.0.1) 56(84..., 49) = 49
setsockopt(3, SOL_SOCKET, SO_TIMESTAMP, [1], 4) = 0
setsockopt(3, SOL_SOCKET, SO_SNDTIMEO, \1\0\0\0\0\0\0\0, 8) = 0
setsockopt(3, SOL_SOCKET, SO_RCVTIMEO, \1\0\0\0\0\0\0\0, 8) = 0
getpid()= 9353
rt_sigaction(SIGINT, {0x804b5b0, [], SA_INTERRUPT}, NULL, 8) = 0
rt_sigaction(SIGALRM, {0x804b5b0, [], SA_INTERRUPT}, NULL, 8) = 0
rt_sigaction(SIGQUIT, {0x804b5c0, [], SA_INTERRUPT}, NULL, 8) = 0
gettimeofday({1190968694, 793502}, NULL) = 0
ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, {B38400 opost isig icanon echo ...}) = 0
ioctl(1, TIOCGWINSZ, {ws_row=52, ws_col=157, ws_xpixel=0, ws_ypixel=0}) = 0
gettimeofday({1190968694, 793762}, NULL) = 0
gettimeofday({1190968694, 793824}, NULL) = 0
sendmsg(3, {msg_name(16)={sa_family=AF_INET, sin_port=htons(0), 
sin_addr=inet_addr(127.0.0.1)}, 

Re: [stable] [PATCH 2/3] sky2: fix VLAN receive processing

2007-09-28 Thread Chris Wright
* Stephen Hemminger ([EMAIL PROTECTED]) wrote:
 I left it out on purpose because 2.6.22 doesn't support Yukon EX.

OK, thanks.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC/PATCH 1/3] UDP memory usage accounting (take 2): fix send buffer check

2007-09-28 Thread David Miller
From: Satoshi OSHIMA [EMAIL PROTECTED]
Date: Fri, 28 Sep 2007 22:37:54 +0900

  } else if (i  MAX_SKB_FRAGS) {
 
 +if (atomic_read(sk-sk_wmem_alloc) + PAGE_SIZE
 
 + 2 * sk-sk_sndbuf) {
 
 +err = -ENOBUFS;
 
 +goto error;
 
 +}

Your email client, or something, is adding newlines every
other line of real content, making your patches unusable.

Please go and perform some testing to make sure that the
patches you email out are untouched by the email client and
can be applied by the recipient.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/7] CAN: Add PF_CAN core module

2007-09-28 Thread David Miller
From: Thomas Gleixner [EMAIL PROTECTED]
Date: Fri, 28 Sep 2007 18:27:19 +0200

 I'm not inclined either way and we really should not make this a
 religious question whether that code goes in or not, especially not when
 we granted the mac80211 to export everything w/o _GPL suffix not too
 long ago.

This is because a wireless driver is a driver.  It can exist outside
of the kernel and its mac80211 stack just like any other network
device driver.  And the interfaces in mac80211 are such that the
driver doesn't explicitly go into the internals of the implementation.

That's not true with CAN.

With this CAN stuff, any driver you write for it is intimately
integrated into the design and architecture of the CAN subsystem.  Any
such driver cannot stand on its own.  Look at how these drivers can
get into the internals.

If this code goes in without the _GPL() exports, that's fine, but it's
setting incorrect expectations for people who think they can write
binary-only drivers and link to these symbols.

And it will be the CAN folks who are guilty of setting these
false premises.  Especially after I've explicitly warned about
it here.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/7] CAN: Add PF_CAN core module

2007-09-28 Thread Thomas Gleixner
On Fri, 2007-09-28 at 13:20 -0700, David Miller wrote:
 That's not true with CAN.
 
 With this CAN stuff, any driver you write for it is intimately
 integrated into the design and architecture of the CAN subsystem.  Any
 such driver cannot stand on its own.  Look at how these drivers can
 get into the internals.

I'm just concerned about protocols, which have been designed and
implemented long ago outside of the kernel and are going to be wrapped
with glue code to fit into the socket can implementation. That's hard to
judge.

 If this code goes in without the _GPL() exports, that's fine, but it's
 setting incorrect expectations for people who think they can write
 binary-only drivers and link to these symbols.
 
 And it will be the CAN folks who are guilty of setting these
 false premises.  Especially after I've explicitly warned about
 it here.

Fair enough.

tglx


-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [ofa-general] [PATCH v3] iw_cxgb3: Support iwarp-only interfaces to avoid 4-tuple conflicts.

2007-09-28 Thread Kanevsky, Arkady
Exactly,
it forces the burden on administrator.
And one will be forced to try one mount for iWARP and, if it does not
work, issue another one for TCP or UDP.
Yack!

And server will need to listen on different IP address and simple
* will not work since it will need to listen in two different domains.

Had we run this proposal by administrators?
Thanks,

Arkady Kanevsky   email: [EMAIL PROTECTED]
Network Appliance Inc.   phone: 781-768-5395
1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195
Waltham, MA 02451   central phone: 781-768-5300
 

 -Original Message-
 From: Steve Wise [mailto:[EMAIL PROTECTED] 
 Sent: Friday, September 28, 2007 3:47 PM
 To: Kanevsky, Arkady
 Cc: Sean Hefty; Sean Hefty; netdev@vger.kernel.org; 
 [EMAIL PROTECTED]; [EMAIL PROTECTED]; 
 [EMAIL PROTECTED]
 Subject: Re: [ofa-general] [PATCH v3] iw_cxgb3: 
 Supportiwarp-onlyinterfacesto avoid 4-tuple conflicts.
 
 
 
 Kanevsky, Arkady wrote:
  Sean,
  IB aside,
  it looks like an ULP which is capable of being both RDMA aware and 
  RDMA not-aware, like iSER and iSCSI, NFS-RDMA and NFS, SDP and 
  sockets, will be treated as two separete ULPs.
  Each has its own IP address, since there is a different IP 
 address for 
  iWARP port and regular Ethernet port. So it falls on the users of 
  ULPs to handle it via DNS or some other services.
  Is this acceptable to users? I doubt it.
  
  Recall that ULPs are going in opposite directions by having a 
  different port number for RDMA aware and RDMA unaware 
 versions of the ULP.
  This way, ULP connection manager handles RDMA-ness under 
 the covers, 
  while users plug an IP address for a server to connect to.
  Thanks,
 
 Arkady, I'm confused about how this proposed design changes 
 the behavior of the ULPs that run on TCP and iWARP.  I don't 
 see much difference from the point of view of the ULPs.
 
 The NFS-RDMA server, for example, will not need to change 
 since it binds to address 0.0.0.0 which will translate into a 
 bind/listen on the specific iwarp address for each iwarp 
 device on the rdma side, and address 0.0.0.0 for the TCP side.
 
 Am I missing your point?
 
 The real pain, IMO, with this solution is that it FORCES the 
 admins to use 2 subnets when 1 is sufficient if the net 
 maintainers would unify the port space...
 
 Steve.
 
 
 
  
  Arkady Kanevsky   email: [EMAIL PROTECTED]
  Network Appliance Inc.   phone: 781-768-5395
  1601 Trapelo Rd. - Suite 16.Fax: 781-895-1195
  Waltham, MA 02451   central phone: 781-768-5300
   
  
  -Original Message-
  From: Sean Hefty [mailto:[EMAIL PROTECTED]
  Sent: Thursday, September 27, 2007 3:12 PM
  To: Kanevsky, Arkady; Sean Hefty; Steve Wise
  Cc: netdev@vger.kernel.org; [EMAIL PROTECTED]; 
  [EMAIL PROTECTED]; [EMAIL PROTECTED]
  Subject: RE: [ofa-general] [PATCH v3] iw_cxgb3: 
  Supportiwarp-onlyinterfacesto avoid 4-tuple conflicts.
 
  What is the model on how client connects, say for iSCSI, 
 when client 
  and server both support, iWARP and 10GbE or 1GbE, and 
 would like to 
  setup most performant connection for ULP?
  For the most performance connection, the ULP would use 
 IB, and all 
  these problems go away.  :)
 
  This proposal is for each iwarp interface to have its own 
 IP address.  
  Clients would need an iwarp usable address of the server and would 
  connect using rdma_connect().  If that call (or 
  rdma_resolve_addr/route) fails, the client could try 
 connecting using 
  sockets, aoi, or some other interface.  I don't see that Steve's 
  proposal changes anything from the client's perspective.
 
  - Sean
  ___
  general mailing list
  [EMAIL PROTECTED]
  http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
 
  To unsubscribe, please visit
  http://openib.org/mailman/listinfo/openib-general
 
 
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ofa-general] [PATCH v3] iw_cxgb3: Support iwarp-only interfaces to avoid 4-tuple conflicts.

2007-09-28 Thread Steve Wise


Kanevsky, Arkady wrote:

Exactly,
it forces the burden on administrator.
And one will be forced to try one mount for iWARP and it does not
work issue another one TCP or UDP if it fails.
Yack!



I see your point.  I have no defense.  My hands have been tied on fixing 
this properly...



And server will need to listen on different IP address and simple
* will not work since it will need to listen in two different domains.



No, the server will listen on 0.0.0.0:2049 for TCP, and 0.0.0.0:2050 for 
rdma.  The rdma subsystem will translate 0.0.0.0:2050 into listens on 
specific iwarp ip addresses on every iwarp device...



Had we run this proposal by administrators?


There has been no other solution proposed that Dave Miller and Jeff 
Garzik won't NAK...


Steve.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: error(s) in 2.6.23-rc5 bonding.txt ?

2007-09-28 Thread Rick Jones
Well, I managed to concoct an updated test, this time with 1G's going into a 
10G.  A 2.6.23-rc8 kernel on the system with four, dual-port 82546GB's, 
connected to an HP ProCurve 3500 series switch with a 10G link to a system 
running 2.6.18-8.el5 (I was having difficulty getting cxgb3 going on my 
kernel.org kernels - firmware mismatches - so I booted RHEL5 there).


I put all four 1G interfaces into a balance_rr (mode=0) bond and started running 
just a single netperf TCP_STREAM test.


On the bonding side:

hpcpc103:~/net-2.6.24/Documentation/networking# netstat -s -t | grep retran
19050 segments retransmited
9349 fast retransmits
9698 forward retransmits
hpcpc103:~/net-2.6.24/Documentation/networking# ifconfig bond0 | grep pack
  RX packets:50708119 errors:0 dropped:0 overruns:0 frame:0
  TX packets:58801285 errors:0 dropped:0 overruns:0 carrier:0
hpcpc103:~/net-2.6.24/Documentation/networking# netperf -H 192.168.5.106
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.5.106 
(192.168.5.106) port 0 AF_INET

Recv   SendSend
Socket Socket  Message  Elapsed
Size   SizeSize Time Throughput
bytes  bytes   bytessecs.10^6bits/sec

 87380  16384  1638410.011267.99
hpcpc103:~/net-2.6.24/Documentation/networking# netstat -s -t | grep retran
20268 segments retransmited
9974 fast retransmits
10291 forward retransmits
hpcpc103:~/net-2.6.24/Documentation/networking# ifconfig bond0 | grep pack
  RX packets:51636421 errors:0 dropped:0 overruns:0 frame:0
  TX packets:59899089 errors:0 dropped:0 overruns:0 carrier:0

on the receiving side:

[EMAIL PROTECTED] ~]# ifconfig eth5 | grep pack
  RX packets:58802455 errors:0 dropped:0 overruns:0 frame:0
  TX packets:50205304 errors:0 dropped:0 overruns:0 carrier:0
[EMAIL PROTECTED] ~]# ifconfig eth5 | grep pack
  RX packets:59900267 errors:0 dropped:0 overruns:0 frame:0
  TX packets:51124138 errors:0 dropped:0 overruns:0 carrier:0

So, there were  20268 - 19050  or 1218 retransmissions during the test.  The 
sending side reported sending 59899089 - 58801285 or 1097804 packets, and the 
receiver reported receiving 59900267 - 58802455 or 1097812 packets.


Unless the switch was only occasionally duplicating segments or something, it 
looks like all the retransmissions were the result of duplicate ACKs from packet 
reordering.


For grins I varied the reordering sysctl and got:

# netstat -s -t | grep retran; for i in 3 4 5 6 7 8 9 10 20 30; do sysctl -w 
net.ipv4.tcp_reordering=$i; netperf -H 192.168.5.106 -P 0 -B reorder $i; 
netstat -s -t | grep retran; done

13735 segments retransmited
6581 fast retransmits
7151 forward retransmits
net.ipv4.tcp_reordering = 3
 87380  16384  1638410.011294.51   reorder 3
15127 segments retransmited
7330 fast retransmits
7794 forward retransmits
net.ipv4.tcp_reordering = 4
 87380  16384  1638410.011304.22   reorder 4
16103 segments retransmited
7807 fast retransmits
8293 forward retransmits
net.ipv4.tcp_reordering = 5
 87380  16384  1638410.011330.88   reorder 5
16763 segments retransmited
8155 fast retransmits
8605 forward retransmits
net.ipv4.tcp_reordering = 6
 87380  16384  1638410.011350.50   reorder 6
17134 segments retransmited
8356 fast retransmits
8775 forward retransmits
net.ipv4.tcp_reordering = 7
 87380  16384  1638410.011353.00   reorder 7
17492 segments retransmited
8553 fast retransmits
8936 forward retransmits
net.ipv4.tcp_reordering = 8
 87380  16384  1638410.011358.00   reorder 8
17649 segments retransmited
8625 fast retransmits
9021 forward retransmits
net.ipv4.tcp_reordering = 9
 87380  16384  1638410.011415.89   reorder 9
17736 segments retransmited
8666 fast retransmits
9067 forward retransmits
net.ipv4.tcp_reordering = 10
 87380  16384  1638410.011412.36   reorder 10
17773 segments retransmited
8684 fast retransmits
9086 forward retransmits
net.ipv4.tcp_reordering = 20
 87380  16384  1638410.011403.47   reorder 20
17773 segments retransmited
8684 fast retransmits
9086 forward retransmits
net.ipv4.tcp_reordering = 30
 87380  16384  1638410.011325.41   reorder 30
17773 segments retransmited
8684 fast retransmits
9086 forward retransmits

IE fast retrans from reordering until the reorder limit was reasonably well 
above the number of links in the aggregate.


As for how things got reordered, knuth knows exactly why.  But it didn't need 
more than one connection, and that connection didn't have to vary the size of 
what it was passing to send(). Netperf was not making send calls which were an 
integral multiple of the MSS, which means that from time to time a short segment 
would be queued to an interface in the bond. Also, two of the dual-port NICs 
were on 66 MHz  PCI-X busses, and 

Re: [ofa-general] [PATCH v3] iw_cxgb3: Support iwarp-only interfaces to avoid 4-tuple conflicts.

2007-09-28 Thread Sean Hefty

Kanevsky, Arkady wrote:

Exactly,
it forces the burden on administrator.
And one will be forced to try one mount for iWARP and it does not
work issue another one TCP or UDP if it fails.
Yack!

And server will need to listen on different IP address and simple
* will not work since it will need to listen in two different domains.


The server already has to call listen twice.  Once for the rdma_cm and 
once for sockets.  Similarly on the client side, connect must be made 
over rdma_cm or sockets.  I really don't see any impact on the 
application for this approach.


We just end up separating the port space based on networking addresses, 
rather than keeping the problem at the transport level.  If you have an 
alternate approach that will be accepted upstream, feel free to post it.


- Sean
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] IPoIB: Convert to netdevice internal stats

2007-09-28 Thread Roland Dreier
Use the stats member of struct netdevice in IPoIB, so we can save
memory by deleting the stats member of struct ipoib_dev_priv, and save
code by deleting ipoib_get_stats().

Signed-off-by: Roland Dreier [EMAIL PROTECTED]
---
Dave, can you queue this in net-2.6.24 please?  I would ordinarily
merge IPoIB changes but since this depends on the netdevice internal
stats change it becomes a cross-tree dependency if I try to do that.
And I'd like to get it queued in git now before the merge window.

Thanks...

 drivers/infiniband/ulp/ipoib/ipoib.h   |2 --
 drivers/infiniband/ulp/ipoib/ipoib_cm.c|   20 ++--
 drivers/infiniband/ulp/ipoib/ipoib_ib.c|   18 +-
 drivers/infiniband/ulp/ipoib/ipoib_main.c  |   22 +++---
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   10 +-
 5 files changed, 31 insertions(+), 41 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h 
b/drivers/infiniband/ulp/ipoib/ipoib.h
index 3a6ef14..1e627ee 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -300,8 +300,6 @@ struct ipoib_dev_priv {
 
struct ib_event_handler event_handler;
 
-   struct net_device_stats stats;
-
struct net_device *parent;
struct list_head child_intfs;
struct list_head list;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c 
b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 08b4676..1afd93c 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -430,7 +430,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct 
ib_wc *wc)
ipoib_dbg(priv, cm recv error 
   (status=%d, wrid=%d vend_err %x)\n,
   wc-status, wr_id, wc-vendor_err);
-   ++priv-stats.rx_dropped;
+   ++dev-stats.rx_dropped;
goto repost;
}
 
@@ -457,7 +457,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct 
ib_wc *wc)
 * this packet and reuse the old buffer.
 */
ipoib_dbg(priv, failed to allocate receive buffer %d\n, 
wr_id);
-   ++priv-stats.rx_dropped;
+   ++dev-stats.rx_dropped;
goto repost;
}
 
@@ -474,8 +474,8 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct 
ib_wc *wc)
skb_pull(skb, IPOIB_ENCAP_LEN);
 
dev-last_rx = jiffies;
-   ++priv-stats.rx_packets;
-   priv-stats.rx_bytes += skb-len;
+   ++dev-stats.rx_packets;
+   dev-stats.rx_bytes += skb-len;
 
skb-dev = dev;
/* XXX get correct PACKET_ type here */
@@ -512,8 +512,8 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff 
*skb, struct ipoib_cm_
if (unlikely(skb-len  tx-mtu)) {
ipoib_warn(priv, packet len %d ( %d) too long to send, 
dropping\n,
   skb-len, tx-mtu);
-   ++priv-stats.tx_dropped;
-   ++priv-stats.tx_errors;
+   ++dev-stats.tx_dropped;
+   ++dev-stats.tx_errors;
ipoib_cm_skb_too_long(dev, skb, tx-mtu - IPOIB_ENCAP_LEN);
return;
}
@@ -532,7 +532,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff 
*skb, struct ipoib_cm_
tx_req-skb = skb;
addr = ib_dma_map_single(priv-ca, skb-data, skb-len, DMA_TO_DEVICE);
if (unlikely(ib_dma_mapping_error(priv-ca, addr))) {
-   ++priv-stats.tx_errors;
+   ++dev-stats.tx_errors;
dev_kfree_skb_any(skb);
return;
}
@@ -542,7 +542,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff 
*skb, struct ipoib_cm_
if (unlikely(post_send(priv, tx, tx-tx_head  (ipoib_sendq_size - 1),
addr, skb-len))) {
ipoib_warn(priv, post_send failed\n);
-   ++priv-stats.tx_errors;
+   ++dev-stats.tx_errors;
ib_dma_unmap_single(priv-ca, addr, skb-len, DMA_TO_DEVICE);
dev_kfree_skb_any(skb);
} else {
@@ -580,8 +580,8 @@ static void ipoib_cm_handle_tx_wc(struct net_device *dev, 
struct ipoib_cm_tx *tx
ib_dma_unmap_single(priv-ca, tx_req-mapping, tx_req-skb-len, 
DMA_TO_DEVICE);
 
/* FIXME: is this right? Shouldn't we only increment on success? */
-   ++priv-stats.tx_packets;
-   priv-stats.tx_bytes += tx_req-skb-len;
+   ++dev-stats.tx_packets;
+   dev-stats.tx_bytes += tx_req-skb-len;
 
dev_kfree_skb_any(tx_req-skb);
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 
b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index b664b98..1a77e79 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -208,7 +208,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, 
struct ib_wc *wc)
 * this packet and reuse the old buffer.
  

Re: [PATCH] IPoIB: Convert to netdevice internal stats

2007-09-28 Thread David Miller
From: Roland Dreier [EMAIL PROTECTED]
Date: Fri, 28 Sep 2007 15:18:01 -0700

 Use the stats member of struct netdevice in IPoIB, so we can save
 memory by deleting the stats member of struct ipoib_dev_priv, and save
 code by deleting ipoib_get_stats().
 
 Signed-off-by: Roland Dreier [EMAIL PROTECTED]

Applied to net-2.6.24, thanks.

How is that ibm_emac NAPI conversion coming along? :-)
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] Make TCP prequeue configurable

2007-09-28 Thread David Miller
From: John Heffner [EMAIL PROTECTED]
Date: Thu, 27 Sep 2007 22:26:02 -0400

 I think it really does help in case (4) with old NICs that don't do rx 
 checksumming.  I'm not sure how many people really care about this 
 anymore, but probably some...?
 
 OTOH, it would be nice to get rid of sysctl_tcp_low_latency.

I know most high end apps use poll() so won't sleep in recvmsg()
directly, but occasionally they will, and even those that have a
poll() triggered recvmsg() will run the backlog and do prequeue if
packets arrive while they are processing the existing receive packets
which is quite common.

So for any app that ends up doing a prequeue it's a win because there
is the issue of scheduling and cpu usage charging.

If the ACK's are coming out of the stack at the rate that the
application can pull data out of the receive queue, and no faster,
this will pace the sender to send precisely how fast the receiver can
get onto the cpu depending upon load.

Furthermore, prequeue puts the stack input processing work into user
context, which means that the users will be charged more fairly for
the work that is done for them.

When packets get fully processed in softirq context, that's bad
because this is cpu usage which doesn't get charged to the user, and
for TCP input processing this cpu usage is non-trivial and is
multiplied by packet count.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


SFQ: backport some features from ESFQ (try 4)

2007-09-28 Thread Corey Hickey

Patchset try 2 addresses the review by Michael Buesch.
Patchset try 3 addresses the review by Patrick McHardy.
Patchset try 4 has a few cosmetic improvements.

Nobody reviewed my last set of patches, and I wasn't pushy about asking.
Since it's been a while, I ported the kernel and userspace patchsets to
current git of net-2.6 and iproute2, respectively.

The first 7 patches in this series resemble the corresponding 7 patches
in patchset try 2. There aren't any major changes--just modifications
to address errors noticed in review and slight reorganizations to make
the next patches easier.

Patches 8-10 implement parameter passing via nested compat attributes.
This is necessary for using 'tc qdisc change' to disable perturbation.
The rest of the parameters were added for consistency.

Iproute2 patches will follow shortly.



The following is the original patch text.

This set of patches adds some of ESFQ's modifications to the original 
SFQ. Thus far, I have received support for this approach rather than for 
trying to get ESFQ included as a separate qdisc.

http://mailman.ds9a.nl/pipermail/lartc/2007q2/021056.html

My patches here implement tc qdisc change, user-configurable depth 
(number of flows), and user-configurable divisor (for setting hash table 
size). I've left out the remaining ESFQ features (usage of jhash and 
different hashing methods) because Patrick McHardy intends to submit a 
patch that will supersede that functionality; see the URL above.

Default values remain the same, and SFQ's default behavior remains the 
same, so there should be no user disruption.

Thanks for your consideration,
Corey



 include/linux/pkt_sched.h |   23 ++--
 net/sched/sch_sfq.c   |  353 ++---
 2 files changed, 254 insertions(+), 122 deletions(-)


[PATCH 01/10] Preparatory refactoring part 1.
[PATCH 02/10] Preparatory refactoring part 2.
[PATCH 03/10] Move two functions.
[PATCH 04/10] Make depth (number of queues) user-configurable:
[PATCH 05/10] Add divisor.
[PATCH 06/10] Make qdisc changeable.
[PATCH 07/10] Remove comments about hardcoded values.
[PATCH 08/10] Multiply perturb_period by HZ when used rather than when assigned.
[PATCH 09/10] Change perturb_period to unsigned.
[PATCH 10/10] Use nested compat attributes to pass parameters.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 03/10] Move two functions.

2007-09-28 Thread Corey Hickey
Move sfq_q_destroy() to above sfq_q_init() so that it can be used
by an error case in a later patch.

Move sfq_destroy() as well, for clarity.

Signed-off-by: Corey Hickey [EMAIL PROTECTED]
---
 net/sched/sch_sfq.c |   22 +++---
 1 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 1ba3d1a..ca22cb7 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -382,6 +382,17 @@ static void sfq_perturbation(unsigned long arg)
}
 }
 
+static void sfq_q_destroy(struct sfq_sched_data *q)
+{
+   del_timer(q-perturb_timer);
+}
+
+static void sfq_destroy(struct Qdisc *sch)
+{
+   struct sfq_sched_data *q = qdisc_priv(sch);
+   sfq_q_destroy(q);
+}
+
 static void
 sfq_default_parameters(struct Qdisc *sch)
 {
@@ -451,17 +462,6 @@ static int sfq_init(struct Qdisc *sch, struct rtattr *opt)
return 0;
 }
 
-static void sfq_q_destroy(struct sfq_sched_data *q)
-{
-   del_timer(q-perturb_timer);
-}
-
-static void sfq_destroy(struct Qdisc *sch)
-{
-   struct sfq_sched_data *q = qdisc_priv(sch);
-   sfq_q_destroy(q);
-}
-
 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
struct sfq_sched_data *q = qdisc_priv(sch);
-- 
1.5.3

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 01/10] Preparatory refactoring part 1.

2007-09-28 Thread Corey Hickey
Make a new function sfq_q_enqueue() that operates directly on the
queue data. This will be useful for implementing sfq_change() in
a later patch. A pleasant side-effect is reducing most of the
duplicate code in sfq_enqueue() and sfq_requeue().

Similarly, make a new function sfq_q_dequeue().

Signed-off-by: Corey Hickey [EMAIL PROTECTED]
---
 net/sched/sch_sfq.c |   72 +++
 1 files changed, 38 insertions(+), 34 deletions(-)

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 3a23e30..57485ef 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -77,6 +77,9 @@
 #define SFQ_DEPTH  128
 #define SFQ_HASH_DIVISOR   1024
 
+#define SFQ_HEAD 0
+#define SFQ_TAIL 1
+
 /* This type should contain at least SFQ_DEPTH*2 values */
 typedef unsigned char sfq_index;
 
@@ -244,10 +247,9 @@ static unsigned int sfq_drop(struct Qdisc *sch)
return 0;
 }
 
-static int
-sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static void
+sfq_q_enqueue(struct sk_buff *skb, struct sfq_sched_data *q, unsigned int end)
 {
-   struct sfq_sched_data *q = qdisc_priv(sch);
unsigned hash = sfq_hash(q, skb);
sfq_index x;
 
@@ -256,8 +258,12 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch)
q-ht[hash] = x = q-dep[SFQ_DEPTH].next;
q-hash[x] = hash;
}
-   sch-qstats.backlog += skb-len;
-   __skb_queue_tail(q-qs[x], skb);
+
+   if (end == SFQ_TAIL)
+   __skb_queue_tail(q-qs[x], skb);
+   else
+   __skb_queue_head(q-qs[x], skb);
+
sfq_inc(q, x);
if (q-qs[x].qlen == 1) {   /* The flow is new */
if (q-tail == SFQ_DEPTH) { /* It is the first flow */
@@ -270,6 +276,15 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch)
q-tail = x;
}
}
+}
+
+static int
+sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+   struct sfq_sched_data *q = qdisc_priv(sch);
+
+   sfq_q_enqueue(skb, q, SFQ_TAIL);
+   sch-qstats.backlog += skb-len;
if (++sch-q.qlen = q-limit) {
sch-bstats.bytes += skb-len;
sch-bstats.packets++;
@@ -284,45 +299,21 @@ static int
 sfq_requeue(struct sk_buff *skb, struct Qdisc* sch)
 {
struct sfq_sched_data *q = qdisc_priv(sch);
-   unsigned hash = sfq_hash(q, skb);
-   sfq_index x;
 
-   x = q-ht[hash];
-   if (x == SFQ_DEPTH) {
-   q-ht[hash] = x = q-dep[SFQ_DEPTH].next;
-   q-hash[x] = hash;
-   }
+   sfq_q_enqueue(skb, q, SFQ_HEAD);
sch-qstats.backlog += skb-len;
-   __skb_queue_head(q-qs[x], skb);
-   sfq_inc(q, x);
-   if (q-qs[x].qlen == 1) {   /* The flow is new */
-   if (q-tail == SFQ_DEPTH) { /* It is the first flow */
-   q-tail = x;
-   q-next[x] = x;
-   q-allot[x] = q-quantum;
-   } else {
-   q-next[x] = q-next[q-tail];
-   q-next[q-tail] = x;
-   q-tail = x;
-   }
-   }
if (++sch-q.qlen = q-limit) {
sch-qstats.requeues++;
return 0;
}
 
-   sch-qstats.drops++;
sfq_drop(sch);
return NET_XMIT_CN;
 }
 
-
-
-
-static struct sk_buff *
-sfq_dequeue(struct Qdisc* sch)
+static struct
+sk_buff *sfq_q_dequeue(struct sfq_sched_data *q)
 {
-   struct sfq_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
sfq_index a, old_a;
 
@@ -335,8 +326,6 @@ sfq_dequeue(struct Qdisc* sch)
/* Grab packet */
skb = __skb_dequeue(q-qs[a]);
sfq_dec(q, a);
-   sch-q.qlen--;
-   sch-qstats.backlog -= skb-len;
 
/* Is the slot empty? */
if (q-qs[a].qlen == 0) {
@@ -353,6 +342,21 @@ sfq_dequeue(struct Qdisc* sch)
a = q-next[a];
q-allot[a] += q-quantum;
}
+
+   return skb;
+}
+
+static struct sk_buff
+*sfq_dequeue(struct Qdisc* sch)
+{
+   struct sfq_sched_data *q = qdisc_priv(sch);
+   struct sk_buff *skb;
+
+   skb = sfq_q_dequeue(q);
+   if (skb == NULL)
+   return NULL;
+   sch-q.qlen--;
+   sch-qstats.backlog -= skb-len;
return skb;
 }
 
-- 
1.5.3

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/10] Preparatory refactoring part 2.

2007-09-28 Thread Corey Hickey
Factor code out of sfq_init() and sfq_destroy(), again so that the
new functions can be used by sfq_change() later.

Actually, as the diff itself shows, most of the sfq_q_init() code
comes from the original sfq_change(), but sfq_change() is only
called by sfq_init() right now. Thus, it is safe to remove
sfq_change(); tc qdisc change doesn't yet work for sfq anyway.

Setting default parameters is moved into a separate function for
clarity.

The sfq_destroy() --> sfq_q_destroy() change looks pointless here,
but it's cleaner to split now and add code to sfq_q_destroy() in a
later patch.

Signed-off-by: Corey Hickey [EMAIL PROTECTED]
---
 net/sched/sch_sfq.c |   95 +--
 1 files changed, 54 insertions(+), 41 deletions(-)

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 57485ef..1ba3d1a 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -382,43 +382,41 @@ static void sfq_perturbation(unsigned long arg)
}
 }
 
-static int sfq_change(struct Qdisc *sch, struct rtattr *opt)
+static void
+sfq_default_parameters(struct Qdisc *sch)
 {
struct sfq_sched_data *q = qdisc_priv(sch);
-   struct tc_sfq_qopt *ctl = RTA_DATA(opt);
-   unsigned int qlen;
-
-   if (opt-rta_len  RTA_LENGTH(sizeof(*ctl)))
-   return -EINVAL;
-
-   sch_tree_lock(sch);
-   q-quantum = ctl-quantum ? : psched_mtu(sch-dev);
-   q-perturb_period = ctl-perturb_period*HZ;
-   if (ctl-limit)
-   q-limit = min_t(u32, ctl-limit, SFQ_DEPTH - 2);
 
-   qlen = sch-q.qlen;
-   while (sch-q.qlen  q-limit)
-   sfq_drop(sch);
-   qdisc_tree_decrease_qlen(sch, qlen - sch-q.qlen);
-
-   del_timer(q-perturb_timer);
-   if (q-perturb_period) {
-   q-perturb_timer.expires = jiffies + q-perturb_period;
-   add_timer(q-perturb_timer);
-   }
-   sch_tree_unlock(sch);
-   return 0;
+   q-quantum= psched_mtu(sch-dev);
+   q-perturbation   = 0;
+   q-perturb_period = 0;
+   q-limit  = SFQ_DEPTH - 2;
 }
 
-static int sfq_init(struct Qdisc *sch, struct rtattr *opt)
+static int
+sfq_q_init(struct sfq_sched_data *q, struct rtattr *opt)
 {
-   struct sfq_sched_data *q = qdisc_priv(sch);
int i;
 
-   init_timer(q-perturb_timer);
-   q-perturb_timer.data = (unsigned long)sch;
-   q-perturb_timer.function = sfq_perturbation;
+   /* At this point, parameters are set to either defaults (sfq_init) or
+* the previous values (sfq_change). So, overwrite the parameters as
+* specified. */
+   if (opt) {
+   struct tc_sfq_qopt *ctl = RTA_DATA(opt);
+
+   if (opt-rta_len  RTA_LENGTH(sizeof(*ctl)))
+   return -EINVAL;
+
+   if (ctl-quantum)
+   q-quantum = ctl-quantum;
+   if (ctl-perturb_period)
+   q-perturb_period = ctl-perturb_period * HZ;
+   if (ctl-limit)
+   q-limit = ctl-limit;
+   }
+   q-limit = min_t(u32, q-limit, SFQ_DEPTH - 2);
+   q-tail = SFQ_DEPTH;
+   q-max_depth = 0;
 
for (i=0; iSFQ_HASH_DIVISOR; i++)
q-ht[i] = SFQ_DEPTH;
@@ -427,28 +425,43 @@ static int sfq_init(struct Qdisc *sch, struct rtattr *opt)
q-dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH;
q-dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH;
}
-   q-limit = SFQ_DEPTH - 2;
-   q-max_depth = 0;
-   q-tail = SFQ_DEPTH;
-   if (opt == NULL) {
-   q-quantum = psched_mtu(sch-dev);
-   q-perturb_period = 0;
-   } else {
-   int err = sfq_change(sch, opt);
-   if (err)
-   return err;
-   }
+
for (i=0; iSFQ_DEPTH; i++)
sfq_link(q, i);
return 0;
 }
 
-static void sfq_destroy(struct Qdisc *sch)
+static int sfq_init(struct Qdisc *sch, struct rtattr *opt)
 {
struct sfq_sched_data *q = qdisc_priv(sch);
+   int err;
+
+   sfq_default_parameters(sch);
+   if ((err = sfq_q_init(q, opt)))
+   return err;
+
+   init_timer(q-perturb_timer);
+   q-perturb_timer.data = (unsigned long)sch;
+   q-perturb_timer.function = sfq_perturbation;
+   if (q-perturb_period) {
+   q-perturb_timer.expires = jiffies + q-perturb_period;
+   add_timer(q-perturb_timer);
+   }
+
+   return 0;
+}
+
+static void sfq_q_destroy(struct sfq_sched_data *q)
+{
del_timer(q-perturb_timer);
 }
 
+static void sfq_destroy(struct Qdisc *sch)
+{
+   struct sfq_sched_data *q = qdisc_priv(sch);
+   sfq_q_destroy(q);
+}
+
 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
struct sfq_sched_data *q = qdisc_priv(sch);
-- 
1.5.3

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 04/10] Make depth (number of queues) user-configurable:

2007-09-28 Thread Corey Hickey
* replace #define with a parameter
* use old hardcoded value as a default
* kcalloc() arrays in sfq_q_init()
* kfree() arrays in sfq_q_destroy()

Signed-off-by: Corey Hickey [EMAIL PROTECTED]
---
 net/sched/sch_sfq.c |   85 +++---
 1 files changed, 59 insertions(+), 26 deletions(-)

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index ca22cb7..34a299d 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -74,14 +74,16 @@
 
It is easy to increase these values, but not in flight.  */
 
-#define SFQ_DEPTH  128
+#define SFQ_DEPTH_DEFAULT  128
 #define SFQ_HASH_DIVISOR   1024
 
 #define SFQ_HEAD 0
 #define SFQ_TAIL 1
 
-/* This type should contain at least SFQ_DEPTH*2 values */
-typedef unsigned char sfq_index;
+/* This type must contain greater than depth*2 values, so depth is constrained 
+ * accordingly. */
+typedef unsigned int sfq_index;
+#define SFQ_MAX_DEPTH (UINT_MAX / 2 - 1)
 
 struct sfq_head
 {
@@ -95,6 +97,7 @@ struct sfq_sched_data
int perturb_period;
unsignedquantum;/* Allotment per round: MUST BE = MTU 
*/
int limit;
+   unsigneddepth;
 
 /* Variables */
struct timer_list perturb_timer;
@@ -103,11 +106,11 @@ struct sfq_sched_data
sfq_index   max_depth;  /* Maximal depth */
 
sfq_index   ht[SFQ_HASH_DIVISOR];   /* Hash table */
-   sfq_index   next[SFQ_DEPTH];/* Active slots link */
-   short   allot[SFQ_DEPTH];   /* Current allotment per slot */
-   unsigned short  hash[SFQ_DEPTH];/* Hash value indexed by slots 
*/
-   struct sk_buff_head qs[SFQ_DEPTH];  /* Slot queue */
-   struct sfq_head dep[SFQ_DEPTH*2];   /* Linked list of slots, 
indexed by depth */
+   sfq_index   *next;  /* Active slots link */
+   short   *allot; /* Current allotment per slot */
+   unsigned short  *hash;  /* Hash value indexed by slots 
*/
+   struct sk_buff_head *qs;/* Slot queue */
+   struct sfq_head *dep;   /* Linked list of slots, 
indexed by depth */
 };
 
 static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 
h1)
@@ -164,7 +167,7 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct 
sk_buff *skb)
 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
 {
sfq_index p, n;
-   int d = q-qs[x].qlen + SFQ_DEPTH;
+   int d = q-qs[x].qlen + q-depth;
 
p = d;
n = q-dep[d].next;
@@ -215,7 +218,7 @@ static unsigned int sfq_drop(struct Qdisc *sch)
   drop a packet from it */
 
if (d  1) {
-   sfq_index x = q-dep[d+SFQ_DEPTH].next;
+   sfq_index x = q-dep[d + q-depth].next;
skb = q-qs[x].prev;
len = skb-len;
__skb_unlink(skb, q-qs[x]);
@@ -238,7 +241,7 @@ static unsigned int sfq_drop(struct Qdisc *sch)
kfree_skb(skb);
sfq_dec(q, d);
sch-q.qlen--;
-   q-ht[q-hash[d]] = SFQ_DEPTH;
+   q-ht[q-hash[d]] = q-depth;
sch-qstats.drops++;
sch-qstats.backlog -= len;
return len;
@@ -254,8 +257,8 @@ sfq_q_enqueue(struct sk_buff *skb, struct sfq_sched_data 
*q, unsigned int end)
sfq_index x;
 
x = q-ht[hash];
-   if (x == SFQ_DEPTH) {
-   q-ht[hash] = x = q-dep[SFQ_DEPTH].next;
+   if (x == q-depth) {
+   q-ht[hash] = x = q-dep[q-depth].next;
q-hash[x] = hash;
}
 
@@ -266,7 +269,7 @@ sfq_q_enqueue(struct sk_buff *skb, struct sfq_sched_data 
*q, unsigned int end)
 
sfq_inc(q, x);
if (q-qs[x].qlen == 1) {   /* The flow is new */
-   if (q-tail == SFQ_DEPTH) { /* It is the first flow */
+   if (q-tail == q-depth) {  /* It is the first flow */
q-tail = x;
q-next[x] = x;
q-allot[x] = q-quantum;
@@ -318,7 +321,7 @@ sk_buff *sfq_q_dequeue(struct sfq_sched_data *q)
sfq_index a, old_a;
 
/* No active slots */
-   if (q-tail == SFQ_DEPTH)
+   if (q-tail == q-depth)
return NULL;
 
a = old_a = q-next[q-tail];
@@ -329,10 +332,10 @@ sk_buff *sfq_q_dequeue(struct sfq_sched_data *q)
 
/* Is the slot empty? */
if (q-qs[a].qlen == 0) {
-   q-ht[q-hash[a]] = SFQ_DEPTH;
+   q-ht[q-hash[a]] = q-depth;
a = q-next[a];
if (a == old_a) {
-   q-tail = SFQ_DEPTH;
+   q-tail = q-depth;
return skb;
}
q-next[q-tail] = a;
@@ -385,6 +388,11 @@ static void sfq_perturbation(unsigned long arg)
 static void 

[PATCH 06/10] Make qdisc changeable.

2007-09-28 Thread Corey Hickey
Re-implement sfq_change() and enable Qdisc_ops.change so tc qdisc
change will work.

Signed-off-by: Corey Hickey [EMAIL PROTECTED]
---
 net/sched/sch_sfq.c |   61 ++-
 1 files changed, 60 insertions(+), 1 deletions(-)

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index d72ea7c..b8e8fa5 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -416,6 +416,17 @@ sfq_default_parameters(struct Qdisc *sch)
q-limit  = SFQ_DEPTH_DEFAULT - 2;
 }
 
+static void
+sfq_copy_parameters(struct sfq_sched_data *dst, struct sfq_sched_data *src)
+{
+   dst-quantum= src-quantum;
+   dst-perturbation   = src-perturbation;
+   dst-perturb_period = src-perturb_period;
+   dst-hash_divisor   = src-hash_divisor;
+   dst-limit  = src-limit;
+   dst-depth  = src-depth;
+}
+
 static int
 sfq_q_init(struct sfq_sched_data *q, struct rtattr *opt)
 {
@@ -503,6 +514,54 @@ static int sfq_init(struct Qdisc *sch, struct rtattr *opt)
return 0;
 }
 
+static int sfq_change(struct Qdisc *sch, struct rtattr *opt)
+{
+   struct sfq_sched_data *q = qdisc_priv(sch);
+   struct sfq_sched_data tmp;
+   struct sk_buff *skb;
+   unsigned int qlen;
+   int err;
+   
+   /* set up tmp queue */
+   memset(tmp, 0, sizeof(struct sfq_sched_data));
+   sfq_copy_parameters(tmp, q);
+   if ((err = sfq_q_init(tmp, opt)))
+   return err;
+
+   /* copy packets from the old queue to the tmp queue */
+   sch_tree_lock(sch);
+   qlen = sch-q.qlen;
+   while (sch-q.qlen = tmp.limit - 1)
+   sfq_drop(sch);
+   qdisc_tree_decrease_qlen(sch, qlen - sch-q.qlen);
+   while ((skb = sfq_q_dequeue(q)) != NULL)
+   sfq_q_enqueue(skb, tmp, SFQ_TAIL);
+   
+   /* clean up the old queue */
+   sfq_q_destroy(q);
+
+   /* copy elements of the tmp queue into the old queue */
+   sfq_copy_parameters(q, tmp);
+   q-tail  = tmp.tail;
+   q-max_depth = tmp.max_depth;
+   q-ht= tmp.ht;
+   q-dep   = tmp.dep;
+   q-next  = tmp.next;
+   q-allot = tmp.allot;
+   q-hash  = tmp.hash;
+   q-qs= tmp.qs;
+
+   /* finish up */
+   if (q-perturb_period) {
+   q-perturb_timer.expires = jiffies + q-perturb_period;
+   add_timer(q-perturb_timer);
+   } else {
+   q-perturbation = 0;
+   }
+   sch_tree_unlock(sch);
+   return 0;
+}
+
 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
struct sfq_sched_data *q = qdisc_priv(sch);
@@ -537,7 +596,7 @@ static struct Qdisc_ops sfq_qdisc_ops = {
.init   =   sfq_init,
.reset  =   sfq_reset,
.destroy=   sfq_destroy,
-   .change =   NULL,
+   .change =   sfq_change,
.dump   =   sfq_dump,
.owner  =   THIS_MODULE,
 };
-- 
1.5.3

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 08/10] Multiply perturb_period by HZ when used rather than when assigned.

2007-09-28 Thread Corey Hickey
perturb_period is the only parameter that doesn't match 1:1 with the
value from userspace. This change makes it easy and clean to use a
small macro for setting parameters (in a subsequent patch).

Signed-off-by: Corey Hickey [EMAIL PROTECTED]
---
 net/sched/sch_sfq.c |   10 +-
 1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 341a9a1..2d3cc38 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -370,7 +370,7 @@ static void sfq_perturbation(unsigned long arg)
q-perturbation = net_random()0x1F;
 
if (q-perturb_period) {
-   q-perturb_timer.expires = jiffies + q-perturb_period;
+   q-perturb_timer.expires = jiffies + q-perturb_period * HZ;
add_timer(q-perturb_timer);
}
 }
@@ -433,7 +433,7 @@ sfq_q_init(struct sfq_sched_data *q, struct rtattr *opt)
if (ctl-quantum)
q-quantum = ctl-quantum;
if (ctl-perturb_period)
-   q-perturb_period = ctl-perturb_period * HZ;
+   q-perturb_period = ctl-perturb_period;
if (ctl-divisor)
q-hash_divisor = ctl-divisor;
if (ctl-flows)
@@ -496,7 +496,7 @@ static int sfq_init(struct Qdisc *sch, struct rtattr *opt)
q-perturb_timer.data = (unsigned long)sch;
q-perturb_timer.function = sfq_perturbation;
if (q-perturb_period) {
-   q-perturb_timer.expires = jiffies + q-perturb_period;
+   q-perturb_timer.expires = jiffies + q-perturb_period * HZ;
add_timer(q-perturb_timer);
}
 
@@ -542,7 +542,7 @@ static int sfq_change(struct Qdisc *sch, struct rtattr *opt)
 
/* finish up */
if (q-perturb_period) {
-   q-perturb_timer.expires = jiffies + q-perturb_period;
+   q-perturb_timer.expires = jiffies + q-perturb_period * HZ;
add_timer(q-perturb_timer);
} else {
q-perturbation = 0;
@@ -558,7 +558,7 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
struct tc_sfq_qopt opt;
 
opt.quantum = q-quantum;
-   opt.perturb_period = q-perturb_period/HZ;
+   opt.perturb_period = q-perturb_period;
 
opt.limit = q-limit;
opt.divisor = q-hash_divisor;
-- 
1.5.3

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 07/10] Remove comments about hardcoded values.

2007-09-28 Thread Corey Hickey
None of these are true anymore (hooray!).

Signed-off-by: Corey Hickey [EMAIL PROTECTED]
---
 include/linux/pkt_sched.h |8 
 net/sched/sch_sfq.c   |   17 +++--
 2 files changed, 3 insertions(+), 22 deletions(-)

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 268c515..58a0ea6 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -148,14 +148,6 @@ struct tc_sfq_qopt
unsignedflows;  /* Maximal number of flows  */
 };
 
-/*
- *  NOTE: limit, divisor and flows are hardwired to code at the moment.
- *
- * limit=flows=128, divisor=1024;
- *
- * The only reason for this is efficiency, it is possible
- * to change these parameters in compile time.
- */
 
 /* RED section */
 
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index b8e8fa5..341a9a1 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -61,18 +61,7 @@
 
We still need true WFQ for top level CSZ, but using WFQ
for the best effort traffic is absolutely pointless:
-   SFQ is superior for this purpose.
-
-   IMPLEMENTATION:
-   This implementation limits maximal queue length to 128;
-   maximal mtu to 2^15-1; number of hash buckets to 1024.
-   The only goal of this restrictions was that all data
-   fit into one 4K page :-). Struct sfq_sched_data is
-   organized in anti-cache manner: all the data for a bucket
-   are scattered over different locations. This is not good,
-   but it allowed me to put it into 4K.
-
-   It is easy to increase these values, but not in flight.  */
+   SFQ is superior for this purpose. */
 
 #define SFQ_DEPTH_DEFAULT  128
 #define SFQ_DIVISOR_DEFAULT1024
@@ -521,7 +510,7 @@ static int sfq_change(struct Qdisc *sch, struct rtattr *opt)
struct sk_buff *skb;
unsigned int qlen;
int err;
-   
+
/* set up tmp queue */
memset(tmp, 0, sizeof(struct sfq_sched_data));
sfq_copy_parameters(tmp, q);
@@ -536,7 +525,7 @@ static int sfq_change(struct Qdisc *sch, struct rtattr *opt)
qdisc_tree_decrease_qlen(sch, qlen - sch-q.qlen);
while ((skb = sfq_q_dequeue(q)) != NULL)
sfq_q_enqueue(skb, tmp, SFQ_TAIL);
-   
+
/* clean up the old queue */
sfq_q_destroy(q);
 
-- 
1.5.3

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 09/10] Change perturb_period to unsigned.

2007-09-28 Thread Corey Hickey
perturb_period is currently a signed integer, but I can't see any good
reason why this is so--a negative perturbation period will add a timer
that expires in the past, causing constant perturbation, which makes
hashing useless.

    if (q->perturb_period) {
        q->perturb_timer.expires = jiffies + q->perturb_period;
        add_timer(&q->perturb_timer);
    }

Strictly speaking, this will break binary compatibility with older
versions of tc, but that ought not to be a problem because (a) there's
no valid use for a negative perturb_period, and (b) negative values
will be seen as high values (> INT_MAX), which don't work anyway.

If perturb_period is too large, (perturb_period * HZ) will overflow the
size of an unsigned int and wrap around. So, check for that and reject
values that are too high.

Signed-off-by: Corey Hickey [EMAIL PROTECTED]
---
 include/linux/pkt_sched.h |2 +-
 net/sched/sch_sfq.c   |8 ++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 58a0ea6..8559974 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -142,7 +142,7 @@ enum
 struct tc_sfq_qopt
 {
unsignedquantum;/* Bytes per round allocated to flow */
-   int perturb_period; /* Period of hash perturbation */
+   unsignedperturb_period; /* Period of hash perturbation */
__u32   limit;  /* Maximal packets in queue */
unsigneddivisor;/* Hash divisor  */
unsignedflows;  /* Maximal number of flows  */
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 2d3cc38..170fd37 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -74,6 +74,9 @@
 typedef unsigned int sfq_index;
 #define SFQ_MAX_DEPTH (UINT_MAX / 2 - 1)
 
+/* We don't want perturb_period * HZ to overflow an unsigned int. */
+#define SFQ_MAX_PERTURB (UINT_MAX / HZ)
+
 struct sfq_head
 {
sfq_index   next;
@@ -83,7 +86,7 @@ struct sfq_head
 struct sfq_sched_data
 {
 /* Parameters */
-   int perturb_period;
+   unsignedperturb_period;
unsignedquantum;/* Allotment per round: MUST BE = MTU 
*/
int limit;
unsigneddepth;
@@ -441,7 +444,8 @@ sfq_q_init(struct sfq_sched_data *q, struct rtattr *opt)
if (ctl-limit)
q-limit = ctl-limit;
 
-   if (q-depth  SFQ_MAX_DEPTH)
+   if (q-perturb_period  SFQ_MAX_PERTURB ||
+   q-depth  SFQ_MAX_DEPTH)
return -EINVAL;
}
q-limit = min_t(u32, q-limit, q-depth - 2);
-- 
1.5.3

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 05/10] Add divisor.

2007-09-28 Thread Corey Hickey
Make hash divisor user-configurable.

Signed-off-by: Corey Hickey [EMAIL PROTECTED]
---
 net/sched/sch_sfq.c |   18 +-
 1 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 34a299d..d72ea7c 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -75,7 +75,7 @@
It is easy to increase these values, but not in flight.  */
 
 #define SFQ_DEPTH_DEFAULT  128
-#define SFQ_HASH_DIVISOR   1024
+#define SFQ_DIVISOR_DEFAULT1024
 
 #define SFQ_HEAD 0
 #define SFQ_TAIL 1
@@ -98,6 +98,7 @@ struct sfq_sched_data
unsignedquantum;/* Allotment per round: MUST BE = MTU 
*/
int limit;
unsigneddepth;
+   unsignedhash_divisor;
 
 /* Variables */
struct timer_list perturb_timer;
@@ -105,7 +106,7 @@ struct sfq_sched_data
sfq_index   tail;   /* Index of current slot in round */
sfq_index   max_depth;  /* Maximal depth */
 
-   sfq_index   ht[SFQ_HASH_DIVISOR];   /* Hash table */
+   sfq_index   *ht;/* Hash table */
sfq_index   *next;  /* Active slots link */
short   *allot; /* Current allotment per slot */
unsigned short  *hash;  /* Hash value indexed by slots 
*/
@@ -120,7 +121,7 @@ static __inline__ unsigned sfq_fold_hash(struct 
sfq_sched_data *q, u32 h, u32 h1
/* Have we any rotation primitives? If not, WHY? */
h ^= (h1pert) ^ (h1(0x1F - pert));
h ^= h10;
-   return h  0x3FF;
+   return h  (q-hash_divisor-1);
 }
 
 static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
@@ -388,6 +389,7 @@ static void sfq_perturbation(unsigned long arg)
 static void sfq_q_destroy(struct sfq_sched_data *q)
 {
del_timer(q-perturb_timer);
+   kfree(q-ht);
kfree(q-dep);
kfree(q-next);
kfree(q-allot);
@@ -409,6 +411,7 @@ sfq_default_parameters(struct Qdisc *sch)
q-quantum= psched_mtu(sch-dev);
q-perturbation   = 0;
q-perturb_period = 0;
+   q-hash_divisor   = SFQ_DIVISOR_DEFAULT;
q-depth  = SFQ_DEPTH_DEFAULT;
q-limit  = SFQ_DEPTH_DEFAULT - 2;
 }
@@ -431,6 +434,8 @@ sfq_q_init(struct sfq_sched_data *q, struct rtattr *opt)
q-quantum = ctl-quantum;
if (ctl-perturb_period)
q-perturb_period = ctl-perturb_period * HZ;
+   if (ctl-divisor)
+   q-hash_divisor = ctl-divisor;
if (ctl-flows)
q-depth = ctl-flows;
if (ctl-limit)
@@ -443,6 +448,9 @@ sfq_q_init(struct sfq_sched_data *q, struct rtattr *opt)
q-tail = q-depth;
q-max_depth = 0;
 
+   q-ht = kcalloc(q-hash_divisor, sizeof(sfq_index), GFP_KERNEL);
+   if (!q-ht)
+   goto err_case;
q-dep = kcalloc(1 + q-depth*2, sizeof(struct sfq_head), GFP_KERNEL);
if (!q-dep)
goto err_case;
@@ -459,7 +467,7 @@ sfq_q_init(struct sfq_sched_data *q, struct rtattr *opt)
if (!q-qs)
goto err_case;
 
-   for (i=0; iSFQ_HASH_DIVISOR; i++)
+   for (i=0; iq-hash_divisor; i++)
q-ht[i] = q-depth;
for (i=0; i  q-depth; i++) {
skb_queue_head_init(q-qs[i]);
@@ -505,7 +513,7 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
opt.perturb_period = q-perturb_period/HZ;
 
opt.limit = q-limit;
-   opt.divisor = SFQ_HASH_DIVISOR;
+   opt.divisor = q-hash_divisor;
opt.flows = q-depth;
 
RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), opt);
-- 
1.5.3

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 10/10] Use nested compat attributes to pass parameters.

2007-09-28 Thread Corey Hickey
This fixes the ambiguity between, for example:
tc qdisc change ... perturb 0
tc qdisc change ...

Without this patch, there is no way for SFQ to differentiate between
a parameter specified to be 0 and a parameter that was omitted.

Signed-off-by: Corey Hickey [EMAIL PROTECTED]
---
 include/linux/pkt_sched.h |   13 +++
 net/sched/sch_sfq.c   |   53 +---
 2 files changed, 48 insertions(+), 18 deletions(-)

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 8559974..aad04eb 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -148,6 +148,19 @@ struct tc_sfq_qopt
unsignedflows;  /* Maximal number of flows  */
 };
 
+enum
+{
+   TCA_SFQ_UNSPEC,
+   TCA_SFQ_COMPAT,
+   TCA_SFQ_QUANTUM,
+   TCA_SFQ_PERTURB,
+   TCA_SFQ_LIMIT,
+   TCA_SFQ_DIVISOR,
+   TCA_SFQ_FLOWS,
+   __TCA_SFQ_MAX,
+};
+
+#define TCA_SFQ_MAX (__TCA_SFQ_MAX - 1)
 
 /* RED section */
 
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 170fd37..36197f6 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -428,25 +428,31 @@ sfq_q_init(struct sfq_sched_data *q, struct rtattr *opt)
 * the previous values (sfq_change). So, overwrite the parameters as
 * specified. */
if (opt) {
-   struct tc_sfq_qopt *ctl = RTA_DATA(opt);
-
-   if (opt-rta_len  RTA_LENGTH(sizeof(*ctl)))
-   return -EINVAL;
-
-   if (ctl-quantum)
-   q-quantum = ctl-quantum;
-   if (ctl-perturb_period)
-   q-perturb_period = ctl-perturb_period;
-   if (ctl-divisor)
-   q-hash_divisor = ctl-divisor;
-   if (ctl-flows)
-   q-depth = ctl-flows;
-   if (ctl-limit)
-   q-limit = ctl-limit;
-
+   struct tc_sfq_qopt *ctl;
+   struct rtattr *tb[TCA_SFQ_MAX];
+
+   if (rtattr_parse_nested_compat(tb, TCA_SFQ_MAX, opt, ctl,
+  sizeof(*ctl)))
+   goto rtattr_failure;
+
+#define GET_PARAM(dst, nest, compat) do { \
+   struct rtattr *rta = tb[(nest) - 1]; \
+   if (rta) \
+   (dst) = RTA_GET_U32(rta); \
+   else if ((compat)) \
+   (dst) = (compat); \
+} while (0)
+
+   GET_PARAM(q-quantum,TCA_SFQ_QUANTUM, ctl-quantum);
+   GET_PARAM(q-perturb_period, TCA_SFQ_PERTURB,
+   ctl-perturb_period);
+   GET_PARAM(q-hash_divisor,   TCA_SFQ_DIVISOR, ctl-divisor);
+   GET_PARAM(q-depth,  TCA_SFQ_FLOWS,   ctl-flows);
+   GET_PARAM(q-limit,  TCA_SFQ_LIMIT,   ctl-limit);
+   
if (q-perturb_period  SFQ_MAX_PERTURB ||
q-depth  SFQ_MAX_DEPTH)
-   return -EINVAL;
+   goto rtattr_failure;
}
q-limit = min_t(u32, q-limit, q-depth - 2);
q-tail = q-depth;
@@ -482,6 +488,8 @@ sfq_q_init(struct sfq_sched_data *q, struct rtattr *opt)
for (i=0; i  q-depth; i++)
sfq_link(q, i);
return 0;
+rtattr_failure:
+   return -EINVAL;
 err_case:
sfq_q_destroy(q);
return -ENOBUFS;
@@ -559,17 +567,26 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff 
*skb)
 {
struct sfq_sched_data *q = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
+   struct rtattr *nest;
struct tc_sfq_qopt opt;
 
opt.quantum = q-quantum;
opt.perturb_period = q-perturb_period;
-
opt.limit = q-limit;
opt.divisor = q-hash_divisor;
opt.flows = q-depth;
 
+   nest = RTA_NEST_COMPAT(skb, TCA_OPTIONS, sizeof(opt), opt);
+
+   RTA_PUT_U32(skb, TCA_SFQ_QUANTUM, q-quantum);
+   RTA_PUT_U32(skb, TCA_SFQ_PERTURB, q-perturb_period);
+   RTA_PUT_U32(skb, TCA_SFQ_LIMIT,   q-limit);
+   RTA_PUT_U32(skb, TCA_SFQ_DIVISOR, q-hash_divisor);
+   RTA_PUT_U32(skb, TCA_SFQ_FLOWS,   q-depth);
RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), opt);
 
+   RTA_NEST_COMPAT_END(skb, nest);
+
return skb-len;
 
 rtattr_failure:
-- 
1.5.3

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


sfq

2007-09-28 Thread Corey Hickey

These patches follow the ESFQ-->SFQ kernel patches. See the kernel
patch summary for general information.

Thanks,
Corey



 include/linux/pkt_sched.h |   23 ++-
 tc/q_sfq.c|   43 ++-
 2 files changed, 52 insertions(+), 14 deletions(-)


[PATCH 1/3] SFQ: Support changing depth and divisor.
[PATCH 2/3] Change perturb_period to unsigned.
[PATCH 3/3] Use nested compat attributes for passing parameters to the kernel.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] Change perturb_period to unsigned.

2007-09-28 Thread Corey Hickey
This corresponds to the kernel patch doing the same.

Here, too, this will technically break binary compatibility with older
kernels, but that shouldn't be a problem because negative perturb_period
values aren't usable anyway.

Signed-off-by: Corey Hickey [EMAIL PROTECTED]
---
 include/linux/pkt_sched.h |2 +-
 tc/q_sfq.c|4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 9d41f63..fb04a89 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -142,7 +142,7 @@ enum
 struct tc_sfq_qopt
 {
unsignedquantum;/* Bytes per round allocated to flow */
-   int perturb_period; /* Period of hash perturbation */
+   unsignedperturb_period; /* Period of hash perturbation */
__u32   limit;  /* Maximal packets in queue */
unsigneddivisor;/* Hash divisor  */
unsignedflows;  /* Maximal number of flows  */
diff --git a/tc/q_sfq.c b/tc/q_sfq.c
index 7754db7..c9fcc53 100644
--- a/tc/q_sfq.c
+++ b/tc/q_sfq.c
@@ -47,7 +47,7 @@ static int sfq_parse_opt(struct qdisc_util *qu, int argc, 
char **argv, struct nl
ok++;
} else if (strcmp(*argv, perturb) == 0) {
NEXT_ARG();
-   if (get_integer(opt.perturb_period, *argv, 0)) {
+   if (get_u32(opt.perturb_period, *argv, 0)) {
fprintf(stderr, Illegal \perturb\\n);
return -1;
}
@@ -115,7 +115,7 @@ static int sfq_print_opt(struct qdisc_util *qu, FILE *f, 
struct rtattr *opt)
fprintf(f, flows %u/%u , qopt-flows, qopt-divisor);
}
if (qopt-perturb_period)
-   fprintf(f, perturb %dsec , qopt-perturb_period);
+   fprintf(f, perturb %usec , qopt-perturb_period);
return 0;
 }
 
-- 
1.5.3

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] Use nested compat attributes for passing parameters to the kernel.

2007-09-28 Thread Corey Hickey
Note that I have left sfq_print_opt() alone. At this point, there
can be no difference between the data in the nested rtattrs and the
data in the compat rtattr, and I didn't want to add clutter that
isn't useful. Let me know if I should do differently.

Signed-off-by: Corey Hickey [EMAIL PROTECTED]
---
 include/linux/pkt_sched.h |   14 ++
 tc/q_sfq.c|   18 --
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index fb04a89..aad04eb 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -148,6 +148,20 @@ struct tc_sfq_qopt
unsignedflows;  /* Maximal number of flows  */
 };
 
+enum
+{
+   TCA_SFQ_UNSPEC,
+   TCA_SFQ_COMPAT,
+   TCA_SFQ_QUANTUM,
+   TCA_SFQ_PERTURB,
+   TCA_SFQ_LIMIT,
+   TCA_SFQ_DIVISOR,
+   TCA_SFQ_FLOWS,
+   __TCA_SFQ_MAX,
+};
+
+#define TCA_SFQ_MAX (__TCA_SFQ_MAX - 1)
+
 /* RED section */
 
 enum
diff --git a/tc/q_sfq.c b/tc/q_sfq.c
index c9fcc53..5bb3eb7 100644
--- a/tc/q_sfq.c
+++ b/tc/q_sfq.c
@@ -34,9 +34,13 @@ static int sfq_parse_opt(struct qdisc_util *qu, int argc, 
char **argv, struct nl
 {
int ok=0;
struct tc_sfq_qopt opt;
+   struct rtattr *nest;
 
memset(opt, 0, sizeof(opt));
 
+   /* put blank data in rtattr so there is a hole to fill later */
+   nest = addattr_nest_compat(n, 1024, TCA_OPTIONS, opt, sizeof(opt));
+
while (argc  0) {
if (strcmp(*argv, quantum) == 0) {
NEXT_ARG();
@@ -44,6 +48,7 @@ static int sfq_parse_opt(struct qdisc_util *qu, int argc, 
char **argv, struct nl
fprintf(stderr, Illegal \limit\\n);
return -1;
}
+   addattr32(n, 1024, TCA_SFQ_QUANTUM, opt.quantum);
ok++;
} else if (strcmp(*argv, perturb) == 0) {
NEXT_ARG();
@@ -51,6 +56,7 @@ static int sfq_parse_opt(struct qdisc_util *qu, int argc, 
char **argv, struct nl
fprintf(stderr, Illegal \perturb\\n);
return -1;
}
+   addattr32(n, 1024, TCA_SFQ_PERTURB, opt.perturb_period);
ok++;
} else if (strcmp(*argv, limit) == 0) {
NEXT_ARG();
@@ -62,6 +68,7 @@ static int sfq_parse_opt(struct qdisc_util *qu, int argc, 
char **argv, struct nl
fprintf(stderr, Illegal \limit\, must be  
1\n);
return -1;
}
+   addattr32(n, 1024, TCA_SFQ_LIMIT, opt.limit);
ok++;
} else if (strcmp(*argv, depth) == 0) {
NEXT_ARG();
@@ -69,6 +76,7 @@ static int sfq_parse_opt(struct qdisc_util *qu, int argc, 
char **argv, struct nl
fprintf(stderr, Illegal \depth\\n);
return -1;
}
+   addattr32(n, 1024, TCA_SFQ_FLOWS, opt.flows);
ok++;
} else if (strcmp(*argv, divisor) == 0) {
NEXT_ARG();
@@ -81,6 +89,7 @@ static int sfq_parse_opt(struct qdisc_util *qu, int argc, 
char **argv, struct nl
return -1;
}
opt.divisor = 1opt.divisor;
+   addattr32(n, 1024, TCA_SFQ_DIVISOR, opt.divisor);
ok++;
} else if (strcmp(*argv, help) == 0) {
explain();
@@ -93,8 +102,13 @@ static int sfq_parse_opt(struct qdisc_util *qu, int argc, 
char **argv, struct nl
argc--; argv++;
}
 
-   if (ok)
-   addattr_l(n, 1024, TCA_OPTIONS, opt, sizeof(opt));
+   if (ok) {
+   /* fill the hole we left earlier with real compat data */
+   memcpy(RTA_DATA(nest), opt, sizeof(opt));
+   addattr_nest_compat_end(n, nest);
+   }
+   else
+   nest-rta_len = 0;
return 0;
 }
 
-- 
1.5.3

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] SFQ: Support changing depth and divisor.

2007-09-28 Thread Corey Hickey
This can safely be applied either before or after the kernel
patches because the tc_sfq_qopt struct is unchanged:

- old kernels will ignore the parameters from new iproute2
- new kernels will use the same default parameters

Signed-off-by: Corey Hickey [EMAIL PROTECTED]
---
 include/linux/pkt_sched.h |9 -
 tc/q_sfq.c|   21 -
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 268c515..9d41f63 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -148,15 +148,6 @@ struct tc_sfq_qopt
unsignedflows;  /* Maximal number of flows  */
 };
 
-/*
- *  NOTE: limit, divisor and flows are hardwired to code at the moment.
- *
- * limit=flows=128, divisor=1024;
- *
- * The only reason for this is efficiency, it is possible
- * to change these parameters in compile time.
- */
-
 /* RED section */
 
 enum
diff --git a/tc/q_sfq.c b/tc/q_sfq.c
index 05385cf..7754db7 100644
--- a/tc/q_sfq.c
+++ b/tc/q_sfq.c
@@ -25,7 +25,7 @@
 
 static void explain(void)
 {
-   fprintf(stderr, Usage: ... sfq [ limit NUMBER ] [ perturb SECS ] [ 
quantum BYTES ]\n);
+   fprintf(stderr, Usage: ... sfq [ limit NUMBER ] [ depth FLOWS ] [ 
divisor HASHBITS ] [ perturb SECS ] [ quantum BYTES ]\n);
 }
 
 #define usage() return(-1)
@@ -63,6 +63,25 @@ static int sfq_parse_opt(struct qdisc_util *qu, int argc, 
char **argv, struct nl
return -1;
}
ok++;
+   } else if (strcmp(*argv, depth) == 0) {
+   NEXT_ARG();
+   if (get_unsigned(opt.flows, *argv, 0)) {
+   fprintf(stderr, Illegal \depth\\n);
+   return -1;
+   }
+   ok++;
+   } else if (strcmp(*argv, divisor) == 0) {
+   NEXT_ARG();
+   if (get_unsigned(opt.divisor, *argv, 0)) {
+   fprintf(stderr, Illegal \divisor\\n);
+   return -1;
+   }
+   if (opt.divisor = 15) {
+   fprintf(stderr, Illegal \divisor\, must be  
15\n);
+   return -1;
+   }
+   opt.divisor = 1opt.divisor;
+   ok++;
} else if (strcmp(*argv, help) == 0) {
explain();
return -1;
-- 
1.5.3

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: sfq (iproute2 patches)

2007-09-28 Thread Corey Hickey

Corey Hickey wrote:

These patches follow the ESFQ-->SFQ kernel patches. See the kernel
patch summary for general information.


Dang, I forgot to set the subject; these are the iproute2 patches.

-Corey
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Null dereference in socket.c

2007-09-28 Thread Chuck Ebbert
After debugging an oops (https://bugzilla.redhat.com/attachment.cgi?id=209231)
I find it happens here in socket.c::sock_ioctl() line 902:

   default:
=>  err = sock->ops->ioctl(sock, cmd, arg);

/*
 * If this ioctl is unknown try to hand it down
 * to the NIC driver.
 */
if (err == -ENOIOCTLCMD)
err = dev_ioctl(cmd, argp);
break;


ioctl is NULL and the kernel jumps to address 0. Should we add a check
for that?

Bug report:
https://bugzilla.redhat.com/show_bug.cgi?id=306801
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IPoIB: Convert to netdevice internal stats

2007-09-28 Thread Roland Dreier
  How is that ibm_emac NAPI conversion coming along? :-)

Sorry, trying to reduce my backlog first, but it is still on my list
of things to work on :)

 - R.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Null dereference in socket.c

2007-09-28 Thread David Miller
From: Chuck Ebbert [EMAIL PROTECTED]
Date: Fri, 28 Sep 2007 18:58:36 -0400

 After debugging an oops (https://bugzilla.redhat.com/attachment.cgi?id=209231)
 I find it happens here in socket.c::sock_ioctl() line 902:
 
default:
 =  err = sock-ops-ioctl(sock, cmd, arg);
 
 /*
  * If this ioctl is unknown try to hand it down
  * to the NIC driver.
  */
 if (err == -ENOIOCTLCMD)
 err = dev_ioctl(cmd, argp);
 break;
 
 
 ioctl is NULL and the kernel jumps to address 0. Should we add a check
 for that?
 
 Bug report:
 https://bugzilla.redhat.com/show_bug.cgi?id=306801

Every protocol should provide a non-NULL ->ioctl() method,
find out which one isn't and fix it.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Update get_net_ns_by_pid

2007-09-28 Thread Serge E. Hallyn
Quoting Eric W. Biederman ([EMAIL PROTECTED]):
 
 In the -mm tree the rules for accessing an nsproxy have changed,
 and in get_net_ns_by_pid we access the nsproxy, so update
 it to follow the new rules.
 
 Signed-off-by: Eric W. Biederman [EMAIL PROTECTED]

Yup, looks right.

I assume Pavel's Acked-by would actually matter, but still

Acked-by: Serge Hallyn [EMAIL PROTECTED]

thanks,
-serge

 ---
 
 diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
 index 739fbad..1caba10 100644
 --- a/net/core/rtnetlink.c
 +++ b/net/core/rtnetlink.c
 @@ -746,10 +746,10 @@ static struct net *get_net_ns_by_pid(pid_t pid)
   rcu_read_lock();
   tsk = find_task_by_pid(pid);
   if (tsk) {
 - task_lock(tsk);
 - if (tsk-nsproxy)
 - net = get_net(tsk-nsproxy-net_ns);
 - task_unlock(tsk);
 + struct nsproxy *nsproxy;
 + nsproxy = task_nsproxy(tsk);
 + if (nsproxy)
 + net = get_net(nsproxy-net_ns);
   }
   rcu_read_unlock();
   return net;
 ___
 Containers mailing list
 [EMAIL PROTECTED]
 https://lists.linux-foundation.org/mailman/listinfo/containers
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Null dereference in socket.c

2007-09-28 Thread Stephen Hemminger
On Fri, 28 Sep 2007 16:00:57 -0700 (PDT)
David Miller [EMAIL PROTECTED] wrote:

 From: Chuck Ebbert [EMAIL PROTECTED]
 Date: Fri, 28 Sep 2007 18:58:36 -0400
 
  After debugging an oops 
  (https://bugzilla.redhat.com/attachment.cgi?id=209231)
  I find it happens here in socket.c::sock_ioctl() line 902:
  
 default:
  =  err = sock-ops-ioctl(sock, cmd, arg);
  
  /*
   * If this ioctl is unknown try to hand it down
   * to the NIC driver.
   */
  if (err == -ENOIOCTLCMD)
  err = dev_ioctl(cmd, argp);
  break;
  
  
  ioctl is NULL and the kernel jumps to address 0. Should we add a check
  for that?
  
  Bug report:
  https://bugzilla.redhat.com/show_bug.cgi?id=306801
 
 Every protocol should provide a non-NULL -ioctl() method,
 find out which one isn't and fix it

Auditing the net-2.6.24 tree all instances found by cscope are safe.




-- 
Stephen Hemminger [EMAIL PROTECTED]
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [IPV6] Fix ICMPv6 redirect handling with target multicast address

2007-09-28 Thread David Stevens
Brian,
A multicast address should never be the target of a neighbor
discovery request; the sender should use the mapping function for all
multicasts. So, I'm not sure that your example can ever happen, and it
certainly is ok to send ICMPv6 errors to multicast addresses in general.
But I don't see that it hurts anything, either (since it should never 
happen :-)),
so I don't particularly object, either.
I think it'd also be better if you add the check to be:

if (ipv6_addr_type(target) &
(IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST))

or something along those lines, rather than reproducing ipv6_addr_type() 
code
separately in a new ipv6_addr_linklocal() function.

+-DLS


-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PKT_SCHED]: Add stateless NAT

2007-09-28 Thread Herbert Xu
On Fri, Sep 28, 2007 at 06:55:32PM +0200, Patrick McHardy wrote:

 Looking at ip_input.o as example (everything without forced inlining):
 
textdata bss dec hex filename
2076   8   02084 824 net/ipv4/ip_input.o
3483   8   03491 da3 net/ipv4/ip_input.o

If it's so big perhaps it should be inlined? It'll be a tail
call anyway.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu 许志壬 [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


delayed acks question

2007-09-28 Thread Andrea Arcangeli
Hello,

I've a few questions about ICSK_ACK_PUSHED2.

PUSHED2 is only meant to force the ack out immediately when pingpong
is set to 1, but then if pingpong is 1 the delayed acks shouldn't be
deferred anyway. However I think the trouble is that there's a race
condition in reading pingpong, pingpong is only valid for the receiver
tcp context, not for the userland code reading the receive buffer. By
the time userland reads the receive buffer the other context may have
changed.

Is PUSHED2 ever cleared? Is PUSHED ever cleared in the first place?

Why can't PUSHED2 be deleted together with the pingpong check, making
PUSHED enough to guarantee to send the ack out once the receive buffer
is empty.

We've seen hangs of 40msec during slow start caused by the delayed
acks that PUSHED2 seems to fix, but while reviewing it I can't see how
it could possibly make sense to have a pingpong check in the userland
side when pingpong can change at any time (making
TCP_QUICKACK a funny joke too).

Thanks.

PS. I'm not subscribed to netdev, I looked into this incidentally, so
please include me in the CC.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] rtnl: Simplify ASSERT_RTNL

2007-09-28 Thread Eric W. Biederman

Currently we have the call path:
macvlan_open -> dev_unicast_add -> __dev_set_rx_mode ->
__dev_set_promiscuity -> ASSERT_RTNL -> mutex_trylock

When mutex debugging is on, taking a mutex complains if we are not
allowed to sleep.  At that point we have called netif_tx_lock_bh
so we are clearly not allowed to sleep.  Arguably this is not a
problem for mutex_trylock.

However we can avoid the complaint and make the ASSERT_RTNL code
cheaper, faster and more obvious by simply calling mutex_is_locked.

So this patch adds rtnl_is_locked (which does mutex_is_locked on
the rtnl_mutex) and changes ASSERT_RTNL to use that.

Signed-off-by: Eric W. Biederman [EMAIL PROTECTED]
---
 include/linux/rtnetlink.h |4 ++--
 net/core/rtnetlink.c  |5 +
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index dff3192..9c21e45 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -714,13 +714,13 @@ extern void rtmsg_ifinfo(int type, struct net_device 
*dev, unsigned change);
 extern void rtnl_lock(void);
 extern void rtnl_unlock(void);
 extern int rtnl_trylock(void);
+extern int rtnl_is_locked(void);
 
 extern void rtnetlink_init(void);
 extern void __rtnl_unlock(void);
 
 #define ASSERT_RTNL() do { \
-   if (unlikely(rtnl_trylock())) { \
-   rtnl_unlock(); \
+   if (unlikely(!rtnl_is_locked())) { \
printk(KERN_ERR RTNL: assertion failed at %s (%d)\n, \
   __FILE__,  __LINE__); \
dump_stack(); \
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 739fbad..8bc68e6 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -85,6 +85,11 @@ int rtnl_trylock(void)
return mutex_trylock(rtnl_mutex);
 }
 
+int rtnl_is_locked(void)
+{
+   return mutex_is_locked(rtnl_mutex);
+}
+
 int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len)
 {
memset(tb, 0, sizeof(struct rtattr*)*maxattr);
-- 
1.5.3.rc6.17.g1911

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/5] net: Modify all rtnetlink methods to only work in the initial namespace

2007-09-28 Thread Eric W. Biederman

Before I can enable rtnetlink to work in all network namespaces
I need to be certain that something won't break.  So this
patch deliberately disables all of the rtnetlink methods in everything
except the initial network namespace.  After the methods have been
audited this extra check can be disabled.

Signed-off-by: Eric W. Biederman [EMAIL PROTECTED]
---
 net/bridge/br_netlink.c |9 +
 net/core/fib_rules.c|   11 +++
 net/core/neighbour.c|   18 ++
 net/core/rtnetlink.c|   19 +++
 net/decnet/dn_dev.c |   12 
 net/decnet/dn_fib.c |8 
 net/decnet/dn_route.c   |8 
 net/decnet/dn_table.c   |4 
 net/ipv4/devinet.c  |   12 
 net/ipv4/fib_frontend.c |   12 
 net/ipv4/route.c|4 
 net/ipv6/addrconf.c |   31 +++
 net/ipv6/ip6_fib.c  |4 
 net/ipv6/route.c|   12 
 net/sched/act_api.c |   10 ++
 net/sched/cls_api.c |   10 ++
 net/sched/sch_api.c |   21 +
 17 files changed, 205 insertions(+), 0 deletions(-)

diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 53ab8e0..a4ffa2b 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -13,6 +13,7 @@
 #include linux/kernel.h
 #include net/rtnetlink.h
 #include net/net_namespace.h
+#include net/sock.h
 #include br_private.h
 
 static inline size_t br_nlmsg_size(void)
@@ -107,9 +108,13 @@ errout:
  */
 static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 {
+   struct net *net = skb-sk-sk_net;
struct net_device *dev;
int idx;
 
+   if (net != init_net)
+   return 0;
+
idx = 0;
for_each_netdev(init_net, dev) {
/* not a bridge port */
@@ -135,12 +140,16 @@ skip:
  */
 static int br_rtm_setlink(struct sk_buff *skb,  struct nlmsghdr *nlh, void 
*arg)
 {
+   struct net *net = skb-sk-sk_net;
struct ifinfomsg *ifm;
struct nlattr *protinfo;
struct net_device *dev;
struct net_bridge_port *p;
u8 new_state;
 
+   if (net != init_net)
+   return -EINVAL;
+
if (nlmsg_len(nlh)  sizeof(*ifm))
return -EINVAL;
 
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 13de6f5..357bfa0 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -206,6 +206,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct 
nlmsghdr* nlh, void *arg)
struct nlattr *tb[FRA_MAX+1];
int err = -EINVAL, unresolved = 0;
 
+   if (net != init_net)
+   return -EINVAL;
+
if (nlh-nlmsg_len  nlmsg_msg_size(sizeof(*frh)))
goto errout;
 
@@ -336,12 +339,16 @@ errout:
 
 static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 {
+   struct net *net = skb-sk-sk_net;
struct fib_rule_hdr *frh = nlmsg_data(nlh);
struct fib_rules_ops *ops = NULL;
struct fib_rule *rule, *tmp;
struct nlattr *tb[FRA_MAX+1];
int err = -EINVAL;
 
+   if (net != init_net)
+   return -EINVAL;
+
if (nlh-nlmsg_len  nlmsg_msg_size(sizeof(*frh)))
goto errout;
 
@@ -517,9 +524,13 @@ skip:
 
 static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
 {
+   struct net *net = skb-sk-sk_net;
struct fib_rules_ops *ops;
int idx = 0, family;
 
+   if (net != init_net)
+   return -EINVAL;
+
family = rtnl_msg_family(cb-nlh);
if (family != AF_UNSPEC) {
/* Protocol specific dump request */
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index c52df85..27001db 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1448,6 +1448,9 @@ static int neigh_delete(struct sk_buff *skb, struct 
nlmsghdr *nlh, void *arg)
struct net_device *dev = NULL;
int err = -EINVAL;
 
+   if (net != init_net)
+   return -EINVAL;
+
if (nlmsg_len(nlh)  sizeof(*ndm))
goto out;
 
@@ -1514,6 +1517,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr 
*nlh, void *arg)
struct net_device *dev = NULL;
int err;
 
+   if (net != init_net)
+   return -EINVAL;
+
err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
if (err  0)
goto out;
@@ -1788,11 +1794,15 @@ static const struct nla_policy 
nl_ntbl_parm_policy[NDTPA_MAX+1] = {
 
 static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 {
+   struct net *net = skb-sk-sk_net;
struct neigh_table *tbl;
struct ndtmsg *ndtmsg;
struct nlattr *tb[NDTA_MAX+1];
int err;
 
+   if (net != init_net)
+   return -EINVAL;
+
err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
  

[PATCH 2/5] net: Make rtnetlink infrastructure network namespace aware

2007-09-28 Thread Eric W. Biederman

After this patch none of the netlink callbacks support anything
except the initial network namespace but the rtnetlink infrastructure
now handles multiple network namespaces.

Signed-off-by: Eric W. Biederman [EMAIL PROTECTED]
---
 include/linux/rtnetlink.h   |8 ++--
 include/net/net_namespace.h |3 +
 net/bridge/br_netlink.c |4 +-
 net/core/fib_rules.c|4 +-
 net/core/neighbour.c|4 +-
 net/core/rtnetlink.c|   96 +--
 net/decnet/dn_dev.c |4 +-
 net/decnet/dn_route.c   |2 +-
 net/decnet/dn_table.c   |4 +-
 net/ipv4/devinet.c  |4 +-
 net/ipv4/fib_semantics.c|4 +-
 net/ipv4/ipmr.c |4 +-
 net/ipv4/route.c|2 +-
 net/ipv6/addrconf.c |   14 +++---
 net/ipv6/route.c|6 +-
 net/sched/act_api.c |8 ++--
 net/sched/cls_api.c |2 +-
 net/sched/sch_api.c |4 +-
 net/wireless/wext.c |5 ++-
 19 files changed, 129 insertions(+), 53 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 9c21e45..518247e 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -582,11 +582,11 @@ extern int __rtattr_parse_nested_compat(struct rtattr 
*tb[], int maxattr,
 ({ data = RTA_PAYLOAD(rta) = len ? RTA_DATA(rta) : NULL; \
__rtattr_parse_nested_compat(tb, max, rta, len); })
 
-extern int rtnetlink_send(struct sk_buff *skb, u32 pid, u32 group, int echo);
-extern int rtnl_unicast(struct sk_buff *skb, u32 pid);
-extern int rtnl_notify(struct sk_buff *skb, u32 pid, u32 group,
+extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 
group, int echo);
+extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid);
+extern int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 
group,
   struct nlmsghdr *nlh, gfp_t flags);
-extern void rtnl_set_sk_err(u32 group, int error);
+extern void rtnl_set_sk_err(struct net *net, u32 group, int error);
 extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
 extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
  u32 id, u32 ts, u32 tsage, long expires,
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 934c840..f75607a 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -10,6 +10,7 @@
 
 struct proc_dir_entry;
 struct net_device;
+struct sock;
 struct net {
atomic_tcount;  /* To decided when the network
 *  namespace should be freed.
@@ -29,6 +30,8 @@ struct net {
struct list_headdev_base_head;
struct hlist_head   *dev_name_head;
struct hlist_head   *dev_index_head;
+
+   struct sock *rtnl;  /* rtnetlink socket */
 };
 
 #ifdef CONFIG_NET
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index a4ffa2b..f5d6933 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -97,10 +97,10 @@ void br_ifinfo_notify(int event, struct net_bridge_port 
*port)
kfree_skb(skb);
goto errout;
}
-   err = rtnl_notify(skb, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+   err = rtnl_notify(skb, init_net,0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
 errout:
if (err  0)
-   rtnl_set_sk_err(RTNLGRP_LINK, err);
+   rtnl_set_sk_err(init_net, RTNLGRP_LINK, err);
 }
 
 /*
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 357bfa0..03c803c 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -577,10 +577,10 @@ static void notify_rule_change(int event, struct fib_rule 
*rule,
kfree_skb(skb);
goto errout;
}
-   err = rtnl_notify(skb, pid, ops-nlgroup, nlh, GFP_KERNEL);
+   err = rtnl_notify(skb, init_net, pid, ops-nlgroup, nlh, GFP_KERNEL);
 errout:
if (err  0)
-   rtnl_set_sk_err(ops-nlgroup, err);
+   rtnl_set_sk_err(init_net, ops-nlgroup, err);
 }
 
 static void attach_rules(struct list_head *rules, struct net_device *dev)
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 27001db..c452584 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2466,10 +2466,10 @@ static void __neigh_notify(struct neighbour *n, int 
type, int flags)
kfree_skb(skb);
goto errout;
}
-   err = rtnl_notify(skb, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+   err = rtnl_notify(skb, init_net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
 errout:
if (err  0)
-   rtnl_set_sk_err(RTNLGRP_NEIGH, err);
+   rtnl_set_sk_err(init_net, RTNLGRP_NEIGH, err);
 }
 
 #ifdef CONFIG_ARPD
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 56fc4f9..fc49104 100644
--- 

Re: [IPV6] Fix ICMPv6 redirect handling with target multicast address

2007-09-28 Thread YOSHIFUJI Hideaki / 吉藤英明
Dave, Brian,

Let me double check this patch.

Regards,

--yoshfuji

In article [EMAIL PROTECTED] (at Fri, 28 Sep 2007 17:50:38 -0700), David 
Stevens [EMAIL PROTECTED] says:

 Brian,
 A multicast address should never be the target of a neighbor
 discovery request; the sender should use the mapping function for all
 multicasts. So, I'm not sure that your example can ever happen, and it
 certainly is ok to send ICMPv6 errors to multicast addresses in general.
 But I don't see that it hurts anything. either (since it should never 
 happen :-)),
 so I don't particularly object, either.
 I think it'd also be better if you add the check to be:
 
 if (ipv6_addr_type(target)  
 (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST))
 
 or something along those lines, rather than reproducing ipv6_addr_type() 
 code
 separately in a new ipv6_addr_linklocal() function.
 
 +-DLS
 
 
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/5] net: Make the netlink methods in rtnetlink handle multiple network namespaces

2007-09-28 Thread Eric W. Biederman

After the previous prep work this just consists of removing checks
limiting the code to work in the initial network namespace, and
updating rtmsg_ifinfo so we can generate events for devices in
something other than the initial network namespace.

Referring to other network devices, as the IFLA_LINK
and IFLA_MASTER attributes do, gets interesting if those network
devices happen to be in other network namespaces.  Currently
ifindex numbers are allocated globally so I have taken the path
of least resistance and still report the information even
though the devices they are talking about are invisible.

If applications start getting confused or when ifindex
numbers become local to the network namespace we may need
to do something different in the future.

Signed-off-by: Eric W. Biederman [EMAIL PROTECTED]
---
 net/core/rtnetlink.c |   27 +++
 1 files changed, 3 insertions(+), 24 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index fc49104..809a9fb 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -741,9 +741,6 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct 
netlink_callback *cb)
int s_idx = cb-args[0];
struct net_device *dev;
 
-   if (net != init_net)
-   return 0;
-
idx = 0;
for_each_netdev(net, dev) {
if (idx  s_idx)
@@ -946,9 +943,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct 
nlmsghdr *nlh, void *arg)
struct nlattr *tb[IFLA_MAX+1];
char ifname[IFNAMSIZ];
 
-   if (net != init_net)
-   return -EINVAL;
-
err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
if (err  0)
goto errout;
@@ -997,9 +991,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct 
nlmsghdr *nlh, void *arg)
struct nlattr *tb[IFLA_MAX+1];
int err;
 
-   if (net != init_net)
-   return -EINVAL;
-
err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
if (err  0)
return err;
@@ -1081,9 +1072,6 @@ static int rtnl_newlink(struct sk_buff *skb, struct 
nlmsghdr *nlh, void *arg)
struct nlattr *linkinfo[IFLA_INFO_MAX+1];
int err;
 
-   if (net != init_net)
-   return -EINVAL;
-
 #ifdef CONFIG_KMOD
 replay:
 #endif
@@ -1210,9 +1198,6 @@ static int rtnl_getlink(struct sk_buff *skb, struct 
nlmsghdr* nlh, void *arg)
struct sk_buff *nskb;
int err;
 
-   if (net != init_net)
-   return -EINVAL;
-
err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
if (err  0)
return err;
@@ -1248,13 +1233,9 @@ errout:
 
 static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 {
-   struct net *net = skb-sk-sk_net;
int idx;
int s_idx = cb-family;
 
-   if (net != init_net)
-   return 0;
-
if (s_idx == 0)
s_idx = 1;
for (idx=1; idxNPROTO; idx++) {
@@ -1276,6 +1257,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct 
netlink_callback *cb)
 
 void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
 {
+   struct net *net = dev-nd_net;
struct sk_buff *skb;
int err = -ENOBUFS;
 
@@ -1290,10 +1272,10 @@ void rtmsg_ifinfo(int type, struct net_device *dev, 
unsigned change)
kfree_skb(skb);
goto errout;
}
-   err = rtnl_notify(skb, init_net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
+   err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
 errout:
if (err  0)
-   rtnl_set_sk_err(init_net, RTNLGRP_LINK, err);
+   rtnl_set_sk_err(net, RTNLGRP_LINK, err);
 }
 
 /* Protected by RTNL sempahore.  */
@@ -1392,9 +1374,6 @@ static int rtnetlink_event(struct notifier_block *this, 
unsigned long event, voi
 {
struct net_device *dev = ptr;
 
-   if (dev-nd_net != init_net)
-   return NOTIFY_DONE;
-
switch (event) {
case NETDEV_UNREGISTER:
rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
-- 
1.5.3.rc6.17.g1911

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/5] net: Make AF_PACKET handle multiple network namespaces

2007-09-28 Thread Eric W. Biederman

This is done by making packet_sklist_lock and packet_sklist per
network namespace and adding an additional filter condition on
received packets to ensure they came from the proper network
namespace.

Signed-off-by: Eric W. Biederman [EMAIL PROTECTED]
---
 include/net/net_namespace.h |4 +
 net/packet/af_packet.c  |  129 +++---
 2 files changed, 87 insertions(+), 46 deletions(-)

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index f75607a..5e9fb47 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -32,6 +32,10 @@ struct net {
struct hlist_head   *dev_index_head;
 
struct sock *rtnl;  /* rtnetlink socket */
+
+   /* List of all packet sockets. */
+   rwlock_tpacket_sklist_lock;
+   struct hlist_head   packet_sklist;
 };
 
 #ifdef CONFIG_NET
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index e11000a..1c3a5a8 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -135,10 +135,6 @@ dev-hard_header == NULL (ll header is added by device, we 
cannot control it)
packet classifier depends on it.
  */
 
-/* List of all packet sockets. */
-static HLIST_HEAD(packet_sklist);
-static DEFINE_RWLOCK(packet_sklist_lock);
-
 static atomic_t packet_socks_nr;
 
 
@@ -252,9 +248,6 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct 
net_device *dev,  struct
struct sock *sk;
struct sockaddr_pkt *spkt;
 
-   if (dev-nd_net != init_net)
-   goto out;
-
/*
 *  When we registered the protocol we saved the socket in the data
 *  field for just this event.
@@ -276,6 +269,9 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct 
net_device *dev,  struct
if (skb-pkt_type == PACKET_LOOPBACK)
goto out;
 
+   if (dev-nd_net != sk-sk_net)
+   goto out;
+
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
goto oom;
 
@@ -347,7 +343,7 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct 
socket *sock,
 */
 
saddr-spkt_device[13] = 0;
-   dev = dev_get_by_name(init_net, saddr-spkt_device);
+   dev = dev_get_by_name(sk-sk_net, saddr-spkt_device);
err = -ENODEV;
if (dev == NULL)
goto out_unlock;
@@ -455,15 +451,15 @@ static int packet_rcv(struct sk_buff *skb, struct 
net_device *dev, struct packet
int skb_len = skb-len;
unsigned int snaplen, res;
 
-   if (dev-nd_net != init_net)
-   goto drop;
-
if (skb-pkt_type == PACKET_LOOPBACK)
goto drop;
 
sk = pt-af_packet_priv;
po = pkt_sk(sk);
 
+   if (dev-nd_net != sk-sk_net)
+   goto drop;
+
skb-dev = dev;
 
if (dev-header_ops) {
@@ -572,15 +568,15 @@ static int tpacket_rcv(struct sk_buff *skb, struct 
net_device *dev, struct packe
struct sk_buff *copy_skb = NULL;
struct timeval tv;
 
-   if (dev-nd_net != init_net)
-   goto drop;
-
if (skb-pkt_type == PACKET_LOOPBACK)
goto drop;
 
sk = pt-af_packet_priv;
po = pkt_sk(sk);
 
+   if (dev-nd_net != sk-sk_net)
+   goto drop;
+
if (dev-header_ops) {
if (sk-sk_type != SOCK_DGRAM)
skb_push(skb, skb-data - skb_mac_header(skb));
@@ -738,7 +734,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket 
*sock,
}
 
 
-   dev = dev_get_by_index(init_net, ifindex);
+   dev = dev_get_by_index(sk-sk_net, ifindex);
err = -ENXIO;
if (dev == NULL)
goto out_unlock;
@@ -805,15 +801,17 @@ static int packet_release(struct socket *sock)
 {
struct sock *sk = sock-sk;
struct packet_sock *po;
+   struct net *net;
 
if (!sk)
return 0;
 
+   net = sk-sk_net;
po = pkt_sk(sk);
 
-   write_lock_bh(packet_sklist_lock);
+   write_lock_bh(net-packet_sklist_lock);
sk_del_node_init(sk);
-   write_unlock_bh(packet_sklist_lock);
+   write_unlock_bh(net-packet_sklist_lock);
 
/*
 *  Unhook packet receive handler.
@@ -927,7 +925,7 @@ static int packet_bind_spkt(struct socket *sock, struct 
sockaddr *uaddr, int add
return -EINVAL;
strlcpy(name,uaddr-sa_data,sizeof(name));
 
-   dev = dev_get_by_name(init_net, name);
+   dev = dev_get_by_name(sk-sk_net, name);
if (dev) {
err = packet_do_bind(sk, dev, pkt_sk(sk)-num);
dev_put(dev);
@@ -954,7 +952,7 @@ static int packet_bind(struct socket *sock, struct sockaddr 
*uaddr, int addr_len
 
if (sll-sll_ifindex) {
err = -ENODEV;
-   dev = dev_get_by_index(init_net, sll-sll_ifindex);
+   dev = dev_get_by_index(sk-sk_net, sll-sll_ifindex);
   

Re: MSI interrupts and disable_irq

2007-09-28 Thread Jeff Garzik

Ayaz Abdulla wrote:
I am trying to track down a forcedeth driver issue described by bug 9047 
in bugzilla (2.6.23-rc7-git1 forcedeth w/ MCP55 oops under heavy load). 
I added a patch to synchronize the timer handlers so that one handler 
doesn't accidentally enable the IRQ while another timer handler is running 
(see attachment 'Add timer lock' in bug report) and for other processing 
protection.


However, the system still had an Oops. So I added a lock around the 
nv_rx_process_optimized() and the Oops has not happened (see attachment 
'New patch for locking' in bug report). This would imply a 
synchronization issue. However, the only callers of that function are 
the IRQ handler and the timer handlers (in non-NAPI case). The timer 
handlers  use disable_irq so that the IRQ handler does not contend with 
them. It looks as if disable_irq is not working properly.


This issue repros only with MSI interrupt and not legacy INTx 
interrupts. Any ideas?


(added linux-kernel to CC, since I think it's more of a general kernel 
issue)


To be brutally frank, I always thought this disable_irq() mess was a 
hack both ugly and fragile.  This disable_irq() work that appeared in a 
couple net drivers was correct at the time, so I didn't feel I had the 
justification to reject it, but it still gave me a bad feeling.


I think the scenario you outline is an illustration of the approach's 
fragility:  disable_irq() is a heavy hammer that originated with INTx, 
and it relies on a chip-specific disable method (kernel/irq/manage.c) 
that practically guarantees behavior will vary across MSI/INTx/etc.


Practices like forcedeth's unique locking work for a time, but it should 
be a warning sign any time you stray from the normal spin_lock_irqsave() 
method of synchronization.


Based on your report, it is certainly possible that there is a problem 
with MSI's desc->chip->disable() method...  but I would actually 
recommend working around the problem by making the forcedeth locking 
more standardized by removing all those disable_irq() hacks.


Using spinlocks like other net drivers (note: avoid NETIF_F_LLTX 
drivers) has a high probability of both fixing your current problem, and 
giving forcedeth a more stable foundation for the long term.  In my 
humble opinion :)


Jeff


-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: MSI interrupts and disable_irq

2007-09-28 Thread Stephen Hemminger
On Fri, 28 Sep 2007 22:47:16 -0400
Jeff Garzik [EMAIL PROTECTED] wrote:

 Ayaz Abdulla wrote:
  I am trying to track down a forcedeth driver issue described by bug 9047 
  in bugzilla (2.6.23-rc7-git1 forcedeth w/ MCP55 oops under heavy load). 
  I added a patch to synchronize the timer handlers so that one handler 
  doesn't accidentally enable the IRQ while another timer handler is running 
  (see attachment 'Add timer lock' in bug report) and for other processing 
  protection.
  
  However, the system still had an Oops. So I added a lock around the 
  nv_rx_process_optimized() and the Oops has not happened (see attachment 
  'New patch for locking' in bug report). This would imply a 
  synchronization issue. However, the only callers of that function are 
  the IRQ handler and the timer handlers (in non-NAPI case). The timer 
  handlers  use disable_irq so that the IRQ handler does not contend with 
  them. It looks as if disable_irq is not working properly.
  
  This issue repros only with MSI interrupt and not legacy INTx 
  interrupts. Any ideas?
 
 (added linux-kernel to CC, since I think it's more of a general kernel 
 issue)
 
 To be brutally frank, I always thought this disable_irq() mess was a 
 hack both ugly and fragile.  This disable_irq() work that appeared in a 
 couple net drivers was correct at the time, so I didn't feel I had the 
 justification to reject it, but it still gave me a bad feeling.
 
 I think the scenario you outline is an illustration of the approach's 
 fragility:  disable_irq() is a heavy hammer that originated with INTx, 
 and it relies on a chip-specific disable method (kernel/irq/manage.c) 
 that practically guarantees behavior will vary across MSI/INTx/etc.
 
 Practices like forcedeth's unique locking work for a time, but it should 
 be a warning sign any time you stray from the normal spin_lock_irqsave() 
 method of synchronization.
 
 Based on your report, it is certainly possible that there is a problem 
 with MSI's desc->chip->disable() method...  but I would actually 
 recommend working around the problem by making the forcedeth locking 
 more standardized by removing all those disable_irq() hacks.
 
 Using spinlocks like other net drivers (note: avoid NETIF_F_LLTX 
 drivers) has a high probability of both fixing your current problem, and 
 giving forcedeth a more stable foundation for the long term.  In my 
 humble opinion :)
 

I'll try and clean it up if the author doesn't get to it first.

-- 
Stephen Hemminger [EMAIL PROTECTED]

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC/PATCH 0/3] UDP memory usage accounting

2007-09-28 Thread Herbert Xu
Satoshi OSHIMA [EMAIL PROTECTED] wrote:

 In such a case, 300 to 500MB of memory consumption will
 be fatal. Users can easily open 1000 sockets per process
 under the default ulimit. If such sockets hold messages but
 the user processes don't receive them, almost all of the slab
 will be occupied by sk_buffs.

Well the solution to that is to have a per-user limit rather
than a system-wide limit.  Otherwise any user can stop system
daemons from using UDP.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PKT_SCHED]: Add stateless NAT

2007-09-28 Thread Herbert Xu
Herbert Xu [EMAIL PROTECTED] wrote:
 On Fri, Sep 28, 2007 at 06:55:32PM +0200, Patrick McHardy wrote:

 Looking at ip_input.o as example (everything without forced inlining):
 
    text    data     bss     dec     hex filename
    2076       8       0    2084     824 net/ipv4/ip_input.o
    3483       8       0    3491     da3 net/ipv4/ip_input.o
 
 If it's so big perhaps it should be inlined? It'll be a tail
 call anyway.

s/should be/shouldn't be/
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] rtnl: Simplify ASSERT_RTNL

2007-09-28 Thread Herbert Xu
Eric W. Biederman [EMAIL PROTECTED] wrote:
 
 Currently we have the call path:
 macvlan_open -> dev_unicast_add -> __dev_set_rx_mode ->
        __dev_set_promiscuity -> ASSERT_RTNL -> mutex_trylock
 
 When mutex debugging is on taking a mutex complains if we are not
 allowed to sleep.  At that point we have called netif_tx_lock_bh
 so we are clearly not allowed to sleep.  Arguably this is not a
 problem for mutex_trylock.

Actually holding the TX lock here is a bug.  We're going to
call down into the hardware with __dev_set_promiscuity, which
may sleep (think USB NICs), so we definitely shouldn't be holding
any spin locks.

Patrick, could we avoid taking the TX lock here somehow?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] [3/6] pasemi_mac: set interface speed correctly on XAUI ports

2007-09-28 Thread Jeff Garzik

Olof Johansson wrote:

pasemi_mac: set interface speed correctly on XAUI ports

Set interface speed for XAUI to 10G per default, not 1G.

Signed-off-by: Olof Johansson [EMAIL PROTECTED]


applied 3-6 (davem already got 1-2)


-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] mv643xx_eth: Check ETH_INT_CAUSE_STATE bit

2007-09-28 Thread Jeff Garzik

Dale Farnsworth wrote:

Commit 468d09f8946d40228c56de26fe4874b2f98067ed masked the state
interrupt (bit 20 of the cause register). This results in Radstone's
PPC7D repeatedly re-entering the interrupt routine, locking up the
board. The following patch returns the required handling for this
interrupt. 


Signed-off-by: Martyn Welch [EMAIL PROTECTED]
Signed-off-by: Dale Farnsworth [EMAIL PROTECTED]

---
Jeff, this is a bug fix.

 drivers/net/mv643xx_eth.c |2 +-
 drivers/net/mv643xx_eth.h |4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)


applied to #upstream-fixes


-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


  1   2   >