Re: [PATCH] add new nfnetlink_log subsystem

2005-08-02 Thread Harald Welte
On Mon, Aug 01, 2005 at 09:54:47PM -0300, Arnaldo Carvalho de Melo wrote:
> On 8/1/05, Arnaldo Carvalho de Melo <[EMAIL PROTECTED]> wrote:
> >   CC  net/netfilter/nfnetlink_queue.o
> > net/netfilter/nfnetlink_queue.c: In function `init_or_cleanup':
> > net/netfilter/nfnetlink_queue.c:1030: error: `proc_net_netfilter'
> > undeclared (first use in this function)
> > net/netfilter/nfnetlink_queue.c:1030: error: (Each undeclared
> > identifier is reported only once
> > net/netfilter/nfnetlink_queue.c:1030: error: for each function it appears 
> > in.)
> > make[2]: ** [net/netfilter/nfnetlink_queue.o] Erro 1
> > make[1]: ** [net/netfilter] Erro 2
> > make: ** [net/] Erro 2
> > [EMAIL PROTECTED] net-2.6.14]$
> > 
> > Are you aware of this one? :-) Perhaps Dave commited some mistake
> > while merging, this is on 2.6.14.git latest.

Yes, it's a problem in davem's tree.  my tree (and the patches I sent)
contain that directory.  I'll try to send a cleanup.

-- 
- Harald Welte <[EMAIL PROTECTED]>  http://gnumonks.org/

"Privacy in residential applications is a desirable marketing option."
  (ETSI EN 300 175-7 Ch. A6)


pgpYKFooIFxK0.pgp
Description: PGP signature


Re: [PATCH] add new nfnetlink_log subsystem

2005-08-02 Thread Harald Welte
Hi Acme!

On Mon, Aug 01, 2005 at 09:54:47PM -0300, Arnaldo Carvalho de Melo wrote:

> > Are you aware of this one? :-) Perhaps Dave commited some mistake
> > while merging, this is on 2.6.14.git latest.
> 
> Ah, this is with 'make allyesconfig'.

Dave's tree is missing the attached patch.

-- 
- Harald Welte <[EMAIL PROTECTED]>  http://gnumonks.org/

"Privacy in residential applications is a desirable marketing option."
  (ETSI EN 300 175-7 Ch. A6)
[NETFILTER] Extend netfilter logging API

This patch is in preparation to nfnetlink_log:
- loggers now have to register struct nf_logger instead of nf_logfn
- nf_log_unregister() replaced by nf_log_unregister_pf() and
  nf_log_unregister_logger()
- add comment to ip[6]t_LOG.h to assure nobody redefines flags
- add /proc/net/netfilter/nf_log to tell user which logger is currently
  registered for which address family
- if user has configured logging, but no logging backend (logger) is
  available, always spit a message to syslog, not just the first time.
- split ip[6]t_LOG.c into two parts:
  Backend: Always try to register as logger for the respective address family
  Frontend: Always log via nf_log_packet() API
- modify all users of nf_log_packet() to accommodate additional argument

Signed-off-by: Harald Welte <[EMAIL PROTECTED]>

---
commit 3770e25a01055bfa8bee52ed1db1f3a5141f
tree bb6232f07be1a0ec617f99d0e479964ac2d66119
parent 70715270f9cedc76099ed628b5444a11127912ca
author laforge <[EMAIL PROTECTED]> Do, 28 Jul 2005 21:27:14 +0200
committer laforge <[EMAIL PROTECTED]> Do, 28 Jul 2005 21:27:14 +0200

 include/linux/netfilter.h|   48 +-
 include/linux/netfilter_ipv4/ipt_LOG.h   |1 
 include/linux/netfilter_ipv6/ip6t_LOG.h  |1 
 net/core/netfilter.c |  127 +++---
 net/ipv4/netfilter/ip_conntrack_proto_icmp.c |8 +-
 net/ipv4/netfilter/ip_conntrack_proto_tcp.c  |   21 ++--
 net/ipv4/netfilter/ip_conntrack_proto_udp.c  |6 +
 net/ipv4/netfilter/ipt_LOG.c |   84 +
 net/ipv4/netfilter/ipt_ULOG.c|   33 +--
 net/ipv6/netfilter/ip6t_LOG.c|   91 ++-
 10 files changed, 295 insertions(+), 125 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -114,15 +114,51 @@ void nf_unregister_sockopt(struct nf_soc
 
 extern struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
 
-typedef void nf_logfn(unsigned int hooknum,
+/* those NF_LOG_* defines and struct nf_loginfo are legacy definitios that will
+ * disappear once iptables is replaced with pkttables.  Please DO NOT use them
+ * for any new code! */
+#define NF_LOG_TCPSEQ  0x01/* Log TCP sequence numbers */
+#define NF_LOG_TCPOPT  0x02/* Log TCP options */
+#define NF_LOG_IPOPT   0x04/* Log IP options */
+#define NF_LOG_UID 0x08/* Log UID owning local socket */
+#define NF_LOG_MASK0x0f
+
+#define NF_LOG_TYPE_LOG0x01
+#define NF_LOG_TYPE_ULOG   0x02
+
+struct nf_loginfo {
+   u_int8_t type;
+   union {
+   struct {
+   u_int32_t copy_len;
+   u_int16_t group;
+   u_int16_t qthreshold;
+   } ulog;
+   struct {
+   u_int8_t level;
+   u_int8_t logflags;
+   } log;
+   } u;
+};
+
+typedef void nf_logfn(unsigned int pf,
+ unsigned int hooknum,
  const struct sk_buff *skb,
  const struct net_device *in,
  const struct net_device *out,
+ const struct nf_loginfo *li,
  const char *prefix);
 
+struct nf_logger {
+   struct module   *me;
+   nf_logfn*logfn;
+   char*name;
+};
+
 /* Function to register/unregister log function. */
-int nf_log_register(int pf, nf_logfn *logfn);
-void nf_log_unregister(int pf, nf_logfn *logfn);
+int nf_log_register(int pf, struct nf_logger *logger);
+void nf_log_unregister_pf(int pf);
+void nf_log_unregister_logger(struct nf_logger *logger);
 
 /* Calls the registered backend logging function */
 void nf_log_packet(int pf,
@@ -130,6 +166,7 @@ void nf_log_packet(int pf,
   const struct sk_buff *skb,
   const struct net_device *in,
   const struct net_device *out,
+  struct nf_loginfo *li,
   const char *fmt, ...);

 /* Activate hook; either okfn or kfree_skb called, unless a hook
@@ -221,6 +258,11 @@ struct nf_queue_rerouter {
 extern int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer);
 exte

Re: Fw: Re: [Bugme-new] [Bug 4952] New: IPSec incompabilty. Linux kernel waits to long to start using new SA for outbound traffic.

2005-08-02 Thread Krzysztof Oledzki



On Tue, 2 Aug 2005, Patrick McHardy wrote:


Krzysztof Oledzki wrote:



On Mon, 1 Aug 2005, Herbert Xu wrote:


On Mon, Aug 01, 2005 at 05:46:26AM +0200, Krzysztof Oledzki wrote:



Any new patches to test? ;)



As I said in an earlier message, you should patch racoon to delete
the old *outbound* SA when the new SA has been negotiated.



Did not receive this one, sorry :(. However, the same question was asked
to racoon developers and the answer was, that it is kernel job. They
even pointed that KAME IPSec stack can be tuned to (or not to) prefer
old SA.


The kernel's job is to use a valid SA.


Again... RFC 2408 says: "A protocol implementation SHOULD begin using the 
newly created SA for outbound traffic and SHOULD continue to support 
incoming traffic on the old SA until it is deleted or until traffic is 
received under the protection of the newly created SA." - Section 4.3.



In this case both are valid and the peer is buggy.


The problem is the word SHOULD and IMHO both Linux and the peer are buggy.

So I think the suggestion to work around this in the keying daemons is 
not unreasonable.


There is no need to work around this on *BSD (KAME stack) and the keying 
daemon is exactly the same for both Linux and *BSD.



Best regards,

Krzysztof Olędzki

Re: Network vm deadlock... solution?

2005-08-02 Thread Ingo Oeser
Daniel Phillips wrote:
> On Tuesday 02 August 2005 07:30, Patrick McHardy wrote:
> > IIRC Linus called the whole swapping over ISCSI idea
> > broken because apparently even userspace needs to allocate memory in
> > some situations to make things work.
>
> Even a user space task may run in PF_MEMALLOC mode (and so guarantee its
> syscalls access to a modest amount of working memory) so long as it follows
> the same rules as a kernel task: all per-request memory usage must be
> statically analyzable and bounded.  But I am getting ahead of myself: the
> immediate challenge is to deal accurately with the network-receive facet of
> this multi-facetted problem.

Imagine a IPsec-SA timing out just in this case and (nearly) 
all user space pages swapped out.

We need to renegotiate an IPsec-SA with someone, which needs the 
ISAKMP-daemon, which is completely swapped out. But we have no memory to swap 
it in, since we cannot swap out without transferring to our remote "disk" 
over an IPsec secured connection.

Ugly; you see. That kind of thing makes Linus shudder, I think.


Regards

Ingo Oeser

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


ARP entry ageing time

2005-08-02 Thread Partha Chatterjee
Hello,

I wanted to know whether there is any means to get the arp ageing time,
i.e the time left before
the arp entry will be aged out.

Something similar to what Cisco switches support.


Thanks,
Partha
x3025

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC] Idea to speedup tcp lookups]

2005-08-02 Thread Eric Dumazet



 Message original 
Sujet: [RFC] Idea to speedup tcp lookups
Date: Tue, 02 Aug 2005 11:53:12 +0200
De: Eric Dumazet <[EMAIL PROTECTED]>
Pour: David S. Miller 
Copie: [EMAIL PROTECTED],  [EMAIL PROTECTED]
Références: <[EMAIL PROTECTED]>	<[EMAIL PROTECTED]>	<[EMAIL PROTECTED]> 
<[EMAIL PROTECTED]>


Hi David, Hi all

I would like to provide a patch to speedup tcp lookups, but I need your 
comments first.

1) First some performance data :


tcp_v4_rcv() waste a lot of time in __tcp_v4_lookup_established()

The most critical code is :

sk_for_each(sk, node, &head->chain) {
if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
goto hit; /* You sunk my battleship! */
}

The sk_for_each() does use prefetch() hints but only the beginning of "struct 
sock" is prefetched.
So TCP_IPV4_MATCH() has to bring into CPU cache cold cache lines.
Each iteration has to use at least 2 cache lines.

2) The goal
---

The idea I have is to change things so that TCP_IPV4_MATCH() may return FALSE 
in 95% of cases only using the data already in the CPU cache,
using one cache line per iteration.

3) Description of what is planned
--

Changes in layout are to move the "__u16   dport ; __u16   num" from "struct 
inet_sock" to the end of "struct sock_common",
where there is some padding (at least on 64 bits platforms)

File include/net/sock.h

struct sock_common {
unsigned short  skc_family;
volatile unsigned char  skc_state;
unsigned char   skc_reuse;
int skc_bound_dev_if;
struct hlist_node   skc_node;
struct hlist_node   skc_bind_node;
atomic_tskc_refcnt;
+   union  {
+   unsigned int key; /* hash key for fast lookups, or protocol 
private data */
+   unsigned short us[2];
+   } skc_u;
};

File include/linux/ip.h

struct inet_sock {
...
__u32   rcv_saddr;  /* Bound local IPv4 addr */
-   __u16   dport;  /* Destination port */
-   __u16   num;/* Local port */
...
+#define inetsk_dport sk.skc_u.us[0]
+#define inetsk_num sk.skc_u.us[1]

Then change every sk->dport to sk->inetsk_dport, and every sk->num to 
sk->inetsk_num

Doing so even save 8 bytes for sizeof(inet_sock) on 64 bits platforms :)

Then change the TCP_IPV4_MATCH macro to

File include/net/tcp.h

64 bits platforms :
#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->skc_u.key == (__ports))&&  \
((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie))   &&  \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif

32bits platforms:
#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->skc_u.key == (__ports))&&  \
(inet_sk(__sk)->daddr  == (__saddr))   &&  \
(inet_sk(__sk)->rcv_saddr  == (__daddr))   &&  \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif


This way, the comparison with (__sk->skc_u.key) should reference data already 
fetched in CPU caches, or in the same cache line as
__sk->skc_node (the next element in hash chain)

Discussion :

Instead of using (dport,num) as a key, we could use the tcp_hashfn() value to 
have better fast path, but we would use more memory.
The patch would be nicer, not changing "struct inet_sock".

Thank you for your comments and ideas.

Eric Dumazet

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Bugme-new] [Bug 4952] New: IPSec incompabilty. Linux kernel waits to long to start using new SA for outbound traffic.

2005-08-02 Thread Herbert Xu
On Mon, Aug 01, 2005 at 10:41:33AM +0200, Krzysztof Oledzki wrote:
> 
> RFC 2408 says: "A protocol implementation SHOULD begin using the newly
> created SA for outbound traffic and SHOULD continue to support incoming
> traffic on the old SA until it is deleted or until traffic is received
> under the protection of the newly created SA." - Section 4.3.
> 
> The problem is the word SHOULD and IMHO both Linux and peer are buggy.

The protocol implementation is made up of a kernel component as well as
a user-space component.  IMHO this should be done where it's easiest.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: ARP entry ageing time

2005-08-02 Thread Partha Chatterjee
Actually I tried looking into the neighbor structure, but could not find
any useful hints.

Thanks,
Partha
x3025

-Original Message-
From: [EMAIL PROTECTED] [mailto:[EMAIL PROTECTED]
On Behalf Of Partha Chatterjee
Sent: Tuesday, August 02, 2005 3:24 PM
To: netdev@vger.kernel.org
Subject: ARP entry ageing time

Hello,

I wanted to know whether there is any means to get the arp ageing time,
i.e the time left before
the arp entry will be aged out.

Something similar to what Cisco switches support.


Thanks,
Partha
x3025

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/6] nfnetlink / ctnetlink fixes + updates

2005-08-02 Thread Harald Welte
Hi Dave!

I know you're travelling, but I already have this pile of patches/fixes
for nfnetlink and ctnetlink.  I'm sorry to keep patch-bombing you, but
I think it's better we find those issues right now (before it is in mainline)
rather than later, when it already is in the mainline tree.

Please apply in sequential order (and also apply the nf_log patch you
seem to have missed a short time ago).

Have fun at UKUUG,
Harald

-- 
- Harald Welte <[EMAIL PROTECTED]> http://netfilter.org/

  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."-- Paul Vixie


pgpEfjgoozpO5.pgp
Description: PGP signature


[PATCH 2/6] Fix locking during ctnetlink_new_conntrack()

2005-08-02 Thread Harald Welte
[NETFILTER] conntrack_netlink: Fix locking during conntrack_create

The current codepath allowed for ip_conntrack_lock to be unlock'ed twice.

Signed-off-by: Pablo Neira <[EMAIL PROTECTED]>
Signed-off-by: Harald Welte <[EMAIL PROTECTED]>

---
commit 0432a984f1e502d036115b1f25da8675ee9cebc4
tree aeb9371bcc39ed9c0d005d7959680a7f92a8f8bd
parent c4e2485887523a16e9c37a21a1d95ac10633ef0a
author Harald Welte <[EMAIL PROTECTED]> Di, 02 Aug 2005 09:38:31 +0200
committer Harald Welte <[EMAIL PROTECTED]> Di, 02 Aug 2005 09:38:31 +0200

 net/ipv4/netfilter/ip_conntrack_netlink.c |   13 +++--
 1 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c 
b/net/ipv4/netfilter/ip_conntrack_netlink.c
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -1052,13 +1052,14 @@ ctnetlink_new_conntrack(struct sock *ctn
err = -ENOENT;
if (nlh->nlmsg_flags & NLM_F_CREATE)
err = ctnetlink_create_conntrack(cda, &otuple, &rtuple);
+   return err;
+   }
+   /* implicit 'else' */
+
+   /* we only allow nat config for new conntracks */
+   if (cda[CTA_NAT-1]) {
+   err = -EINVAL;
goto out_unlock;
-   } else {
-   /* we only allow nat config for new conntracks */
-   if (cda[CTA_NAT-1]) {
-   err = -EINVAL;
-   goto out_unlock;
-   }
}
 
/* We manipulate the conntrack inside the global conntrack table lock,
-- 
- Harald Welte <[EMAIL PROTECTED]> http://netfilter.org/

  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."-- Paul Vixie


pgpsQAFsoTvzR.pgp
Description: PGP signature


[PATCH 1/6] remove bogus memset() calls from ip_conntrack_netlink.c

2005-08-02 Thread Harald Welte
[NETFILTER] remove bogus memset() calls from ip_conntrack_netlink.c

nfattr_parse_nested() calls nfattr_parse() which in turn does a memset on
the 'tb' array.  All callers therefore don't need to memset before calling
it.

Signed-off-by: Pablo Neira <[EMAIL PROTECTED]>
Signed-off-by: Harald Welte <[EMAIL PROTECTED]>

---
commit c4e2485887523a16e9c37a21a1d95ac10633ef0a
tree a86946c1fa855ed8a76be2efe4fef3c8a5a1c7a6
parent 883b2070f68548537042038d9868c829fc20d463
author Harald Welte <[EMAIL PROTECTED]> Mo, 01 Aug 2005 22:21:59 +0200
committer Harald Welte <[EMAIL PROTECTED]> Mo, 01 Aug 2005 22:21:59 +0200

 net/ipv4/netfilter/ip_conntrack_netlink.c |8 
 1 files changed, 0 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c 
b/net/ipv4/netfilter/ip_conntrack_netlink.c
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -479,7 +479,6 @@ ctnetlink_parse_tuple_ip(struct nfattr *
 
DEBUGP("entered %s\n", __FUNCTION__);
 
-   memset(tb, 0, CTA_IP_MAX * sizeof(tb));

if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0)
goto nfattr_failure;
@@ -522,8 +521,6 @@ ctnetlink_parse_tuple_proto(struct nfatt
 
DEBUGP("entered %s\n", __FUNCTION__);
 
-   memset(tb, 0, CTA_PROTO_MAX * sizeof(tb));
-   
if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0)
goto nfattr_failure;
 
@@ -556,7 +553,6 @@ ctnetlink_parse_tuple(struct nfattr *cda
 
DEBUGP("entered %s\n", __FUNCTION__);
 
-   memset(tb, 0, CTA_TUPLE_MAX * sizeof(tb));
memset(tuple, 0, sizeof(*tuple));
 
if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0)
@@ -607,8 +603,6 @@ static int ctnetlink_parse_nat_proto(str
 
DEBUGP("entered %s\n", __FUNCTION__);
 
-   memset(tb, 0, CTA_PROTONAT_MAX * sizeof(tb));
-
if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0)
goto nfattr_failure;
 
@@ -646,7 +640,6 @@ ctnetlink_parse_nat(struct nfattr *cda[]
 
DEBUGP("entered %s\n", __FUNCTION__);
 
-   memset(tb, 0, CTA_NAT_MAX * sizeof(tb));
memset(range, 0, sizeof(*range));

if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0)
@@ -684,7 +677,6 @@ ctnetlink_parse_help(struct nfattr *attr
struct nfattr *tb[CTA_HELP_MAX];
 
DEBUGP("entered %s\n", __FUNCTION__);
-   memset(tb, 0, CTA_HELP_MAX * sizeof(tb));
 
if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0)
goto nfattr_failure;
-- 
- Harald Welte <[EMAIL PROTECTED]> http://netfilter.org/

  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."-- Paul Vixie


pgpvwckjXkwev.pgp
Description: PGP signature


[PATCH 6/6] don't use nested attributes for conntrack_expect

2005-08-02 Thread Harald Welte
[NETFILTER] don't use nested attributes for conntrack_expect

We used to use nested nfattr structures for ip_conntrack_expect.  This is
bogus, since ip_conntrack and ip_conntrack_expect are communicated in
different netlink message types.  Both should be encoded as top-level
attributes, no extra nesting required.  This patch addresses the issue.

Signed-off-by: Harald Welte <[EMAIL PROTECTED]>

---
commit 2c431226a413ddc7071c8db7e4be560a1b49203a
tree c45ad65d012604adc37d597bf27f1634d484d9ff
parent 2f6e0aaca19a462e324ed78f01dfa06e36d73054
author Harald Welte <[EMAIL PROTECTED]> Di, 02 Aug 2005 12:49:18 +0200
committer Harald Welte <[EMAIL PROTECTED]> Di, 02 Aug 2005 12:49:18 +0200

 include/linux/netfilter/nfnetlink_conntrack.h |3 +
 net/ipv4/netfilter/ip_conntrack_netlink.c |   85 +++--
 2 files changed, 41 insertions(+), 47 deletions(-)

diff --git a/include/linux/netfilter/nfnetlink_conntrack.h 
b/include/linux/netfilter/nfnetlink_conntrack.h
--- a/include/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -33,7 +33,6 @@ enum ctattr_type {
CTA_COUNTERS_ORIG,
CTA_COUNTERS_REPLY,
CTA_USE,
-   CTA_EXPECT,
CTA_ID,
__CTA_MAX
 };
@@ -103,10 +102,12 @@ enum ctattr_protonat {
 
 enum ctattr_expect {
CTA_EXPECT_UNSPEC,
+   CTA_EXPECT_MASTER,
CTA_EXPECT_TUPLE,
CTA_EXPECT_MASK,
CTA_EXPECT_TIMEOUT,
CTA_EXPECT_ID,
+   CTA_EXPECT_HELP_NAME,
__CTA_EXPECT_MAX
 };
 #define CTA_EXPECT_MAX (__CTA_EXPECT_MAX - 1)
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c 
b/net/ipv4/netfilter/ip_conntrack_netlink.c
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -1100,18 +1100,21 @@ static inline int
 ctnetlink_exp_dump_expect(struct sk_buff *skb,
   const struct ip_conntrack_expect *exp)
 {
+   struct ip_conntrack *master = exp->master;
u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ);
u_int32_t id = htonl(exp->id);
-   struct nfattr *nest_parms = NFA_NEST(skb, CTA_EXPECT);
 
if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0)
goto nfattr_failure;
if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0)
goto nfattr_failure;
+   if (ctnetlink_exp_dump_tuple(skb,
+&master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+CTA_EXPECT_MASTER) < 0)
+   goto nfattr_failure;

NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout);
NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id);
-   NFA_NEST_END(skb, nest_parms);
 
return 0;

@@ -1259,10 +1262,8 @@ ctnetlink_get_expect(struct sock *ctnl, 
return 0;
}
 
-   if (cda[CTA_TUPLE_ORIG-1])
-   err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
-   else if (cda[CTA_TUPLE_REPLY-1])
-   err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
+   if (cda[CTA_EXPECT_MASTER-1])
+   err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER);
else
return -EINVAL;
 
@@ -1310,13 +1311,33 @@ ctnetlink_del_expect(struct sock *ctnl, 
struct ip_conntrack_helper *h;
int err;
 
-   /* delete by tuple needs either orig or reply tuple */
-   if (cda[CTA_TUPLE_ORIG-1])
-   err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
-   else if (cda[CTA_TUPLE_REPLY-1])
-   err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
-   else if (cda[CTA_HELP_NAME-1]) {
-   char *name = NFA_DATA(cda[CTA_HELP_NAME-1]);
+   if (cda[CTA_EXPECT_TUPLE-1]) {
+   /* delete a single expect by tuple */
+   err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
+   if (err < 0)
+   return err;
+
+   /* bump usage count to 2 */
+   exp = ip_conntrack_expect_find_get(&tuple);
+   if (!exp)
+   return -ENOENT;
+
+   if (cda[CTA_EXPECT_ID-1]) {
+   u_int32_t id = 
+   *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
+   if (exp->id != ntohl(id)) {
+   ip_conntrack_expect_put(exp);
+   return -ENOENT;
+   }
+   }
+
+   /* after list removal, usage count == 1 */
+   ip_conntrack_unexpect_related(exp);
+   /* have to put what we 'get' above. 
+* after this line usage count == 0 */
+   ip_conntrack_expect_put(exp);
+   } else if (cda[CTA_EXPECT_HELP_NAME-1]) {
+   char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]);
 
   

[PATCH 5/6] cleanup nfnetlink_check_attributes()

2005-08-02 Thread Harald Welte
[NETFILTER] cleanup nfnetlink_check_attributes()

1) memset return parameter 'cda' (nfattr pointer array) only on success
2) a message without attributes and just a 'struct nfgenmsg' is valid,
   don't return -EINVAL
3) use likely() and unlikely() where appropriate

Signed-off-by: Harald Welte <[EMAIL PROTECTED]>

---
commit 2f6e0aaca19a462e324ed78f01dfa06e36d73054
tree a0a8b02e5bf16da87d816baad7650151c355126e
parent 9ae30513b70ed5325f66adc02788a7d6ef69cb1e
author Harald Welte <[EMAIL PROTECTED]> Di, 02 Aug 2005 12:24:48 +0200
committer Harald Welte <[EMAIL PROTECTED]> Di, 02 Aug 2005 12:24:48 +0200

 net/netfilter/nfnetlink.c |   19 ++-
 1 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -163,17 +163,16 @@ nfnetlink_check_attributes(struct nfnetl
cb_id, subsys->cb_count);
return -EINVAL;
}
-   
-   attr_count = subsys->cb[cb_id].attr_count;
-
-   memset(cda, 0, sizeof(struct nfattr *) * attr_count);
 
-   /* check attribute lengths. */
min_len = NLMSG_ALIGN(sizeof(struct nfgenmsg));
-   if (nlh->nlmsg_len < min_len)
+   if (unlikely(nlh->nlmsg_len < min_len))
return -EINVAL;
 
-   if (nlh->nlmsg_len > min_len) {
+   attr_count = subsys->cb[cb_id].attr_count;
+   memset(cda, 0, sizeof(struct nfattr *) * attr_count);
+
+   /* check attribute lengths. */
+   if (likely(nlh->nlmsg_len > min_len)) {
struct nfattr *attr = NFM_NFA(NLMSG_DATA(nlh));
int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
 
@@ -186,8 +185,10 @@ nfnetlink_check_attributes(struct nfnetl
}
attr = NFA_NEXT(attr, attrlen);
}
-   } else
-   return -EINVAL;
+   }
+
+   /* implicit: if nlmsg_len == min_len, we return 0, and an empty
+* (zeroed) cda[] array. The message is valid, but empty. */
 
 return 0;
 }
-- 
- Harald Welte <[EMAIL PROTECTED]> http://netfilter.org/

  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."-- Paul Vixie


pgpl9YYoZogw7.pgp
Description: PGP signature


[PATCH 3/6] Fix ctnetlink_create_expect() mask parsing

2005-08-02 Thread Harald Welte
[NETFILTER] fix ctnetlink 'create_expect' parsing

There was a stupid copy+paste mistake where we parse the MASK nfattr into
the "tuple" variable instead of the "mask" variable.  This patch fixes it.
Thanks to Pablo Neira.

Signed-off-by: Harald Welte <[EMAIL PROTECTED]>

---
commit 5d03469be285d9bc2b82861d87c667cf5614132d
tree f5551e270c07504ca3f7e234504a3fa2ea6f6728
parent 1f7c0373f4ab32a46ffdee952238b2a596119cb0
author Harald Welte <[EMAIL PROTECTED]> Di, 02 Aug 2005 11:51:56 +0200
committer Harald Welte <[EMAIL PROTECTED]> Di, 02 Aug 2005 11:51:56 +0200

 net/ipv4/netfilter/ip_conntrack_netlink.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c 
b/net/ipv4/netfilter/ip_conntrack_netlink.c
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -1388,7 +1388,7 @@ ctnetlink_create_expect(struct nfattr *c
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
if (err < 0)
return err;
-   err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASK);
+   err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK);
if (err < 0)
return err;
 
-- 
- Harald Welte <[EMAIL PROTECTED]> http://netfilter.org/

  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."-- Paul Vixie


pgp84OdGkhvhz.pgp
Description: PGP signature


[PATCH 4/6] move nfnetlink attr_count from subsys -> callback

2005-08-02 Thread Harald Welte
[NETFILTER] attribute count is an attribute of message type, not subsystem

Prior to this patch, every nfnetlink subsystem had to specify its
attribute count.  However, in reality the attribute count depends on the
message type within the subsystem, not the subsystem itself.  This patch
moves 'attr_count' from 'struct nfnetlink_subsys' into nfnl_callback to
fix this.

Signed-off-by: Harald Welte <[EMAIL PROTECTED]>

---
commit 9ae30513b70ed5325f66adc02788a7d6ef69cb1e
tree 993ba601e02c1be1e23896d4b62ac45de0bb7064
parent 5d03469be285d9bc2b82861d87c667cf5614132d
author Harald Welte <[EMAIL PROTECTED]> Di, 02 Aug 2005 12:17:57 +0200
committer Harald Welte <[EMAIL PROTECTED]> Di, 02 Aug 2005 12:17:57 +0200

 include/linux/netfilter/nfnetlink.h   |4 ++--
 net/ipv4/netfilter/ip_conntrack_netlink.c |9 +++--
 net/netfilter/nfnetlink.c |   20 
 net/netfilter/nfnetlink_log.c |5 +++--
 net/netfilter/nfnetlink_queue.c   |4 +++-
 5 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/include/linux/netfilter/nfnetlink.h 
b/include/linux/netfilter/nfnetlink.h
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -85,9 +85,10 @@ struct nfgenmsg {
 
 struct nfnl_callback
 {
-   kernel_cap_t cap_required; /* capabilities required for this msg */
int (*call)(struct sock *nl, struct sk_buff *skb, 
struct nlmsghdr *nlh, struct nfattr *cda[], int *errp);
+   kernel_cap_t cap_required; /* capabilities required for this msg */
+   u_int16_t attr_count;   /* number of nfattr's */
 };
 
 struct nfnetlink_subsystem
@@ -95,7 +96,6 @@ struct nfnetlink_subsystem
const char *name;
__u8 subsys_id; /* nfnetlink subsystem ID */
__u8 cb_count;  /* number of callbacks */
-   u_int32_t attr_count;   /* number of nfattr's */
struct nfnl_callback *cb; /* callback for individual types */
 };
 
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c 
b/net/ipv4/netfilter/ip_conntrack_netlink.c
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -1484,21 +1484,28 @@ static struct notifier_block ctnl_notifi
 
 static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
[IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack,
+   .attr_count = CTA_MAX,
.cap_required = CAP_NET_ADMIN },
[IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack,
+   .attr_count = CTA_MAX,
.cap_required = CAP_NET_ADMIN },
[IPCTNL_MSG_CT_DELETE]  = { .call = ctnetlink_del_conntrack,
+   .attr_count = CTA_MAX,
.cap_required = CAP_NET_ADMIN },
[IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack,
+   .attr_count = CTA_MAX,
.cap_required = CAP_NET_ADMIN },
 };
 
 static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_MAX] = {
[IPCTNL_MSG_EXP_GET]= { .call = ctnetlink_get_expect,
+   .attr_count = CTA_EXPECT_MAX,
.cap_required = CAP_NET_ADMIN },
[IPCTNL_MSG_EXP_NEW]= { .call = ctnetlink_new_expect,
+   .attr_count = CTA_EXPECT_MAX,
.cap_required = CAP_NET_ADMIN },
[IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect,
+   .attr_count = CTA_EXPECT_MAX,
.cap_required = CAP_NET_ADMIN },
 };
 
@@ -1506,7 +1513,6 @@ static struct nfnetlink_subsystem ctnl_s
.name   = "conntrack",
.subsys_id  = NFNL_SUBSYS_CTNETLINK,
.cb_count   = IPCTNL_MSG_MAX,
-   .attr_count = CTA_MAX,
.cb = ctnl_cb,
 };
 
@@ -1514,7 +1520,6 @@ static struct nfnetlink_subsystem ctnl_e
.name   = "conntrack_expect",
.subsys_id  = NFNL_SUBSYS_CTNETLINK_EXP,
.cb_count   = IPCTNL_MSG_EXP_MAX,
-   .attr_count = CTA_MAX,
.cb = ctnl_exp_cb,
 };
 
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -155,8 +155,18 @@ nfnetlink_check_attributes(struct nfnetl
   struct nlmsghdr *nlh, struct nfattr *cda[])
 {
int min_len;
+   u_int16_t attr_count;
+   u_int8_t

RE: ARP entry ageing time

2005-08-02 Thread Partha Chatterjee
Hello,

Could someone please help me out. I am desperately looking out for a
solution.

Thanks,
Partha
x3025

-Original Message-
From: Partha Chatterjee 
Sent: Tuesday, August 02, 2005 3:57 PM
To: Partha Chatterjee; netdev@vger.kernel.org
Subject: RE: ARP entry ageing time

Actually I tried looking into the neighbor structure, but could not find
any useful hints.

Thanks,
Partha
x3025

-Original Message-
From: [EMAIL PROTECTED] [mailto:[EMAIL PROTECTED]
On Behalf Of Partha Chatterjee
Sent: Tuesday, August 02, 2005 3:24 PM
To: netdev@vger.kernel.org
Subject: ARP entry ageing time

Hello,

I wanted to know whether there is any means to get the arp ageing time,
i.e the time left before
the arp entry will be aged out.

Something similar to what Cisco switches support.


Thanks,
Partha
x3025

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [openib-general] Re: [Rdma-developers] Meeting (07/22)summary:OpenRDMA community development discussion

2005-08-02 Thread 'Christoph Hellwig'
> Can you provide more details on exactly why you think this is a horrible
> idea?  I agree it will be complex, but it _could_ be scoped such that the
> complexity is reduced.  For instance, the "offload" function could fail
> (with EBUSY or something) if there is _any_ data pending on the socket.
> Thus removing any requirement to pass down pending unacked outgoing data, or
> pending data that has been received but not yet "read" by the application.
> The idea here is that the applications at the top "know" they are going into
> RDMA mode and have effectively quiesced the connection before attempting to
> move the connection into RDMA mode.  We could, in fact, _require_ the
> connect be quiesced to keep things simpler.  I'm quickly sinking into gory
> details, but I want to know if you have other reasons (other than the
> complexity) for why this is a bad idea.

I think your writeup here is more than explanation enough.  The offload
can only work for few special cases, and even for those it's rather
complicated, especially if you take things as ipsec or complex tunneling
that get more and more common into account.  What do you achieve by
implementing the offload except trying to make it look more integrated
to the user than it actually is?  Just offload RDMA protocols to the
RDMA hardware and keep the IP stack out of that complexity.

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2.6.13-rc4 - kernel panic - BUG at net/ipv4/tcp_output.c:918

2005-08-02 Thread Herbert Xu
On Sun, Jul 31, 2005 at 09:46:53PM -0700, David S. Miller wrote:
> 
> I've tried to avoid doing that, but I may need to capitulate for now.
> My concern was that the divide that thing does has non-trivial cost.

You're right, this could be expensive.
 
> Wait... that's not true, multiple SKBs can have it set already
> if we tso_fragment() or tcp_fragment() and then the tcp_transmit_skb()
> fails (clone allocation failure, for example).

Agreed.

> Another idea is to make tcp_init_tso_segs() reset the values if
> the MSS doesn't match up.  This should work and points out another

That's a much better idea!
 
> Anyways, the following compile-tested-only patch shows my idea.
> What do you think about this Herbert?

The patch looks good.  However, I spotted something that might be
broken due to an earlier change.

> @@ -569,7 +567,7 @@ int tcp_trim_head(struct sock *sk, struc
>* factor and mss.
>*/
>   if (tcp_skb_pcount(skb) > 1)
> - tcp_set_skb_tso_segs(sk, skb);
> + tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));

We've got to be careful here because this is a packet that's already
been transmitted and the caller (tcp_tso_acked) wasn't designed to
handle a decrease in MSS.

Actually it seems that most of the code in tcp_tso_acked will work
with a negative packets_acked.  The only questionable bit is the
change made to fackets_out.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [openib-general] Re: [Rdma-developers] Meeting (07/22)summary:OpenRDMA community development discussion

2005-08-02 Thread Caitlin Bestler
Generally there are two cases to consider: when the TCP mode is not visible
and when it is.

When it is not visible it is certainly easy to manage the TCP connection
with subset logic within the RDMA stack and never involve the host
stack. This is certainly what the initial proposal will rely upon. In 
the long term it has the problems you cited. Having two stacks
accept TCP connections means that *both* must be updated
to stay current with the latest DoS attacks. While it is more 
work for the RDMA device, I think there is general agreement 
among the hardware vendors that this is something that the
OS *should* retain control of. Deciding which connections may
be accepted is inherently an OS function.

Beyond that there is a distinct programming model, already 
accepted in IETF specifications, that requires the application
to begin work in streaming (i.e., socket) mode, and then only
convert to RDMA mode once the two peers have agreed upon
that optimization. To support that model you will eventually 
have to allow the host stack to transfer a TCP connection to
the RDMA stack *or* you will require the RDMA stack to 
provide full TCP/socket functionality.

So the real question is not whether to allow the RDMA stack
to "take" a connection from the host stack, but whether to
force the RDMA stack to yield control of the connection to
the host during critical connection setup so that this step
remains firmly under OS control and oversight.


On 8/2/05, Tom Tucker <[EMAIL PROTECTED]> wrote:
>  
>  
>  'Christoph Hellwig' wrote:
>  
>  
>  Can you provide more details on exactly why you think this is a horrible
> idea? I agree it will be complex, but it _could_ be scoped such that the
> complexity is reduced. For instance, the "offload" function could fail
> (with EBUSY or something) if there is _any_ data pending on the socket.
> Thus removing any requirement to pass down pending unacked outgoing data, or
> pending data that has been received but not yet "read" by the application.
> The idea here is that the applications at the top "know" they are going into
> RDMA mode and have effectively quiesced the connection before attempting to
> move the connection into RDMA mode. We could, in fact, _require_ the
> connect be quiesced to keep things simpler. I'm quickly sinking into gory
> details, but I want to know if you have other reasons (other than the
> complexity) for why this is a bad idea.
>  
>  I think your writeup here is more than explanation enough. The offload
> can only work for few special cases, and even for those it's rather
> complicated, especially if you take things as ipsec or complex tunneling
> that get more and more common into account. 
>  I think Steve's point was that it *can* be simplified as necessary to meet
> the demands/needs of the Linux community. It is certainly technically
> possible, but agreeably complicated to offload an active socket.
>  
>  
>  What do you achieve by
> implementing the offload except trying to make it look more integrated
> to the user than it actually is? Just offload RDMA protocols to the
> RDMA hardware and keep the IP stack out of that complexity.
>  You get the benefit of things like SYN flood DOS attack avoidance built
> into the host stack without replicating this functionality in the offloaded
> adapter. There are other benefits of integration like failover, etc... IMHO,
> however, the bulk of the benefits are for ULP offload like RDMA where the
> remote peer may not be capable of HW RDMA acceleration. This kind of thing
> could be determined in "streaming mode" using the host stack and then
> migrated to an adapter for HW acceleration only if the remote peer is
> capable.
> 
>  
>  
>  ___
> openib-general mailing list
> openib-general@openib.org
> http://openib.org/mailman/listinfo/openib-general
> 
> To unsubscribe, please visit
> http://openib.org/mailman/listinfo/openib-general
>  
>  
>  
> ___
> openib-general mailing list
> openib-general@openib.org
> http://openib.org/mailman/listinfo/openib-general
> 
> To unsubscribe, please visit
> http://openib.org/mailman/listinfo/openib-general
> 
> 
b
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Bugme-new] [Bug 4952] New: IPSec incompabilty. Linux kernel waits to long to start using new SA for outbound traffic.

2005-08-02 Thread Krzysztof Oledzki



On Tue, 2 Aug 2005, Herbert Xu wrote:


On Mon, Aug 01, 2005 at 10:41:33AM +0200, Krzysztof Oledzki wrote:


RFC 2408 says: "A protocol implementation SHOULD begin using the newly
created SA for outbound traffic and SHOULD continue to support incoming
traffic on the old SA until it is deleted or until traffic is received
under the protection of the newly created SA." - Section 4.3.

The problem is the word SHOULD and IMHO both Linux and peer are buggy.


The protocol implementation is made up of a kernel component as well as
a user-space component.  IMHO this should be done where it's easiest.


IMHO userland is not supposed to solve kernel issues.

Best regards,

Krzysztof Olędzki

Re: Network vm deadlock... solution?

2005-08-02 Thread Sridhar Samudrala
On Tue, 2005-08-02 at 06:54 +1000, Daniel Phillips wrote:
> Hi guys,
> 
> Well I have been reading net code seriously for two days, so I am still 
> basically a complete network klutz.  But we have a nasty network-related vm 
> deadlock that needs fixing and there seems to be little choice but to wade in 
> and try to sort things out.
> 

We are also working on a similar problem where a set of critical TCP
connections need to successfully send/receive messages even under very
low memory conditions. But the assumption is that the low memory
situation lasts only for a short time(in the order of few minutes)
which should not cause any TCP timeouts to expire so that normal
connections can recover once the low memory situation is resolved.

> Here is the plan:
> 
>   * All protocols used on an interface that supports block IO must be
> vm-aware.
> 
> If we wish, we can leave it up to the administrator to ensure that only 
> vm-aware protocols are used on an interface that supports block IO, or we can 
> do some automatic checking.
> 
>   * Any socket to be used for block IO will be marked as a "vmhelper".

I am assuming your 'vmhelper' is similar to a critical socket which can
be marked using a new socket option(ex: SO_CRITICAL).

> 
> The number of protocols that need to have this special knowledge is quite 
> small, e.g.: tcp, udp, sctp, icmp, arp, maybe a few others.  We are talking 
> about a line or two of code in each to add the necessary awareness.
> 
>   * Inside the network driver, when memory is low we will allocate space
> for every incoming packet from a memory reserve, regardless of whether
> it is related to block IO or not.
> 
>   * Under low memory, we call the protocol layer synchronously instead of
> queuing the packet through softnet.
> 
> We do not necessarily have to bypass softnet, since there is a mechanism for 
> throttling packets at this point.  However, there is a big problem with 
> throttling here: we haven't classified the packet yet, so the throttling 
> might discard some block IO packets, which is exactly what we don't want to 
> do under memory pressure.
> 
>   * The protocol receive handler does the socket lookup, then if memory is
> low, discards any packet not belonging to a vmhelper socket.
> 
> Roughly speaking, the driver allocates each skb via:
> 
> skb = memory_pressure ? dev_alloc_skb_reserve() : dev_alloc_skb();

Instead of changing all the drivers to make them vm aware, we could add
a new priority flag(something like GFP_CRITICAL) which can be passed to
__dev_alloc_skb(). dev_alloc_skb becomes
return __dev_alloc_skb(length, GFP_ATOMIC|GFP_CRITICAL);

Based on the memory pressure condition, the VM can decide if the skb
needs to allocated from an emergency reserve.

> 
> Then the driver hands off the packet to netif_rx, which does:
> 
> if (from_reserve(skb)) {
>   netif_receive_skb(skb);
> return;
>   }
>
> And in the protocol handler we have:
> 
> if (memory_pressure && !is_vmhelper(sock) && from_reserve(skb))
> goto drop_the_packet;

I am not sure if we need the from_reserve() checks above.
We have to assume that all incoming packets are critical until we can
find the matching sk in the protocol handler code.

> 
> That is pretty much it.  Now, being a net newbie, it is not entirely clear to 
> me that we can call netif_receive_skb directly when packets are also being 
> queued through the softnet interface.  May I have some guidance on this 
> point, please?
> 
> If that works, I am prepared to justify and prove the rest.
> 
> Regards,
> 
> Daniel
> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to [EMAIL PROTECTED]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] SCTP: fix many sparse endian warnings

2005-08-02 Thread Sridhar Samudrala
On Fri, 2005-07-29 at 03:53 +0400, Alexey Dobriyan wrote:
> * Use __be16, __be32.
> * Add SCTP_NL() in the spirit of SCTP_U32() and friends.
> * Tiny tweak in sctp_chunk_assign_ssn().

How do we run sparse to check for endian errors?
I tried make C=1 net/sctp/sctp.ko, but i didn't see any
endian warnings.

But your changes look fine.

Thanks
Sridhar

> 
> Signed-off-by: Alexey Dobriyan <[EMAIL PROTECTED]>
> ---
> 
>  include/linux/sctp.h   |   60 
> ++---
>  include/net/sctp/command.h |2 +
>  include/net/sctp/sctp.h|2 -
>  include/net/sctp/sm.h  |6 ++--
>  include/net/sctp/structs.h |4 +--
>  include/net/sctp/tsnmap.h  |4 +--
>  include/net/sctp/user.h|2 -
>  net/sctp/associola.c   |2 -
>  net/sctp/ipv6.c|4 +--
>  net/sctp/outqueue.c|2 -
>  net/sctp/protocol.c|2 -
>  net/sctp/sm_make_chunk.c   |   31 +++
>  net/sctp/sm_statefuns.c|   10 +++
>  net/sctp/ulpevent.c|2 -
>  14 files changed, 67 insertions(+), 66 deletions(-)
> 
> diff -uprN linux-vanilla/include/linux/sctp.h linux-sctp/include/linux/sctp.h
> --- linux-vanilla/include/linux/sctp.h2005-07-29 03:31:11.0 
> +0400
> +++ linux-sctp/include/linux/sctp.h   2005-07-29 03:39:50.0 +0400
> @@ -57,17 +57,17 @@
>  
>  /* Section 3.1.  SCTP Common Header Format */
>  typedef struct sctphdr {
> - __u16 source;
> - __u16 dest;
> - __u32 vtag;
> - __u32 checksum;
> + __be16 source;
> + __be16 dest;
> + __be32 vtag;
> + __be32 checksum;
>  } __attribute__((packed)) sctp_sctphdr_t;
>  
>  /* Section 3.2.  Chunk Field Descriptions. */
>  typedef struct sctp_chunkhdr {
>   __u8 type;
>   __u8 flags;
> - __u16 length;
> + __be16 length;
>  } __attribute__((packed)) sctp_chunkhdr_t;
>  
> 
> @@ -153,8 +153,8 @@ enum { SCTP_CHUNK_FLAG_T = 0x01 };
>   */
>  
>  typedef struct sctp_paramhdr {
> - __u16 type;
> - __u16 length;
> + __be16 type;
> + __be16 length;
>  } __attribute__((packed)) sctp_paramhdr_t;
>  
>  typedef enum {
> @@ -203,9 +203,9 @@ enum { SCTP_PARAM_ACTION_MASK = __consta
>  /* RFC 2960 Section 3.3.1 Payload Data (DATA) (0) */
>  
>  typedef struct sctp_datahdr {
> - __u32 tsn;
> - __u16 stream;
> - __u16 ssn;
> + __be32 tsn;
> + __be16 stream;
> + __be16 ssn;
>   __u32 ppid;
>   __u8  payload[0];
>  } __attribute__((packed)) sctp_datahdr_t;
> @@ -232,11 +232,11 @@ enum { SCTP_DATA_FRAG_MASK = 0x03, };
>   *  endpoints.
>   */
>  typedef struct sctp_inithdr {
> - __u32 init_tag;
> - __u32 a_rwnd;
> - __u16 num_outbound_streams;
> - __u16 num_inbound_streams;
> - __u32 initial_tsn;
> + __be32 init_tag;
> + __be32 a_rwnd;
> + __be16 num_outbound_streams;
> + __be16 num_inbound_streams;
> + __be32 initial_tsn;
>   __u8  params[0];
>  } __attribute__((packed)) sctp_inithdr_t;
>  
> @@ -261,7 +261,7 @@ typedef struct sctp_ipv6addr_param {
>  /* Section 3.3.2.1 Cookie Preservative (9) */
>  typedef struct sctp_cookie_preserve_param {
>   sctp_paramhdr_t param_hdr;
> - uint32_tlifespan_increment;
> + __be32lifespan_increment;
>  } __attribute__((packed)) sctp_cookie_preserve_param_t;
>  
>  /* Section 3.3.2.1 Host Name Address (11) */
> @@ -284,7 +284,7 @@ typedef struct sctp_ecn_capable_param {
>  /* ADDIP Section 3.2.6 Adaption Layer Indication */
>  typedef struct sctp_adaption_ind_param {
>   struct sctp_paramhdr param_hdr;
> - __u32 adaption_ind;
> + __be32 adaption_ind;
>  } __attribute__((packed)) sctp_adaption_ind_param_t;
>  
>  /* RFC 2960.  Section 3.3.3 Initiation Acknowledgement (INIT ACK) (2):
> @@ -328,10 +328,10 @@ typedef union {
>  } sctp_sack_variable_t;
>  
>  typedef struct sctp_sackhdr {
> - __u32 cum_tsn_ack;
> - __u32 a_rwnd;
> - __u16 num_gap_ack_blocks;
> - __u16 num_dup_tsns;
> + __be32 cum_tsn_ack;
> + __be32 a_rwnd;
> + __be16 num_gap_ack_blocks;
> + __be16 num_dup_tsns;
>   sctp_sack_variable_t variable[0];
>  } __attribute__((packed)) sctp_sackhdr_t;
>  
> @@ -371,7 +371,7 @@ typedef struct sctp_abort_chunk {
>   * and the highest consecutive acking value.
>   */
>  typedef struct sctp_shutdownhdr {
> - __u32 cum_tsn_ack;
> + __be32 cum_tsn_ack;
>  } __attribute__((packed)) sctp_shutdownhdr_t;
>  
>  struct sctp_shutdown_chunk_t {
> @@ -382,8 +382,8 @@ struct sctp_shutdown_chunk_t {
>  /* RFC 2960.  Section 3.3.10 Operation Error (ERROR) (9) */
>  
>  typedef struct sctp_errhdr {
> - __u16 cause;
> - __u16 length;
> + __be16 cause;
> + __be16 length;
>   __u8  variable[0];
>  } __attribute__((packed)) sctp_errhdr_t;
>  
> @@ -462,7 +462,7 @@ typedef enum {
>   *   Explicit Congestion Notification Echo (ECNE) (12)
>   */
>  typedef struct sctp_ecnehdr {
> - __u32 lowest_tsn;
> + __be32 lowest_t

Re: Network vm deadlock... solution?

2005-08-02 Thread Daniel Phillips
On Tuesday 02 August 2005 19:46, Ingo Oeser wrote:
> Daniel Phillips wrote:
> > On Tuesday 02 August 2005 07:30, Patrick McHardy wrote:
> > > IIRC Linus called the whole swapping over ISCSI idea
> > > broken because apparently even userspace needs to allocate memory in
> > > some situations to make things work.
> >
> > Even a user space task may run in PF_MEMALLOC mode (and so guarantee its
> > syscalls access to a modest amount of working memory) so long as it
> > follows the same rules as a kernel task: all per-request memory usage
> > must be statically analyzable and bounded.  But I am getting ahead of
> > myself: the immediate challenge is to deal accurately with the
> > network-receive facet of this multi-facetted problem.
>
> Imagine a IPsec-SA timing out just in this case and (nearly)
> all user space pages swapped out.
>
> We need to renegotiate an IPsec-SA with someone, which needs the
> ISAKMP-daemon, which is completly swapped out. But we have no memory to
> swap it in, since we cannot swap out without transferring to our remote
> "disk" over an IPsec secured connection.

The problem is that the ISAKMP-daemon was not memlocked, as every piece of 
user space code sitting in the block IO path needs to be.  Not only that, but 
memory-bounded, and either running in PF_MEMALLOC mode or allocating no 
memory, even in libraries or syscalls.  You would be right to think that 
there is a lot of code out there violating these conditions, but when you 
think about them, they are fairly obvious.

> Ugly; you see. That kind of things makes Linus shudder, I think.

But I've been wallowing in exactly these problems now for two years, and there 
is no choice but to solve them if we ever want to have, e.g., reliable 
cluster nodes.

I now have all the nasty bits under control (on paper) except for this network 
bit, which is just about fixed as well, I think.

Regards,

Daniel
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] SCTP: fix many sparse endian warnings

2005-08-02 Thread Randy.Dunlap
On Tue, 2 Aug 2005, Sridhar Samudrala wrote:

> On Fri, 2005-07-29 at 03:53 +0400, Alexey Dobriyan wrote:
> > * Use __be16, __be32.
> > * Add SCTP_NL() in the spirit of SCTP_U32() and friends.
> > * Tiny tweak in sctp_chunk_assign_ssn().
>
> How do we run sparse to check for endian errors?
> I tried make C=1 net/sctp/sctp.ko, but i didn't see any
> endian warnings.

Hi,

You have to enable -Wbitsize , it's not a default option.
See Documentation/sparse.txt or use something like this:

make C=1 CHECK='~/bin/sparse -Wbitsize' all

and I send output to a disk file for easier reading. :)

> But your changes look fine.
>
> Thanks
> Sridhar
>
> >
> > Signed-off-by: Alexey Dobriyan <[EMAIL PROTECTED]>
> > ---
> >
> >  include/linux/sctp.h   |   60 
> > ++---
> >  include/net/sctp/command.h |2 +
> >  include/net/sctp/sctp.h|2 -
> >  include/net/sctp/sm.h  |6 ++--
> >  include/net/sctp/structs.h |4 +--
> >  include/net/sctp/tsnmap.h  |4 +--
> >  include/net/sctp/user.h|2 -
> >  net/sctp/associola.c   |2 -
> >  net/sctp/ipv6.c|4 +--
> >  net/sctp/outqueue.c|2 -
> >  net/sctp/protocol.c|2 -
> >  net/sctp/sm_make_chunk.c   |   31 +++
> >  net/sctp/sm_statefuns.c|   10 +++
> >  net/sctp/ulpevent.c|2 -
> >  14 files changed, 67 insertions(+), 66 deletions(-)

-- 
~Randy
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] LSM-IPSec Networking Hooks -- revised flow cache [resend]

2005-08-02 Thread jaegert
Resend of 20 July patch that repaired the flow_cache_lookup 
authorization (now for 2.6.13-rc4-git4).

Verified that failed authorization results in a new resolution.

Note that the prior [PATCH 2/2] of 18 July works with this patch, so
there will be no resend of it.  Please let me know if a resend is 
necessary.

Regards.
Trent.

PS -- As of August 12, I will be at Penn State where my email will 
be [EMAIL PROTECTED]

This patch series implements per packet access control via the
extension of the Linux Security Modules (LSM) interface by hooks in
the XFRM and pfkey subsystems that leverage IPSec security
associations to label packets.  Extensions to the SELinux LSM are
included that leverage the patch for this purpose.

This patch implements the changes necessary to the XFRM subsystem,
pfkey interface, ipv4/ipv6, and xfrm_user interface to restrict a
socket to use only authorized security associations (or no security
association) to send/receive network packets.

Patch purpose:

The patch is designed to enable access control per packets based on
the strongly authenticated IPSec security association.  Such access
controls augment the existing ones based on network interface and IP
address.  The former are very coarse-grained, and the latter can be
spoofed.  By using IPSec, the system can control access to remote
hosts based on cryptographic keys generated using the IPSec mechanism.
This enables access control on a per-machine basis or per-application
if the remote machine is running the same mechanism and trusted to
enforce the access control policy.

Patch design approach:

The overall approach is that policy (xfrm_policy) entries set by
user-level programs (e.g., setkey for ipsec-tools) are extended with a
security context that is used at policy selection time in the XFRM
subsystem to restrict the sockets that can send/receive packets via
security associations (xfrm_states) that are built from those
policies.  

A presentation available at
www.selinux-symposium.org/2005/presentations/session2/2-3-jaeger.pdf
from the SELinux symposium describes the overall approach.

Patch implementation details: 

On output, the policy retrieved (via xfrm_policy_lookup or
xfrm_sk_policy_lookup) must be authorized for the security context of
the socket and the same security context is required for resultant
security association (retrieved or negotiated via racoon in
ipsec-tools).  This is enforced in xfrm_state_find.

On input, the policy retrieved must also be authorized for the socket
(at __xfrm_policy_check), and the security context of the policy must
also match the security association being used.

The patch has virtually no impact on packets that do not use IPSec.
The existing Netfilter (outgoing) and LSM rcv_skb hooks are used as
before.

Also, if IPSec is used without security contexts, the impact is
minimal.  The LSM must allow such policies to be selected for the
combination of socket and remote machine, but subsequent IPSec
processing proceeds as in the original case.

Testing:

The pfkey interface is tested using the ipsec-tools.  ipsec-tools have
been modified (a separate ipsec-tools patch is available for version
0.5) that supports assignment of xfrm_policy entries and security
associations with security contexts via setkey and the negotiation
using the security contexts via racoon.

The xfrm_user interface is tested via ad hoc programs that set
security contexts.  These programs are also available from me, and
contain programs for setting, getting, and deleting policy for testing
this interface.  Testing of sa functions was done by tracing kernel
behavior.

---

 include/linux/pfkeyv2.h  |   13 +++
 include/linux/security.h |  119 
 include/linux/xfrm.h |   29 
 include/net/flow.h   |8 +-
 include/net/xfrm.h   |   29 +++-
 net/core/flow.c  |   12 ++-
 net/ipv4/xfrm4_policy.c  |2 
 net/ipv6/xfrm6_policy.c  |2 
 net/key/af_key.c |  162 ++--
 net/xfrm/xfrm_policy.c   |   79 +
 net/xfrm/xfrm_state.c|   16 +++-
 net/xfrm/xfrm_user.c |  170 +--
 security/Kconfig |   13 +++
 security/dummy.c |   37 ++
 14 files changed, 635 insertions(+), 56 deletions(-)

diff -puN include/linux/pfkeyv2.h~lsm-xfrm-nethooks include/linux/pfkeyv2.h
--- linux-2.6.13-rc4-xfrm/include/linux/pfkeyv2.h~lsm-xfrm-nethooks 
2005-08-01 16:11:22.0 -0400
+++ linux-2.6.13-rc4-xfrm-root/include/linux/pfkeyv2.h  2005-08-01 
16:11:22.0 -0400
@@ -216,6 +216,16 @@ struct sadb_x_nat_t_port {
 } __attribute__((packed));
 /* sizeof(struct sadb_x_nat_t_port) == 8 */
 
+/* Generic LSM security context */
+struct sadb_x_sec_ctx {
+   uint16_tsadb_x_sec_len;
+   uint16_tsadb_x_sec_exttype;
+   uint8_t sadb_x_ctx_alg;  /* LSMs: e.g., selinux == 1 

Re: Network vm deadlock... solution?

2005-08-02 Thread Daniel Phillips
(cross-posted to linux-mm now)

On Wednesday 03 August 2005 03:27, Sridhar Samudrala wrote:
> On Tue, 2005-08-02 at 06:54 +1000, Daniel Phillips wrote:
> > Hi guys,
> >
> > Well I have been reading net code seriously for two days, so I am still
> > basically a complete network klutz.  But we have a nasty network-related
> > vm deadlock that needs fixing and there seems to be little choice but to
> > wade in and try to sort things out.
>
> We are also working on a similar problem where a set of critical TCP
> connections need to successfully send/receive messages even under very
> low memory conditions. But the assumption is that the low memory
> situation lasts only for a short time(in the order of few minutes)
> which should not cause any TCP timeouts to expire so that normal
> connections can recover once the low memory situation is resolved.

A few minutes!!!  In cluster applications at least, that TCP timeout can be a 
DOS of the whole cluster, and will certainly cause the node to miss multiple 
heartbeats, thus being ejected from the cluster and possibly rebooted.  So it 
would be better if we can always be sure of handling the block IO traffic, 
barring physical network disconnection or similar.

A point on memory pressure: here, we are not talking about the continuous 
state of running under heavy load, but rather the microscopically short 
periods where not a single page of memory is available to normal tasks.  It 
is when a block IO event happens to land inside one of those microscopically 
short periods that we run into problems.

> > Here is the plan:
> >
> >   * All protocols used on an interface that supports block IO must be
> > vm-aware.
> >
> > If we wish, we can leave it up to the administrator to ensure that only
> > vm-aware protocols are used on an interface that supports block IO, or we
> > can do some automatic checking.
> >
> >   * Any socket to be used for block IO will be marked as a "vmhelper".
>
> I am assuming your 'vmhelper' is similar to a critical socket which can
> be marked using a new socket option(ex: SO_CRITICAL).

Yes.  I originally intended to use the term "vmhelper" to mean any task lying 
in the block IO path, so I would like the terminology to match if possible.  
In general, the above flag denotes a socket that is throttled by its user and 
implies emergency allocation from a particular reserve.  I plan to use the 
classic PF_MEMALLOC reserve because I think this is correct, efficient, and 
more flexible than a mempool.  So really, this flag is SO_MEMALLOC, meaning 
that the socket is properly throttled and can therefore can draw from the 
MEMALLOC pool.

Looking over the thread from march:
   
http://thunker.thunk.org/pipermail/ksummit-2005-discuss/2005-March/000199.html

I see that Andrea is really close to the same idea.  He suggests attaching a 
pointer to a mempool to each socket, and to understand NULL as meaning 
"unthrottled".  This will work, but it is unnecessarily messy.  (The other 
simplification everybody seems to have missed is the easy way of bypassing 
the softnet queues, assuming this works.)

OK, I will go with SO_MEMALLOC for the time being.  SO_CRITICAL could mean so 
many other things.

> > The number of protocols that need to have this special knowledge is quite
> > small, e.g.: tcp, udp, sctp, icmp, arp, maybe a few others.  We are
> > talking about a line or two of code in each to add the necessary
> > awareness.
> >
> >   * Inside the network driver, when memory is low we will allocate space
> > for every incoming packet from a memory reserve, regardless of
> > whether it is related to block IO or not.
> >
> >   * Under low memory, we call the protocol layer synchronously instead of
> > queuing the packet through softnet.
> >
> > We do not necessarily have to bypass softnet, since there is a mechanism
> > for throttling packets at this point.  However, there is a big problem
> > with throttling here: we haven't classified the packet yet, so the
> > throttling might discard some block IO packets, which is exactly what we
> > don't want to do under memory pressure.
> >
> >   * The protocol receive handler does the socket lookup, then if memory
> > is low, discards any packet not belonging to a vmhelper socket.
> >
> > Roughly speaking, the driver allocates each skb via:
> >
> > skb = memory_pressure ? dev_alloc_skb_reserve() :
> > dev_alloc_skb();
>
> Instead of changing all the drivers to make them vm aware, we could add
> a new priority flag(something like GFP_CRITICAL) which can be passed to
> __dev_alloc_skb(). dev_alloc_skb becomes
> return __dev_alloc_skb(length, GFP_ATOMIC|GFP_CRITICAL);

Good point: there is no need for the alloc_skb_reserve variant.  To be 
consistent, this would be GFP_ATOMIC|GFP_MEMALLOC.  The point is, we allow 
atomic allocation to dig right to the bottom of available physical memory 
under these conditions.  If we hit bottom, it was a static analysis error and 
we deserve to die[1].

> Ba

Re: [PATCH] SCTP: fix many sparse endian warnings

2005-08-02 Thread Alexey Dobriyan
On Tue, Aug 02, 2005 at 11:13:39AM -0700, Randy.Dunlap wrote:
> On Tue, 2 Aug 2005, Sridhar Samudrala wrote:
> 
> > On Fri, 2005-07-29 at 03:53 +0400, Alexey Dobriyan wrote:
> > > * Use __be16, __be32.
> > > * Add SCTP_NL() in the spirit of SCTP_U32() and friends.
> > > * Tiny tweak in sctp_chunk_assign_ssn().
> >
> > How do we run sparse to check for endian errors?
> > I tried make C=1 net/sctp/sctp.ko, but i didn't see any
> > endian warnings.

> You have to enable -Wbitsize , it's not a default option.

-Wbitwise

> See Documentation/sparse.txt or use something like this:
> 
> make C=1 CHECK='~/bin/sparse -Wbitsize' all

make C=1 CHECK="sparse -Wbitwise" net/sctp/

assuming you have ~/bin in PATH (sparse is installed there by default).

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC,PATCH] Idea to speedup tcp lookups

2005-08-02 Thread Eric Dumazet

Eric Dumazet a écrit :



Hi David, Hi all

I would like to provide a patch to speedup tcp lookups, but I need your 
comments first.


1) First some performance data :


tcp_v4_rcv() waste a lot of time in __tcp_v4_lookup_established()

The most critical code is :

sk_for_each(sk, node, &head->chain) {
if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
goto hit; /* You sunk my battleship! */
}

The sk_for_each() does use prefetch() hints but only the beginning of 
"struct sock" is prefetched.

So TCP_IPV4_MATCH() has to bring into CPU cache cold cache lines.
Each iteration has to use at least 2 cache lines.

2) The goal
---

The idea I have is to change things so that TCP_IPV4_MATCH() may return 
FALSE in 95% of cases only using the data already in the CPU cache,

using one cache line per iteration.

3) Description of what is planned
--

Changes in layout are to move the "__u16   dport ; 
__u16   num" from "struct inet_sock" to the end of "struct 
sock_common",

where there is some padding (at least on 64 bits platforms)

File include/net/sock.h

struct sock_common {
unsigned short  skc_family;
volatile unsigned char  skc_state;
unsigned char   skc_reuse;
int skc_bound_dev_if;
struct hlist_node   skc_node;
struct hlist_node   skc_bind_node;
atomic_tskc_refcnt;
+union  {
+unsigned int key; /* hash key for fast lookups, or protocol 
private data */

+unsigned short us[2];
+} skc_u;
};

File include/linux/ip.h

struct inet_sock {
...
__u32   rcv_saddr;  /* Bound local IPv4 addr */
-__u16   dport;  /* Destination port */
-__u16   num;/* Local port */
...
+#define inetsk_dport sk.skc_u.us[0]
+#define inetsk_num sk.skc_u.us[1]

Then change every sk->dport to sk->inetsk_dport, and every sk->num to 
sk->inetsk_num


Doing so even save 8 bytes for sizeof(inet_sock) on 64 bits platforms :)

Then change the the TCP_IPV4_MATCH macro to

File include/net/tcp.h

64 bits platforms :
#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->skc_u.key == (__ports))&&  \
((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie))   &&  \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))

32bits platforms:
#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->skc_u.key == (__ports))&&  \
(inet_sk(__sk)->daddr  == (__saddr))   &&  \
(inet_sk(__sk)->rcv_saddr  == (__daddr))   &&  \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))


This way, the comparison with (__sk->skc_u.key) should reference data 
already fetched in CPU caches, or in the same cache line than

__sk->skc_node (the next element in hash chain)

Discussion :

Instead of using (dport,num) as a key, we could use the tcp_hashfn() 
value to have better fast path, but we would use more memory.

The patch would be nicer, not changing "struct inet_sock".

Thank you for your comments and ideas.

Eric Dumazet


Here is the first version of patch that implements fast tcp / udp lookups, using the (dport,num) as a key, stored in struct sock_common, in 
the same cache line than the 'next pointer' used in iterator (even for 32 bytes cache lines)


Fairly large patch because I had to track all references to inet->num and inet->dport, but small if you look only at include files where all 
the patch logic is really done. I hope the size of the patch is OK for netdev ?


Tested on x86_64 and i686 machines. No performance results yet.

changed files are :

include/net/sock.h
include/linux/ip.h
include/net/tcp.h
include/net/udp.h

net/ipv4/af_inet.c
net/ipv4/datagram.c
net/ipv4/ip_input.c
net/ipv4/ip_output.c
net/ipv4/ip_sockglue.c
net/ipv4/netfilter/ip_conntrack_core.c
net/ipv4/raw.c
net/ipv4/tcp.c
net/ipv4/tcp_diag.c
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_minisocks.c
net/ipv4/tcp_output.c
net/ipv4/tcp_timer.c
net/ipv4/udp.c
net/ipv4/ipmr.c

net/ipv6/af_inet6.c
net/ipv6/datagram.c
net/ipv6/ipv6_sockglue.c
net/ipv6/raw.c
net/ipv6/tcp_ipv6.c
net/ipv6/udp.c

security/selinux/avc.c

Thank you

Signed-off-by: Eric Dumazet <[EMAIL PROTECTED]>
--- linux-2.6.13-rc5/include/net/sock.h 2005-08-02 06:45:48.0 +0200
+++ linux-2.6.13-rc5-ed/include/net/sock.h  2005-08-02 20:10:28.0 
+0200
@@ -98,6 +98,7 @@
  * @skc_node: main hash linkage for various protocol lookup tables
  * @skc_bind_node: bind hash linkage for various protocol lookup tables
  * @skc_refcnt: reference count
+ * @skc_u: a hash key to speedup lookups
  *
  * This is the minimal network layer representation of sockets, the header
  * for struct sock and struct tcp_tw_bucket.
@@ -110,6 +111,10 @@
struct hlist_node   skc_node;
struct hlist_node   skc_bind_node;
atomic_t  

Re: [RFC,PATCH] Idea to speedup tcp lookups

2005-08-02 Thread Arnaldo Carvalho de Melo
Hi Eric,

 Interesting, I'm reading it now that I finished one more changeset on the
generalisation of all the tcp_hashinfo routines, could you please take
a look at:

http://www.kernel.org/git/?p=linux/kernel/git/acme/timewait-2.6.14.git;a=summary

?

Good thing is that any optimization you do for TCP will be useful for
DCCP as well,
perhaps SCTP at some point :-)

- Arnaldo

On 8/2/05, Eric Dumazet <[EMAIL PROTECTED]> wrote:
> Eric Dumazet a écrit :
> 
> >
> > Hi David, Hi all
> >
> > I would like to provide a patch to speedup tcp lookups, but I need your
> > comments first.
> >
> > 1) First some performance data :
> > 
> >
> > tcp_v4_rcv() waste a lot of time in __tcp_v4_lookup_established()
> >
> > The most critical code is :
> >
> > sk_for_each(sk, node, &head->chain) {
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Network vm deadlock... solution?

2005-08-02 Thread Francois Romieu
Daniel Phillips <[EMAIL PROTECTED]> :
[...]
> A point on memory pressure: here, we are not talking about the continuous 
> state of running under heavy load, but rather the microscopically short 
> periods where not a single page of memory is available to normal tasks.  It 
> is when a block IO event happens to land inside one of those microscopically 
> short periods that we run into problems.

You suggested in a previous message to use an emergency allocation pool at
the driver level. Afaik, 1) the usual network driver can already buffer a
bit with its Rx descriptor ring and 2) it more or less tries to refill it
each time napi issues its ->poll() method. So it makes me wonder:
- have you collected evidence that the drivers actually run out of memory
  in the (microscopical) situation you describe ?
- instead of modifying each and every driver to be vm aware, why don't
  you hook in net_rx_action() when memory starts to be low ?

Btw I do not get what the mempool/GFP_CRITICAL idea buys: it seems redundant
with the threshold ("if (memory_pressure)") used in the Rx path to decide
that memory is low.

--
Ueimor
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC,PATCH] Idea to speedup tcp lookups

2005-08-02 Thread Eric Dumazet

Arnaldo Carvalho de Melo a écrit :

Hi Eric,

 Interesting, I'm reading it now that I finished one more changeset on the
generalisation of all the tcp_hashinfo routines, could you please take
a look at:

http://www.kernel.org/git/?p=linux/kernel/git/acme/timewait-2.6.14.git;a=summary

?



yes :)  Thanks for the link !

About 
http://www.kernel.org/git/?p=linux/kernel/git/acme/timewait-2.6.14.git;a=commitdiff;h=7d3cecee43811f9493bff3fa794bf30b77e4e3cd

Please move kmem_cache_t *bind_bucket_cachep; in a better place, not just after 
spinlock_t portalloc_lock;

bind_bucket_cachep is a read only pointer (only written at boot time), and if you place it on a cache line that is heavily written (because 
of atomic counters and spinlocks), processors won't be able to cache a local copy of this pointer.


About inet_lhashfn() and other 'hash functions', its better to make them return 'unsigned int', so that 64 bits platforms don't have to sign 
extend when using the result to index a table. The generated code would be shorter.


Thank you


Good thing is that any optimization you do for TCP will be useful for
DCCP as well,
perhaps SCTP at some point :-)

- Arnaldo

On 8/2/05, Eric Dumazet <[EMAIL PROTECTED]> wrote:


Eric Dumazet a écrit :



Hi David, Hi all

I would like to provide a patch to speedup tcp lookups, but I need your
comments first.

1) First some performance data :


tcp_v4_rcv() waste a lot of time in __tcp_v4_lookup_established()

The most critical code is :

sk_for_each(sk, node, &head->chain) {


-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html




-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC,PATCH] Idea to speedup tcp lookups

2005-08-02 Thread Arnaldo Carvalho de Melo
On 8/2/05, Eric Dumazet <[EMAIL PROTECTED]> wrote:
> Arnaldo Carvalho de Melo a écrit :
> > Hi Eric,
> >
> >  Interesting, I'm reading it now that I finished one more changeset on 
> > the
> > generalisation of all the tcp_hashinfo routines, could you please take
> > a look at:
> >
> > http://www.kernel.org/git/?p=linux/kernel/git/acme/timewait-2.6.14.git;a=summary
> >
> > ?
> >
> 
> yes :)  Thanks for the link !

You're welcome :)

> About 
> http://www.kernel.org/git/?p=linux/kernel/git/acme/timewait-2.6.14.git;a=commitdiff;h=7d3cecee43811f9493bff3fa794bf30b77e4e3cd
> 
> Please move kmem_cache_t *bind_bucket_cachep; in a better place, not just 
> after spinlock_t portalloc_lock;
> 
> bind_bucket_cachep is a read only pointer (only written at boot time), and if 
> you place it on a cache line that is heavily written (because
> of atomic counters and spinlocks), processors wont be able to cache a local 
> copy of this pointer.
> 
> About inet_lhashfn() and other 'hash functions', its better to make them 
> return 'unsigned int', so that 64 bits platforms dont have to sign
> extend when using the result to index a table. The generated code would be 
> shorter.

Thanks for the suggestions, I'm mostly moving, slowly, not changing the existing
code too much in its basic functionality, to a new sock infrastructure
based on the hard work Dave, Alexey and others did on TCP, there is for sure
more work to do and improvements to be done, like a generic net_diag, based
on today's tcp_diag, etc.

I have lots of other patches in my experimental LLC tree, where I toyed with
generalising tcp_sendmsg, tcp_recvmsg, tcp_sendpages, etc, and many more
in my DCCP tree, that is mostly what I'm flushing now.

Eventually I'll move on to the other legacy protocols making them use this
infrastructure, for instance this file has one possible outcome:

http://userweb.kernel.org/~acme/sock_hierarchy.ps

Parts are already merged, parts are about to be submitted to David, like
the partial timewait_sock hierarchy (for now restricted to INET transport
protocols) and other bits I still have to dig out of my experimental trees,
most probably after looking at what was done again and reworking some
parts.

> Thank you

Thank you too!

> > Good thing is that any optimization you do for TCP will be useful for
> > DCCP as well,
> > perhaps SCTP at some point :-)
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Network vm deadlock... solution?

2005-08-02 Thread Martin J. Bligh


--Francois Romieu <[EMAIL PROTECTED]> wrote (on Tuesday, August 02, 2005 
23:43:40 +0200):

> Daniel Phillips <[EMAIL PROTECTED]> :
> [...]
>> A point on memory pressure: here, we are not talking about the continuous 
>> state of running under heavy load, but rather the microscopically short 
>> periods where not a single page of memory is available to normal tasks.  It 
>> is when a block IO event happens to land inside one of those microscopically 
>> short periods that we run into problems.
> 
> You suggested in a previous message to use an emergency allocation pool at
> the driver level. Afaik, 1) the usual network driver can already buffer a
> bit with its Rx descriptor ring and 2) it more or less tries to refill it
> each time napi issues its ->poll() method. So it makes me wonder:
> - have you collected evidence that the drivers actually run out of memory
>   in the (microscopical) situation you describe ?

There's other situations where it does (ie swap device dies, etc).

> - instead of modifying each and every driver to be vm aware, why don't
>   you hook in net_rx_action() when memory starts to be low ?
> 
> Btw I do not get what the mempool/GFP_CRITICAL idea buys: it seems redundant
> with the threshold ("if (memory_pressure)") used in the Rx path to decide
> that memory is low.

It's send-side, not receive.

M.

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Network vm deadlock... solution?

2005-08-02 Thread Daniel Phillips
On Wednesday 03 August 2005 07:43, Francois Romieu wrote:
> Daniel Phillips <[EMAIL PROTECTED]> :
> [...]
>
> > A point on memory pressure: here, we are not talking about the continuous
> > state of running under heavy load, but rather the microscopically short
> > periods where not a single page of memory is available to normal tasks. 
> > It is when a block IO event happens to land inside one of those
> > microscopically short periods that we run into problems.
>
> You suggested in a previous message to use an emergency allocation pool at
> the driver level. Afaik, 1) the usual network driver can already buffer a
> bit with its Rx descriptor ring and 2) it more or less tries to refill it
> each time napi issues its ->poll() method. So it makes me wonder:
> - have you collected evidence that the drivers actually run out of memory
>   in the (microscopical) situation you describe ?

Yes, e.g:

   
http://thunker.thunk.org/pipermail/ksummit-2005-discuss/2005-March/000200.html

and NBD is known to be unreliable for this reason.  I plan to put together
a before-and-after test that everybody can try, but after I show the patch for
comment.

> - instead of modifying each and every driver to be vm aware, why don't
>   you hook in net_rx_action() when memory starts to be low ?

Two reasons:

  * The first handling has to be where the packet is allocated

  * net_rx_action is on the far side of a queue, which would need to be
throttled separately.  But the throttle would not know which packets to
discard, because the packet headers have not been examined yet.

> Btw I do not get what the mempool/GFP_CRITICAL idea buys: it seems
> redundant with the threshold ("if (memory_pressure)") used in the Rx path
> to decide that memory is low.

It is not to decide if memory is low, but to tell the vm system that it is
allowed to allocate from the reserve if normal memory is exhausted.

Regards,

Daniel.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Network vm deadlock... solution?

2005-08-02 Thread Daniel Phillips
On Wednesday 03 August 2005 08:39, Martin J. Bligh wrote:
> --Francois Romieu <[EMAIL PROTECTED]> wrote (on Tuesday, August 02, 2005 
> > Btw I do not get what the mempool/GFP_CRITICAL idea buys: it seems
> > redundant with the threshold ("if (memory_pressure)") used in the Rx path
> > to decide that memory is low.
>
> It's send-side, not receive.

Receive side.  Send side also needs reserve+throttling but it is easier 
because we flag packets at allocation time for special handling.

Regards,

Daniel
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[ANNOUNCE] Chelsio 10Gb TOE (TCP Offload Engine)

2005-08-02 Thread Scott Bardone

Chelsio Communications would like to announce the availability of its
TCP offload (TOE) support for Linux under the GPL.  This is code
developed by us over the past couple of years and has been in production
for over a year.  The code, architecture description, and some papers
comparing TOE performance to other technologies are available from 
https://service.chelsio.com/open_toe/index.html.


We are aware that TOEs are viewed with much skepticism in the Linux
community but we believe that a lot of the concerns often brought up
have to do with implementation details of particular products rather
than with the technology as a whole.  Chelsio is proposing a solution
that we feel allows TOEs to coexist alongside the regular stack's TCP
without breaking networking features, and allows the combined network
stack to offer superior TCP performance.  The code we are releasing
today has been used with an older version of Linux to set the current
Internet2 land speed record and has demonstrated improved performance
with a variety of applications and benchmarks.  As another example of
performance benefits, while today's NICs cannot handle 10G receive with
regular frames, a TOE can comfortably do so with much of the CPU left
for application processing.

The proposed design is intended to accommodate products from multiple
vendors and roughly has the following components:

- a vendor neutral cut-down analog of core/dev.c that provides
registration and activation facilities for TOE devices and some basic
data path functionality (mostly to deal with sniffers).  This component
does not introduce any new soft irqs, instead TOE devices use regular
facilities, such as NAPI, to service incoming traffic;

- some changes to existing TCP code and some additions to provide
offloading.  Changes to existing code are a few dozen lines and are
usually either notification of TOEs when the SW stack processes certain
events, e.g., ARP, or they allow TOEs to perform some socket operations
differently from the SW stack (usually this is done by changing sk_prot,
but some of the differing operations aren't covered by that and so need
to be done through other changes);

- the offloading support specific to each TOE is provided by two
drivers, one that deals with HW and one that interfaces with the SW
TCP/IP and sockets layer (these are separate conceptually, they may be
one driver implementation-wise)

More details of the proposed scheme and of the working of the various
operations can be found in the architecture document at the above URL.
We are including a patch containing the TCP changes below (against
2.6.12), and the rest of the vendor-neutral pieces will follow in
subsequent emails.  We are not posting the drivers on the list due to
their size (the TOE driver though is an extension of Chelsio's NIC
driver presently in Jeff's tree).  All the code is available at the
above URL.  (We'd like to point out that the released code is our
current production codebase that accommodates both 2.4 and 2.6 kernels.
We are aware that we'll need to strip the compatibility stuff and plan
to do so.)

Thanks for your attention and we are looking forward to your comments.

Chelsio Communications.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ANNOUNCE] Chelsio 10Gb TOE (TCP Offload Engine)

2005-08-02 Thread Scott Bardone

Attached is the patch for kernel version 2.6.13-rc3.
diff -Naur linux-2.6.13-rc3/include/linux/tcp.h 
linux-2.6.13-rc3.patched/include/linux/tcp.h
--- linux-2.6.13-rc3/include/linux/tcp.h2005-07-26 10:16:10.0 
-0700
+++ linux-2.6.13-rc3.patched/include/linux/tcp.h2005-07-26 
12:23:53.795497200 -0700
@@ -235,6 +235,8 @@
return (struct tcp_request_sock *)req;
 }
 
+struct toe_funcs;
+
 struct tcp_sock {
/* inet_sock has to be the first member of tcp_sock */
struct inet_sockinet;
@@ -342,6 +344,8 @@
 
struct tcp_func *af_specific;   /* Operations which are 
AF_INET{4,6} specific   */
 
+   struct toe_funcs*toe_specific; /* Operations overriden by TOEs 
*/
+
__u32   rcv_wnd;/* Current receiver window  */
__u32   rcv_wup;/* rcv_nxt on last window update sent   */
__u32   write_seq;  /* Tail(+1) of data held in tcp send buffer */
diff -Naur linux-2.6.13-rc3/include/linux/toedev.h 
linux-2.6.13-rc3.patched/include/linux/toedev.h
--- linux-2.6.13-rc3/include/linux/toedev.h 1969-12-31 16:00:00.0 
-0800
+++ linux-2.6.13-rc3.patched/include/linux/toedev.h 2005-07-26 
12:23:53.796497048 -0700
@@ -0,0 +1,129 @@
+/*
+ *   *
+ * File: *
+ *  toedev.h *
+ *   *
+ * Description:  *
+ *  TOE device definitions.  *
+ *   *
+ * This program is free software; you can redistribute it and/or modify  *
+ * it under the terms of the GNU General Public License, version 2, as   *
+ * published by the Free Software Foundation.*
+ *   *
+ * You should have received a copy of the GNU General Public License along   *
+ * with this program; if not, write to the Free Software Foundation, Inc.,   *
+ * 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. *
+ *   *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED*
+ * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF  *
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. *
+ *   *
+ * http://www.chelsio.com*
+ *   *
+ * Copyright (c) 2003 - 2005 Chelsio Communications, Inc.*
+ * All rights reserved.  *
+ *   *
+ * Maintainers: [EMAIL PROTECTED]  *
+ *   *
+ * Authors: Dimitrios Michailidis   <[EMAIL PROTECTED]>
 *
+ *   *
+ * History:  *
+ *   *
+ /
+/* $Date: 2005/07/09 00:52:28 $ $RCSfile: toedev.h,v $ $Revision: 1.14 $ */
+
+#ifndef _TOEDEV_H_
+#define _TOEDEV_H_
+
+#include 
+#include 
+
+#define TOENAMSIZ 16
+
+/* belongs in linux/netdevice.h */
+#define NETIF_F_TCPIP_OFFLOAD (1 << 16)
+
+/* Get the toedev associated with a net_device */
+#define TOEDEV(netdev) ((struct toedev *)(netdev)->ec_ptr)
+
+/* TOE type ids */
+enum {
+TOE_ID_CHELSIO_T1  = 1,
+TOE_ID_CHELSIO_T1C,
+TOE_ID_CHELSIO_T3,
+};
+
+struct toe_id {
+unsigned int id;
+unsigned long data;
+};
+
+#define END_OF_TOE_ID_TABLE { 0, 0UL }
+
+struct net_device;
+struct neighbour;
+struct tom_info;
+struct proc_dir_entry;
+struct sock;
+struct sk_buff;
+
+struct toedev {
+char name[TOENAMSIZ];   /* TOE device name */
+struct list_head toe_list;  /* for list linking */
+int toe_index;  /* unique TOE device index */
+unsigned int ttid;  /* TOE type id */
+unsigned long flags;/* device flags */
+unsigned int mtu;   /* max size of TX offloaded data */ 
+unsigned int nconn; /* max # of offl