Hi,

Attached is a patch that makes my small test program work. It
applies to 5.1 and 5.1.1 only. Porting it to -current would be a bit
harder because of the port randomization, as Eric described
previously.

This is just a proof of concept, and I would be happy to get some
feedback on how to write it better and on what the potential issues
are.

Olivier
>From 61c4012c89cd088f8f6e3f16f5e1306104232b28 Mon Sep 17 00:00:00 2001
From: Olivier Matz <olivier.m...@6wind.com>
Date: Thu, 2 Feb 2012 16:51:05 +0100
Subject: tcp: allow to reuse an ephemeral port if dest addr/port is different

When a TCP client calls connect(), an implicit bind is done by the
network stack to choose an ephemeral port. Currently, there is a
limitation that prevents the TCP client from opening many ephemeral
ports, even if the destination port or address is different.

The problem is described in detail here:
http://mail-index.netbsd.org/tech-kern/2012/01/30/msg012602.html

This patch duplicates the code of in_pcbbind() in a new function,
in_pcbbind_before_connect(), which is called specifically by the TCP
connect code when doing an implicit bind. Its behaviour differs
slightly from that of the original in_pcbbind():

- only the (nam == NULL) case is allowed
- the function is aware of the remote address that will be given to
  connect(). Duplication of the ephemeral port is checked with
  in_pcblookup_connect() instead of in_pcblookup_port().
- the socket state is not changed to BOUND (but the pcb is added to
  the INPCBHASH_PORT table). The connect() will change the state to
  CONNECTED if it is successful.

If in_pcbconnect() fails, we need to restore the initial state: reset
inp->inp_lport to 0, re-insert the pcb in the INPCBHASH_PORT bucket for
port 0, and clear the INP_ANONPORT flag.

Note: this patch is just a proof of concept and should probably be
cleaned up and enhanced. Currently, only IPv4 is handled.
---
 netinet/in_pcb.c     |   88 ++++++++++++++++++++++++++++++++++++++++++++++++++
 netinet/in_pcb.h     |    2 +
 netinet/tcp_usrreq.c |   10 +++++-
 3 files changed, 99 insertions(+), 1 deletions(-)

diff --git a/netinet/in_pcb.c b/netinet/in_pcb.c
index 5d662ce..498a344 100644
--- a/netinet/in_pcb.c
+++ b/netinet/in_pcb.c
@@ -371,6 +371,94 @@ noname:
 	return (0);
 }
 
+int
+in_pcbbind_before_connect(void *v, struct in_addr raddr,
+	   u_int rport, struct lwp *l)
+{
+	struct inpcb *inp = v;
+	struct socket *so = inp->inp_socket;
+	struct inpcbtable *table = inp->inp_table;
+	struct sockaddr_in *sin = NULL; /* XXXGCC */
+	u_int16_t lport = 0;
+#ifndef IPNOPRIVPORTS
+	kauth_cred_t cred = l->l_cred;
+#endif
+	int	   cnt;
+	u_int16_t  mymin, mymax;
+	u_int16_t *lastport;
+
+	if (inp->inp_af != AF_INET)
+		return (EINVAL);
+
+	if (TAILQ_FIRST(&in_ifaddrhead) == 0)
+		return (EADDRNOTAVAIL);
+	if (inp->inp_lport || !in_nullhost(inp->inp_laddr))
+		return (EINVAL);
+
+	if (inp->inp_flags & INP_LOWPORT) {
+#ifndef IPNOPRIVPORTS
+		if (kauth_authorize_network(cred,
+					    KAUTH_NETWORK_BIND,
+					    KAUTH_REQ_NETWORK_BIND_PRIVPORT, so,
+					    sin, NULL))
+			return (EACCES);
+#endif
+		mymin = lowportmin;
+		mymax = lowportmax;
+		lastport = &table->inpt_lastlow;
+	} else {
+		mymin = anonportmin;
+		mymax = anonportmax;
+		lastport = &table->inpt_lastport;
+	}
+	if (mymin > mymax) {	/* sanity check */
+		u_int16_t swp;
+
+		swp = mymin;
+		mymin = mymax;
+		mymax = swp;
+	}
+
+	lport = *lastport - 1;
+	for (cnt = mymax - mymin + 1; cnt; cnt--, lport--) {
+		if (lport < mymin || lport > mymax)
+			lport = mymax;
+		if (!in_pcblookup_connect(table, inp->inp_laddr,
+					  htons(lport), raddr, htons(rport)))
+			goto found;
+	}
+	if (!in_nullhost(inp->inp_laddr))
+		inp->inp_laddr.s_addr = INADDR_ANY;
+	return (EAGAIN);
+
+ found:
+	inp->inp_flags |= INP_ANONPORT;
+	*lastport = lport;
+	lport = htons(lport);
+
+	inp->inp_lport = lport;
+	LIST_REMOVE(&inp->inp_head, inph_lhash);
+	LIST_INSERT_HEAD(INPCBHASH_PORT(table, inp->inp_lport), &inp->inp_head,
+			 inph_lhash);
+
+	return (0);
+}
+
+void
+in_pcbbind_revert(void *v)
+{
+	struct inpcb *inp = v;
+	struct inpcbtable *table = inp->inp_table;
+
+	/* Called from tcp_usrreq if the connect failed after an
+	 * implicit bind. This will restore the initial state */
+	inp->inp_flags &= ~INP_ANONPORT;
+	inp->inp_lport = 0;
+	LIST_REMOVE(&inp->inp_head, inph_lhash);
+	LIST_INSERT_HEAD(INPCBHASH_PORT(table, inp->inp_lport), &inp->inp_head,
+			 inph_lhash);
+}
+
 /*
  * Connect from a socket to a specified address.
  * Both address and port must be specified in argument sin.
diff --git a/netinet/in_pcb.h b/netinet/in_pcb.h
index 8e1d929..51a0a5c 100644
--- a/netinet/in_pcb.h
+++ b/netinet/in_pcb.h
@@ -125,6 +125,8 @@ struct inpcb {
 void	in_losing(struct inpcb *);
 int	in_pcballoc(struct socket *, void *);
 int	in_pcbbind(void *, struct mbuf *, struct lwp *);
+int	in_pcbbind_before_connect(void *, struct in_addr, u_int, struct lwp *);
+void	in_pcbbind_revert(void *v);
 int	in_pcbconnect(void *, struct mbuf *, struct lwp *);
 void	in_pcbdetach(void *);
 void	in_pcbdisconnect(void *);
diff --git a/netinet/tcp_usrreq.c b/netinet/tcp_usrreq.c
index 46f44c0..b889cdc 100644
--- a/netinet/tcp_usrreq.c
+++ b/netinet/tcp_usrreq.c
@@ -389,11 +389,19 @@ tcp_usrreq(struct socket *so, int req,
 #ifdef INET
 		if (inp) {
 			if (inp->inp_lport == 0) {
-				error = in_pcbbind(inp, (struct mbuf *)0, l);
+				struct sockaddr_in *sin =
+					(struct sockaddr_in *)nam;
+				error = in_pcbbind_before_connect(inp,
+					sin->sin_addr, ntohs(sin->sin_port), l);
 				if (error)
 					break;
 			}
 			error = in_pcbconnect(inp, nam, l);
+			if (error != 0) {
+				/* if connect fails, we need to revert
+				 * bind_before_connect's work */
+				in_pcbbind_revert(inp);
+			}
 		}
 #endif
 #ifdef INET6
-- 
1.7.7.3

Reply via email to