Hi, Attached is a patch that makes my small test program work. It applies to 5.1 and 5.1.1 only. Porting it to current would be a bit harder due to the port randomization, as described by Eric previously.
This is just a proof of concept, and I would be happy to have some feedback on how to write it better and on what the potential issues are. Olivier
>From 61c4012c89cd088f8f6e3f16f5e1306104232b28 Mon Sep 17 00:00:00 2001 From: Olivier Matz <olivier.m...@6wind.com> Date: Thu, 2 Feb 2012 16:51:05 +0100 Subject: tcp: allow to reuse an ephemeral port if dest addr/port is different When a TCP client calls connect(), an implicit bind is done by the network stack to choose an ephemeral port. Currently, there is a limitation that prevents the TCP client from opening many ephemeral ports even if the destination port or address is different. The problem is described in detail here: http://mail-index.netbsd.org/tech-kern/2012/01/30/msg012602.html The goal of this patch is to duplicate the code of in_pcbbind() in a new function in_pcbbind_before_connect() that is called specifically by the TCP connect code when doing an implicit bind. The behaviour is a bit different compared to the initial in_pcbbind(): - only the (nam == NULL) case is allowed - the function is aware of the remote address that will be given to connect(). Duplication of the ephemeral port is checked by in_pcblookup_connect() instead of in_pcblookup_port(). - the socket state is not changed to BOUND (but the pcb is added to the INPCBHASH_PORT table). The connect() will change the state to CONNECTED if it is successful. If in_pcbconnect() fails, we need to restore the initial state: reset inp->inp_lport to 0, move the pcb back to INPCBHASH_PORT table[0], and remove the INP_ANONPORT flag. Note: this patch is just a proof of concept and should probably be cleaned up and enhanced. Currently, only IPv4 is done. 
--- netinet/in_pcb.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++ netinet/in_pcb.h | 2 + netinet/tcp_usrreq.c | 10 +++++- 3 files changed, 99 insertions(+), 1 deletions(-) diff --git a/netinet/in_pcb.c b/netinet/in_pcb.c index 5d662ce..498a344 100644 --- a/netinet/in_pcb.c +++ b/netinet/in_pcb.c @@ -371,6 +371,94 @@ noname: return (0); } +int +in_pcbbind_before_connect(void *v, struct in_addr raddr, + u_int rport, struct lwp *l) +{ + struct inpcb *inp = v; + struct socket *so = inp->inp_socket; + struct inpcbtable *table = inp->inp_table; + struct sockaddr_in *sin = NULL; /* XXXGCC */ + u_int16_t lport = 0; +#ifndef IPNOPRIVPORTS + kauth_cred_t cred = l->l_cred; +#endif + int cnt; + u_int16_t mymin, mymax; + u_int16_t *lastport; + + if (inp->inp_af != AF_INET) + return (EINVAL); + + if (TAILQ_FIRST(&in_ifaddrhead) == 0) + return (EADDRNOTAVAIL); + if (inp->inp_lport || !in_nullhost(inp->inp_laddr)) + return (EINVAL); + + if (inp->inp_flags & INP_LOWPORT) { +#ifndef IPNOPRIVPORTS + if (kauth_authorize_network(cred, + KAUTH_NETWORK_BIND, + KAUTH_REQ_NETWORK_BIND_PRIVPORT, so, + sin, NULL)) + return (EACCES); +#endif + mymin = lowportmin; + mymax = lowportmax; + lastport = &table->inpt_lastlow; + } else { + mymin = anonportmin; + mymax = anonportmax; + lastport = &table->inpt_lastport; + } + if (mymin > mymax) { /* sanity check */ + u_int16_t swp; + + swp = mymin; + mymin = mymax; + mymax = swp; + } + + lport = *lastport - 1; + for (cnt = mymax - mymin + 1; cnt; cnt--, lport--) { + if (lport < mymin || lport > mymax) + lport = mymax; + if (!in_pcblookup_connect(table, inp->inp_laddr, + htons(lport), raddr, htons(rport))) + goto found; + } + if (!in_nullhost(inp->inp_laddr)) + inp->inp_laddr.s_addr = INADDR_ANY; + return (EAGAIN); + + found: + inp->inp_flags |= INP_ANONPORT; + *lastport = lport; + lport = htons(lport); + + inp->inp_lport = lport; + LIST_REMOVE(&inp->inp_head, inph_lhash); + LIST_INSERT_HEAD(INPCBHASH_PORT(table, inp->inp_lport), 
&inp->inp_head, + inph_lhash); + + return (0); +} + +void +in_pcbbind_revert(void *v) +{ + struct inpcb *inp = v; + struct inpcbtable *table = inp->inp_table; + + /* Called from tcp_usrreq if the connect failed after an + * implicit bind. This will restore the initial state */ + inp->inp_flags &= ~INP_ANONPORT; + inp->inp_lport = 0; + LIST_REMOVE(&inp->inp_head, inph_lhash); + LIST_INSERT_HEAD(INPCBHASH_PORT(table, inp->inp_lport), &inp->inp_head, + inph_lhash); +} + /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. diff --git a/netinet/in_pcb.h b/netinet/in_pcb.h index 8e1d929..51a0a5c 100644 --- a/netinet/in_pcb.h +++ b/netinet/in_pcb.h @@ -125,6 +125,8 @@ struct inpcb { void in_losing(struct inpcb *); int in_pcballoc(struct socket *, void *); int in_pcbbind(void *, struct mbuf *, struct lwp *); +int in_pcbbind_before_connect(void *, struct in_addr, u_int, struct lwp *); +void in_pcbbind_revert(void *v); int in_pcbconnect(void *, struct mbuf *, struct lwp *); void in_pcbdetach(void *); void in_pcbdisconnect(void *); diff --git a/netinet/tcp_usrreq.c b/netinet/tcp_usrreq.c index 46f44c0..b889cdc 100644 --- a/netinet/tcp_usrreq.c +++ b/netinet/tcp_usrreq.c @@ -389,11 +389,19 @@ tcp_usrreq(struct socket *so, int req, #ifdef INET if (inp) { if (inp->inp_lport == 0) { - error = in_pcbbind(inp, (struct mbuf *)0, l); + struct sockaddr_in *sin = + (struct sockaddr_in *)nam; + error = in_pcbbind_before_connect(inp, + sin->sin_addr, ntohs(sin->sin_port), l); if (error) break; } error = in_pcbconnect(inp, nam, l); + if (error != 0) { + /* if connect fails, we need to revert + * bind_before_connect's work */ + in_pcbbind_revert(inp); + } } #endif #ifdef INET6 -- 1.7.7.3