On Tue, Aug 06, 2013 at 09:24:13PM +0200, Christopher Zimmermann wrote:
> Hi,
>
> I'm currently working towards an IP_SENDSRCADDR implementation.
> As a first step I moved the calls to in_pcbrehash() from
> in_pcb(dis)connect() and in_pcbbind() to the call sites of those
> functions. This should save some calls to in_pcbrehash() in the
> in_pcbconnect() codepath.
> Also I improved code sharing between the IPv4 and IPv6 stacks by
> implementing a generic in_pcbsetport used by both stacks.
> The next step will be to replace in_pcbconnect() by in_selectsrc()
> and in_pcbsetport() in udp_output() like the IPv6 stack does it.
> The splsoftnet() lock could then be removed. Also implementing
> IP_SENDSRCADDR will become trivial because in_selectsrc() won't
> modify the struct inpcb like in_pcbconnect() does at the moment.
>
> Not much testing yet. Just surfing the web and sending this mail...
>
> I'd really like some feedback on whether I'm on the right track or
> doing something stupid here. I have no experience in the OpenBSD
> IP-stack yet.
>
>
Moving the calls of of in_pcbrehash() to outside of in_pcb.c is wrong.
This is an internal function for the specific way fast PCB lookups are
done at the moment. Having that all over the place in higher level code
where someone does a bind or connect call is not an option IMO.
This needs to be done differently, sorry.
--
:wq Claudio
> Christopher
>
>
>
> Index: netinet/in_pcb.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/in_pcb.c,v
> retrieving revision 1.139
> diff -u -p -r1.139 in_pcb.c
> --- netinet/in_pcb.c 1 Jun 2013 13:25:40 -0000 1.139
> +++ netinet/in_pcb.c 6 Aug 2013 18:25:54 -0000
> @@ -218,7 +218,6 @@ in_pcbbind(struct inpcb *inp, struct mbu
> {
> struct socket *so = inp->inp_socket;
> struct inpcbtable *table = inp->inp_table;
> - u_int16_t *lastport = &inp->inp_table->inpt_lastport;
> struct sockaddr_in *sin;
> u_int16_t lport = 0;
> int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
> @@ -293,71 +292,121 @@ in_pcbbind(struct inpcb *inp, struct mbu
> inp->inp_laddr = sin->sin_addr;
> }
> if (lport == 0) {
> - u_int16_t first, last;
> - int count;
> + error = in_pcbsetport(inp, p);
> + if (error != 0)
> + return error;
> + }
> + else
> + inp->inp_lport = lport;
> + return (0);
> +}
>
> - if (inp->inp_flags & INP_HIGHPORT) {
> - first = ipport_hifirstauto; /* sysctl */
> - last = ipport_hilastauto;
> - } else if (inp->inp_flags & INP_LOWPORT) {
> - if ((error = suser(p, 0)))
> - return (EACCES);
> - first = IPPORT_RESERVED-1; /* 1023 */
> - last = 600; /* not IPPORT_RESERVED/2 */
> - } else {
> - first = ipport_firstauto; /* sysctl */
> - last = ipport_lastauto;
> - }
> +int
> +in_pcbsetport(struct inpcb *inp, struct proc *p)
> +{
> + struct socket *so = inp->inp_socket;
> + struct inpcbtable *table = inp->inp_table;
> + u_int16_t first, last;
> + u_int16_t *lastport = &inp->inp_table->inpt_lastport;
> + u_int16_t lport = 0;
> + int count;
> + int error;
> + int wild;
> + u_int rtableid;
> + void *faddrp, *laddrp;
> +#ifdef INET6
> + extern struct in6_addr zeroin6_addr;
>
> - /*
> - * Simple check to ensure all ports are not used up causing
> - * a deadlock here.
> - *
> - * We split the two cases (up and down) so that the direction
> - * is not being tested on each round of the loop.
> - */
> + if (sotopf(so) == PF_INET6) {
> + wild = INPLOOKUP_IPV6;
> + rtableid = 0;
> + faddrp = &zeroin6_addr;
> + laddrp = &inp->inp_laddr6;
> + }
> + else
> +#endif /* INET6 */
> + {
> + wild = 0;
> + rtableid = inp->inp_rtableid;
> + faddrp = &zeroin_addr;
> + laddrp = &inp->inp_laddr;
> + }
>
> - if (first > last) {
> - /*
> - * counting down
> - */
> - count = first - last;
> - if (count)
> - *lastport = first - arc4random_uniform(count);
> + if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 &&
> + ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 ||
> + (so->so_options & SO_ACCEPTCONN) == 0))
> + wild |= INPLOOKUP_WILDCARD;
>
> - do {
> - if (count-- < 0) /* completely used? */
> - return (EADDRNOTAVAIL);
> - --*lastport;
> - if (*lastport > first || *lastport < last)
> - *lastport = first;
> - lport = htons(*lastport);
> - } while (in_baddynamic(*lastport,
> so->so_proto->pr_protocol) ||
> - in_pcblookup(table, &zeroin_addr, 0,
> - &inp->inp_laddr, lport, wild, inp->inp_rtableid));
> - } else {
> - /*
> - * counting up
> - */
> - count = last - first;
> - if (count)
> - *lastport = first + arc4random_uniform(count);
> + if (inp->inp_flags & INP_HIGHPORT) {
> + first = ipport_hifirstauto; /* sysctl */
> + last = ipport_hilastauto;
> + } else if (inp->inp_flags & INP_LOWPORT) {
> + if ((error = suser(p, 0)))
> + return (EACCES);
> + first = IPPORT_RESERVED-1; /* 1023 */
> + last = 600; /* not IPPORT_RESERVED/2 */
> + } else {
> + first = ipport_firstauto; /* sysctl */
> + last = ipport_lastauto;
> + }
>
> - do {
> - if (count-- < 0) /* completely used? */
> - return (EADDRNOTAVAIL);
> - ++*lastport;
> - if (*lastport < first || *lastport > last)
> - *lastport = first;
> - lport = htons(*lastport);
> - } while (in_baddynamic(*lastport,
> so->so_proto->pr_protocol) ||
> - in_pcblookup(table, &zeroin_addr, 0,
> - &inp->inp_laddr, lport, wild, inp->inp_rtableid));
> - }
> + /*
> + * Simple check to ensure all ports are not used up causing
> + * a deadlock here.
> + *
> + * We split the two cases (up and down) so that the direction
> + * is not being tested on each round of the loop.
> + * XXX: Does this optimization really have a relevant effect?
> + * How many iterations can we expect?
> + * What about branch prediction?
> + * Shouldn't we forbid the first > last case completely?
> + */
> +
> + if (first > last) {
> + /*
> + * counting down
> + */
> + count = first - last;
> + if (count)
> + *lastport = first - arc4random_uniform(count);
> +
> + do {
> + if (count-- < 0) /* completely used? */
> + return (EADDRNOTAVAIL);
> + --*lastport;
> + if (*lastport > first || *lastport < last)
> + *lastport = first;
> + lport = htons(*lastport);
> + } while (in_baddynamic(*lastport, so->so_proto->pr_protocol) ||
> + in_pcblookup(table, faddrp, 0,
> + laddrp, lport, wild, rtableid));
> + } else {
> + /*
> + * counting up
> + */
> + count = last - first;
> + if (count)
> + *lastport = first + arc4random_uniform(count);
> +
> + do {
> + if (count-- < 0) /* completely used? */
> + return (EADDRNOTAVAIL);
> + ++*lastport;
> + if (*lastport < first || *lastport > last)
> + *lastport = first;
> + lport = htons(*lastport);
> + } while (in_baddynamic(*lastport, so->so_proto->pr_protocol) ||
> + in_pcblookup(table, faddrp, 0,
> + laddrp, lport, wild, rtableid));
> }
> +
> inp->inp_lport = lport;
> - in_pcbrehash(inp);
> - return (0);
> +
> +#if 0
> + inp->inp_flowinfo = 0; /* XXX */
> +#endif
> +
> + return 0;
> }
>
> /*
> @@ -418,13 +467,12 @@ in_pcbconnect(struct inpcb *inp, struct
> return (EADDRINUSE);
> if (inp->inp_laddr.s_addr == INADDR_ANY) {
> if (inp->inp_lport == 0 &&
> - in_pcbbind(inp, NULL, curproc) == EADDRNOTAVAIL)
> - return (EADDRNOTAVAIL);
> + in_pcbsetport(inp, curproc) == EADDRNOTAVAIL)
> + return EADDRNOTAVAIL;
> inp->inp_laddr = ifaddr->sin_addr;
> }
> inp->inp_faddr = sin->sin_addr;
> inp->inp_fport = sin->sin_port;
> - in_pcbrehash(inp);
> #ifdef IPSEC
> {
> int error; /* This is just ignored */
> @@ -452,7 +500,6 @@ in_pcbdisconnect(struct inpcb *inp)
> }
>
> inp->inp_fport = 0;
> - in_pcbrehash(inp);
> if (inp->inp_socket->so_state & SS_NOFDREF)
> in_pcbdetach(inp);
> }
> Index: netinet/in_pcb.h
> ===================================================================
> RCS file: /cvs/src/sys/netinet/in_pcb.h,v
> retrieving revision 1.79
> diff -u -p -r1.79 in_pcb.h
> --- netinet/in_pcb.h 31 May 2013 13:15:53 -0000 1.79
> +++ netinet/in_pcb.h 6 Aug 2013 18:25:54 -0000
> @@ -297,6 +297,6 @@ int in6_pcbnotify(struct inpcbtable *, s
> u_int, const struct sockaddr_in6 *, u_int, int, void *,
> void (*)(struct inpcb *, int));
> int in6_selecthlim(struct inpcb *, struct ifnet *);
> -int in6_pcbsetport(struct in6_addr *, struct inpcb *, struct proc *);
> +int in_pcbsetport(struct inpcb *, struct proc *);
> #endif /* _KERNEL */
> #endif /* _NETINET_IN_PCB_H_ */
> Index: netinet/ip_divert.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/ip_divert.c,v
> retrieving revision 1.13
> diff -u -p -r1.13 ip_divert.c
> --- netinet/ip_divert.c 8 Apr 2013 15:32:23 -0000 1.13
> +++ netinet/ip_divert.c 6 Aug 2013 18:25:55 -0000
> @@ -319,6 +319,8 @@ divert_usrreq(struct socket *so, int req
> case PRU_BIND:
> s = splsoftnet();
> error = in_pcbbind(inp, addr, p);
> + if (error == 0)
> + in_pcbrehash(inp);
> splx(s);
> break;
>
> Index: netinet/tcp_input.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/tcp_input.c,v
> retrieving revision 1.265
> diff -u -p -r1.265 tcp_input.c
> --- netinet/tcp_input.c 1 Jul 2013 10:53:52 -0000 1.265
> +++ netinet/tcp_input.c 6 Aug 2013 18:26:01 -0000
> @@ -3820,6 +3820,7 @@ syn_cache_get(struct sockaddr *src, stru
> break;
> #endif
> }
> + in_pcbrehash(inp);
> (void) m_free(am);
>
> tp = intotcpcb(inp);
> Index: netinet/tcp_usrreq.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/tcp_usrreq.c,v
> retrieving revision 1.112
> diff -u -p -r1.112 tcp_usrreq.c
> --- netinet/tcp_usrreq.c 17 May 2013 09:04:30 -0000 1.112
> +++ netinet/tcp_usrreq.c 6 Aug 2013 18:26:02 -0000
> @@ -228,6 +228,7 @@ tcp_usrreq(so, req, m, nam, control, p)
> error = in_pcbbind(inp, nam, p);
> if (error)
> break;
> + in_pcbrehash(inp);
> break;
>
> /*
> @@ -244,8 +245,10 @@ tcp_usrreq(so, req, m, nam, control, p)
> }
> /* If the in_pcbbind() above is called, the tp->pf
> should still be whatever it was before. */
> - if (error == 0)
> + if (error == 0) {
> + in_pcbrehash(inp);
> tp->t_state = TCPS_LISTEN;
> + }
> break;
>
> /*
> @@ -304,6 +307,8 @@ tcp_usrreq(so, req, m, nam, control, p)
> error = ENOBUFS;
> break;
> }
> +
> + in_pcbrehash(inp);
>
> so->so_state |= SS_CONNECTOUT;
>
> Index: netinet/udp_usrreq.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/udp_usrreq.c,v
> retrieving revision 1.164
> diff -u -p -r1.164 udp_usrreq.c
> --- netinet/udp_usrreq.c 9 Jun 2013 22:03:06 -0000 1.164
> +++ netinet/udp_usrreq.c 6 Aug 2013 18:26:04 -0000
> @@ -1164,6 +1164,8 @@ udp_usrreq(struct socket *so, int req, s
> else
> #endif
> error = in_pcbbind(inp, addr, p);
> + if (error == 0)
> + in_pcbrehash(inp);
> splx(s);
> break;
>
> @@ -1180,7 +1182,6 @@ udp_usrreq(struct socket *so, int req, s
> }
> s = splsoftnet();
> error = in6_pcbconnect(inp, addr);
> - splx(s);
> } else
> #endif /* INET6 */
> {
> @@ -1190,8 +1191,9 @@ udp_usrreq(struct socket *so, int req, s
> }
> s = splsoftnet();
> error = in_pcbconnect(inp, addr);
> - splx(s);
> }
> + in_pcbrehash(inp);
> + splx(s);
>
> if (error == 0)
> soisconnected(so);
> @@ -1229,6 +1231,7 @@ udp_usrreq(struct socket *so, int req, s
> #endif /* INET6 */
> inp->inp_laddr.s_addr = INADDR_ANY;
> in_pcbdisconnect(inp);
> + in_pcbrehash(inp);
>
> splx(s);
> so->so_state &= ~SS_ISCONNECTED; /* XXX */
> Index: netinet6/in6_pcb.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet6/in6_pcb.c,v
> retrieving revision 1.56
> diff -u -p -r1.56 in6_pcb.c
> --- netinet6/in6_pcb.c 31 May 2013 15:04:24 -0000 1.56
> +++ netinet6/in6_pcb.c 6 Aug 2013 18:26:05 -0000
> @@ -267,8 +267,8 @@ in6_pcbbind(struct inpcb *inp, struct mb
> return error;
>
> t = in_pcblookup(head,
> - (struct in_addr *)&zeroin6_addr, 0,
> - (struct in_addr *)&sin6->sin6_addr, lport,
> + &zeroin6_addr, 0,
> + &sin6->sin6_addr, lport,
> wild, /* XXX */ 0);
>
> if (t && (reuseport & t->inp_socket->so_options) == 0)
> @@ -278,101 +278,11 @@ in6_pcbbind(struct inpcb *inp, struct mb
> }
>
> if (lport == 0) {
> - error = in6_pcbsetport(&inp->inp_laddr6, inp, p);
> + error = in_pcbsetport(inp, p);
> if (error != 0)
> return error;
> - } else {
> + } else
> inp->inp_lport = lport;
> - in_pcbrehash(inp);
> - }
> -
> - return 0;
> -}
> -
> -int
> -in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct proc *p)
> -{
> - struct socket *so = inp->inp_socket;
> - struct inpcbtable *table = inp->inp_table;
> - u_int16_t first, last;
> - u_int16_t *lastport = &inp->inp_table->inpt_lastport;
> - u_int16_t lport = 0;
> - int count;
> - int wild = INPLOOKUP_IPV6;
> - int error;
> -
> - /* XXX we no longer support IPv4 mapped address, so no tweaks here */
> -
> - if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 &&
> - ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 ||
> - (so->so_options & SO_ACCEPTCONN) == 0))
> - wild |= INPLOOKUP_WILDCARD;
> -
> - if (inp->inp_flags & INP_HIGHPORT) {
> - first = ipport_hifirstauto; /* sysctl */
> - last = ipport_hilastauto;
> - } else if (inp->inp_flags & INP_LOWPORT) {
> - if ((error = suser(p, 0)))
> - return (EACCES);
> - first = IPPORT_RESERVED-1; /* 1023 */
> - last = 600; /* not IPPORT_RESERVED/2 */
> - } else {
> - first = ipport_firstauto; /* sysctl */
> - last = ipport_lastauto;
> - }
> -
> - /*
> - * Simple check to ensure all ports are not used up causing
> - * a deadlock here.
> - *
> - * We split the two cases (up and down) so that the direction
> - * is not being tested on each round of the loop.
> - */
> -
> - if (first > last) {
> - /*
> - * counting down
> - */
> - count = first - last;
> - if (count)
> - *lastport = first - arc4random_uniform(count);
> -
> - do {
> - if (count-- < 0) /* completely used? */
> - return (EADDRNOTAVAIL);
> - --*lastport;
> - if (*lastport > first || *lastport < last)
> - *lastport = first;
> - lport = htons(*lastport);
> - } while (in_baddynamic(*lastport, so->so_proto->pr_protocol) ||
> - in_pcblookup(table, &zeroin6_addr, 0,
> - &inp->inp_laddr6, lport, wild, /* XXX */ 0));
> - } else {
> - /*
> - * counting up
> - */
> - count = last - first;
> - if (count)
> - *lastport = first + arc4random_uniform(count);
> -
> - do {
> - if (count-- < 0) /* completely used? */
> - return (EADDRNOTAVAIL);
> - ++*lastport;
> - if (*lastport < first || *lastport > last)
> - *lastport = first;
> - lport = htons(*lastport);
> - } while (in_baddynamic(*lastport, so->so_proto->pr_protocol) ||
> - in_pcblookup(table, &zeroin6_addr, 0,
> - &inp->inp_laddr6, lport, wild, /* XXX */ 0));
> - }
> -
> - inp->inp_lport = lport;
> - in_pcbrehash(inp);
> -
> -#if 0
> - inp->inp_flowinfo = 0; /* XXX */
> -#endif
>
> return 0;
> }
> @@ -449,8 +359,9 @@ in6_pcbconnect(struct inpcb *inp, struct
> return (EADDRINUSE);
> }
> if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6)) {
> - if (inp->inp_lport == 0)
> - (void)in6_pcbbind(inp, NULL, curproc);
> + if (inp->inp_lport == 0 &&
> + in_pcbsetport(inp, curproc) == EADDRNOTAVAIL)
> + return EADDRNOTAVAIL;
> inp->inp_laddr6 = *in6a;
> }
> inp->inp_faddr6 = sin6->sin6_addr;
> @@ -459,7 +370,6 @@ in6_pcbconnect(struct inpcb *inp, struct
> if (ip6_auto_flowlabel)
> inp->inp_flowinfo |=
> (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
> - in_pcbrehash(inp);
> return (0);
> }
>
> Index: netinet6/ip6_divert.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet6/ip6_divert.c,v
> retrieving revision 1.13
> diff -u -p -r1.13 ip6_divert.c
> --- netinet6/ip6_divert.c 26 Jun 2013 09:12:40 -0000 1.13
> +++ netinet6/ip6_divert.c 6 Aug 2013 18:26:05 -0000
> @@ -314,6 +314,8 @@ divert6_usrreq(struct socket *so, int re
> case PRU_BIND:
> s = splsoftnet();
> error = in6_pcbbind(inp, addr, p);
> + if (error == 0)
> + in_pcbrehash(inp);
> splx(s);
> break;
>
> Index: netinet6/udp6_output.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet6/udp6_output.c,v
> retrieving revision 1.19
> diff -u -p -r1.19 udp6_output.c
> --- netinet6/udp6_output.c 28 Mar 2013 16:45:16 -0000 1.19
> +++ netinet6/udp6_output.c 6 Aug 2013 18:26:05 -0000
> @@ -194,9 +194,12 @@ udp6_output(struct in6pcb *in6p, struct
> error = EADDRNOTAVAIL;
> goto release;
> }
> - if (in6p->in6p_lport == 0 &&
> - (error = in6_pcbsetport(laddr, in6p, p)) != 0)
> - goto release;
> + if (in6p->in6p_lport == 0) {
> + error = in_pcbsetport(in6p, p);
> + if (error)
> + goto release;
> + in_pcbrehash(in6p);
> + }
> } else {
> if (IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) {
> error = ENOTCONN;
>
>
> --
> http://gmerlin.de
> OpenPGP: http://gmerlin.de/christopher.pub
> 1917 680A 723C BF3D 2CA3 0E44 7E24 D19F 34B8 2A2A