This patch enables RDS to use IPv6 addresses. For RDS/TCP, the listener is now an IPv6 endpoint which accepts both IPv4 and IPv6 connection requests. RDS/RDMA/IB uses a private data (struct rds_ib_connect_private) exchange between endpoints at RDS connection establishment time to support RDMA. This private data exchange uses a 32-bit integer to represent an IP address, which needs to be changed in order to support IPv6. A new private data struct, rds6_ib_connect_private, is introduced to handle this. To ensure backward compatibility, an IPv6-capable RDS stack uses another RDMA listener port (RDS_CM_PORT) to accept IPv6 connections, and it continues to use the original RDS_PORT for IPv4 RDS connections. When it needs to communicate with an IPv6 peer, it uses RDS_CM_PORT to send the connection setup request.
Signed-off-by: Ka-Cheong Poon <ka-cheong.p...@oracle.com> --- net/rds/bind.c | 21 +++++++++++++++--- net/rds/connection.c | 43 ++++++++++++++++++++++++------------- net/rds/ib.c | 55 +++++++++++++++++++++++++++++++++++++++++------- net/rds/ib_cm.c | 15 +++++++------ net/rds/rdma_transport.c | 32 ++++++++++++++++++++++++++-- net/rds/rdma_transport.h | 2 ++ net/rds/rds.h | 12 ++++++----- net/rds/send.c | 23 ++++++++++++++++++-- net/rds/tcp.c | 54 +++++++++++++++++++++++++++++------------------ net/rds/tcp.h | 4 +--- net/rds/tcp_connect.c | 54 ++++++++++++++++++++++++++++++++++++----------- net/rds/tcp_listen.c | 40 +++++++++++++++++++++++++++-------- 12 files changed, 269 insertions(+), 86 deletions(-) diff --git a/net/rds/bind.c b/net/rds/bind.c index 3a1097e..4c2bf9c 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -164,11 +164,12 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct in6_addr v6addr, *binding_addr; struct rds_transport *trans; __u32 scope_id = 0; + int addr_type; int ret = 0; __be16 port; - /* We only allow an RDS socket to be bound to and IPv4 address. IPv6 - * address support will be added later. + /* We allow an RDS socket to be bound to either IPv4 or IPv6 + * address. */ if (addr_len == sizeof(struct sockaddr_in)) { struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; @@ -180,7 +181,21 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) binding_addr = &v6addr; port = sin->sin_port; } else if (addr_len == sizeof(struct sockaddr_in6)) { - return -EPROTONOSUPPORT; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr; + + addr_type = ipv6_addr_type(&sin6->sin6_addr); + if (sin6->sin6_family != AF_INET6 || + !(addr_type & IPV6_ADDR_UNICAST)) { + return -EINVAL; + } + /* The scope ID must be specified for link local address. 
*/ + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (sin6->sin6_scope_id == 0) + return -EINVAL; + scope_id = sin6->sin6_scope_id; + } + binding_addr = &sin6->sin6_addr; + port = sin6->sin6_port; } else { return -EINVAL; } diff --git a/net/rds/connection.c b/net/rds/connection.c index ca72563..8c5d093 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -486,10 +486,17 @@ void rds_conn_destroy(struct rds_connection *conn) } EXPORT_SYMBOL_GPL(rds_conn_destroy); -static void rds_conn_message_info(struct socket *sock, unsigned int len, - struct rds_info_iterator *iter, - struct rds_info_lengths *lens, - int want_send) +static void __rds_inc_msg_cp(struct rds_incoming *inc, + struct rds_info_iterator *iter, + void *saddr, void *daddr, int flip) +{ + rds_inc_info_copy(inc, iter, *(__be32 *)saddr, *(__be32 *)daddr, flip); +} + +static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int want_send) { struct hlist_head *head; struct list_head *list; @@ -524,18 +531,13 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, /* XXX too lazy to maintain counts.. 
*/ list_for_each_entry(rm, list, m_conn_item) { - __be32 laddr; - __be32 faddr; - total++; - laddr = conn->c_laddr.s6_addr32[3]; - faddr = conn->c_faddr.s6_addr32[3]; if (total <= len) - rds_inc_info_copy(&rm->m_inc, - iter, - laddr, - faddr, - 0); + __rds_inc_msg_cp(&rm->m_inc, + iter, + &conn->c_laddr, + &conn->c_faddr, + 0); } spin_unlock_irqrestore(&cp->cp_lock, flags); @@ -548,6 +550,14 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, lens->each = sizeof(struct rds_info_message); } +static void rds_conn_message_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int want_send) +{ + rds_conn_message_info_cmn(sock, len, iter, lens, want_send); +} + static void rds_conn_message_info_send(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) @@ -655,6 +665,9 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) struct rds_info_connection *cinfo = buffer; struct rds_connection *conn = cp->cp_conn; + if (conn->c_isv6) + return 0; + cinfo->next_tx_seq = cp->cp_next_tx_seq; cinfo->next_rx_seq = cp->cp_next_rx_seq; cinfo->laddr = conn->c_laddr.s6_addr32[3]; diff --git a/net/rds/ib.c b/net/rds/ib.c index c712a84..756225c 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -39,6 +39,7 @@ #include <linux/delay.h> #include <linux/slab.h> #include <linux/module.h> +#include <net/addrconf.h> #include "rds_single_path.h" #include "rds.h" @@ -295,6 +296,8 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, /* We will only ever look at IB transports */ if (conn->c_trans != &rds_ib_transport) return 0; + if (conn->c_isv6) + return 0; iinfo->src_addr = conn->c_laddr.s6_addr32[3]; iinfo->dst_addr = conn->c_faddr.s6_addr32[3]; @@ -330,7 +333,6 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len, sizeof(struct rds_info_rdma_connection)); } - /* * Early RDS/IB was built to only bind to an address if there is an IPoIB * device with that address set. @@ -346,8 +348,12 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, { int ret; struct rdma_cm_id *cm_id; + struct sockaddr_in6 sin6; struct sockaddr_in sin; + struct sockaddr *sa; + bool isv4; + isv4 = ipv6_addr_v4mapped(addr); /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. */ @@ -356,20 +362,53 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, if (IS_ERR(cm_id)) return PTR_ERR(cm_id); - memset(&sin, 0, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = addr->s6_addr32[3]; + if (isv4) { + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr->s6_addr32[3]; + sa = (struct sockaddr *)&sin; + } else { + memset(&sin6, 0, sizeof(sin6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = *addr; + sin6.sin6_scope_id = scope_id; + sa = (struct sockaddr *)&sin6; + + /* XXX Do a special IPv6 link local address check here. The + * reason is that rdma_bind_addr() always succeeds with IPv6 + * link local address regardless it is indeed configured in a + * system. 
+ */ + if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) { + struct net_device *dev; + + if (scope_id == 0) + return -EADDRNOTAVAIL; + + /* Use init_net for now as RDS is not network + * name space aware. + */ + dev = dev_get_by_index(&init_net, scope_id); + if (!dev) + return -EADDRNOTAVAIL; + if (!ipv6_chk_addr(&init_net, addr, dev, 1)) { + dev_put(dev); + return -EADDRNOTAVAIL; + } + dev_put(dev); + } + } /* rdma_bind_addr will only succeed for IB & iWARP devices */ - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + ret = rdma_bind_addr(cm_id, sa); /* due to this, we will claim to support iWARP devices unless we check node_type. */ if (ret || !cm_id->device || cm_id->device->node_type != RDMA_NODE_IB_CA) ret = -EADDRNOTAVAIL; - rdsdebug("addr %pI6c ret %d node type %d\n", - addr, ret, + rdsdebug("addr %pI6c%%%u ret %d node type %d\n", + addr, scope_id, ret, cm_id->device ? cm_id->device->node_type : -1); rdma_destroy_id(cm_id); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 31ffa70..03279f3 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -40,7 +40,6 @@ #include "rds_single_path.h" #include "rds.h" #include "ib.h" -#include "tcp.h" /* * Set the selected protocol version @@ -679,7 +678,7 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6) return version; } -/* Given an IPv6 address, find the IB net_device which hosts that address and +/* Given an IPv6 address, find the net_device which hosts that address and * return its index. This is used by the rds_ib_cm_handle_connect() code to * find the interface index of where an incoming request comes from when * the request is using a link local address. 
@@ -696,8 +695,7 @@ static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr) rcu_read_lock(); for_each_netdev_rcu(net, dev) { - if (dev->type == ARPHRD_INFINIBAND && - ipv6_chk_addr(net, addr, dev, 0)) { + if (ipv6_chk_addr(net, addr, dev, 0)) { idx = dev->ifindex; break; } @@ -887,7 +885,10 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp) /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ - handler = rds_rdma_cm_event_handler; + if (conn->c_isv6) + handler = rds6_rdma_cm_event_handler; + else + handler = rds_rdma_cm_event_handler; ic->i_cm_id = rdma_create_id(&init_net, handler, conn, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ic->i_cm_id)) { @@ -923,7 +924,7 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp) sin6 = (struct sockaddr_in6 *)&dest; sin6->sin6_family = AF_INET6; sin6->sin6_addr = conn->c_faddr; - sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT); + sin6->sin6_port = (__force u16)htons(RDS_CM_PORT); sin6->sin6_scope_id = conn->c_dev_if; } diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index d7da115..6a696b8 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -37,7 +37,9 @@ #include "rdma_transport.h" #include "ib.h" +/* Global IPv4 and IPv6 RDS RDMA listener cm_id */ static struct rdma_cm_id *rds_rdma_listen_id; +static struct rdma_cm_id *rds6_rdma_listen_id; int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, struct rdma_cm_event *event, @@ -153,6 +155,12 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, return rds_rdma_cm_event_handler_cmn(cm_id, event, false); } +int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + return rds_rdma_cm_event_handler_cmn(cm_id, event, true); +} + static int rds_rdma_listen_init_common(rdma_cm_event_handler handler, struct sockaddr *sa, struct rdma_cm_id **ret_cm_id) @@ -199,13 +207,14 @@ static int rds_rdma_listen_init_common(rdma_cm_event_handler 
handler, /* Initialize the RDS RDMA listeners. We create two listeners for * compatibility reason. The one on RDS_PORT is used for IPv4 - * requests only. The one on RDS_TCP_PORT is used for IPv6 requests + * requests only. The one on RDS_CM_PORT is used for IPv6 requests * only. So only IPv6 enabled RDS module will communicate using this * port. */ static int rds_rdma_listen_init(void) { int ret; + struct sockaddr_in6 sin6; struct sockaddr_in sin; sin.sin_family = PF_INET; @@ -214,7 +223,21 @@ static int rds_rdma_listen_init(void) ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler, (struct sockaddr *)&sin, &rds_rdma_listen_id); - return ret; + if (ret != 0) + return ret; + + sin6.sin6_family = PF_INET6; + sin6.sin6_addr = in6addr_any; + sin6.sin6_port = htons(RDS_CM_PORT); + sin6.sin6_scope_id = 0; + sin6.sin6_flowinfo = 0; + ret = rds_rdma_listen_init_common(rds6_rdma_cm_event_handler, + (struct sockaddr *)&sin6, + &rds6_rdma_listen_id); + /* Keep going even when IPv6 is not enabled in the system. 
*/ + if (ret != 0) + rdsdebug("Cannot set up IPv6 RDMA listener\n"); + return 0; } static void rds_rdma_listen_stop(void) @@ -224,6 +247,11 @@ static void rds_rdma_listen_stop(void) rdma_destroy_id(rds_rdma_listen_id); rds_rdma_listen_id = NULL; } + if (rds6_rdma_listen_id) { + rdsdebug("cm %p\n", rds6_rdma_listen_id); + rdma_destroy_id(rds6_rdma_listen_id); + rds6_rdma_listen_id = NULL; + } } static int rds_rdma_init(void) diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h index d309c44..bc3c639 100644 --- a/net/rds/rdma_transport.h +++ b/net/rds/rdma_transport.h @@ -11,6 +11,8 @@ int rds_rdma_conn_connect(struct rds_connection *conn); int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); +int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); /* from ib.c */ extern struct rds_transport rds_ib_transport; diff --git a/net/rds/rds.h b/net/rds/rds.h index 859808a..f5f99d1 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -24,13 +24,15 @@ #define RDS_PROTOCOL_MINOR(v) ((v) & 255) #define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) -/* - * XXX randomly chosen, but at least seems to be unused: - * # 18464-18768 Unassigned - * We should do better. We want a reserved port to discourage unpriv'ed - * userspace from listening. +/* The following ports, 16385, 18634, 18635, are registered with IANA as + * the ports to be used for RDS over TCP and UDP. 18634 is the historical + * value used for the RDMA_CM listener port. RDS/TCP uses port 16385. After + * IPv6 work, RDMA_CM also uses 16385 as the listener port. 18634 is kept + * to ensure compatibility with older RDS modules. 
*/ #define RDS_PORT 18634 +#define RDS_CM_PORT 16385 +#define RDS_TCP_PORT RDS_CM_PORT #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 diff --git a/net/rds/send.c b/net/rds/send.c index cc91860..3bc806b 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1105,8 +1105,27 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) break; case sizeof(*sin6): { - ret = -EPROTONOSUPPORT; - goto out; + int addr_type; + + if (sin6->sin6_family != AF_INET6) { + ret = -EINVAL; + goto out; + } + addr_type = ipv6_addr_type(&sin6->sin6_addr); + if (!(addr_type & IPV6_ADDR_UNICAST)) { + ret = -EINVAL; + goto out; + } + if (addr_type & IPV6_ADDR_LINKLOCAL && + sin6->sin6_scope_id == 0) { + ret = -EINVAL; + goto out; + } + + daddr = sin6->sin6_addr; + dport = sin6->sin6_port; + scope_id = sin6->sin6_scope_id; + break; } default: diff --git a/net/rds/tcp.c b/net/rds/tcp.c index beaff17..fb0dac1 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -46,7 +46,12 @@ /* only for info exporting */ static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); static LIST_HEAD(rds_tcp_tc_list); + +/* rds_tcp_tc_count counts only IPv4 connections. + * rds6_tcp_tc_count counts both IPv4 and IPv6 connections. 
+ */ static unsigned int rds_tcp_tc_count; +static unsigned int rds6_tcp_tc_count; /* Track rds_tcp_connection structs so they can be cleaned up */ static DEFINE_SPINLOCK(rds_tcp_conn_lock); @@ -113,7 +118,9 @@ void rds_tcp_restore_callbacks(struct socket *sock, /* done under the callback_lock to serialize with write_space */ spin_lock(&rds_tcp_tc_list_lock); list_del_init(&tc->t_list_item); - rds_tcp_tc_count--; + rds6_tcp_tc_count--; + if (!tc->t_cpath->cp_conn->c_isv6) + rds_tcp_tc_count--; spin_unlock(&rds_tcp_tc_list_lock); tc->t_sock = NULL; @@ -200,7 +207,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) /* done under the callback_lock to serialize with write_space */ spin_lock(&rds_tcp_tc_list_lock); list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); - rds_tcp_tc_count++; + rds6_tcp_tc_count++; + if (!tc->t_cpath->cp_conn->c_isv6) + rds_tcp_tc_count++; spin_unlock(&rds_tcp_tc_list_lock); /* accepted sockets need our listen data ready undone */ @@ -221,6 +230,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) write_unlock_bh(&sock->sk->sk_callback_lock); } +/* Handle RDS_INFO_TCP_SOCKETS socket option. It only returns IPv4 + * connections for backward compatibility. 
+ */ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) @@ -228,8 +240,6 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, struct rds_info_tcp_socket tsinfo; struct rds_tcp_connection *tc; unsigned long flags; - struct sockaddr_in sin; - struct socket *sock; spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); @@ -237,16 +247,15 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, goto out; list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { + struct inet_sock *inet = inet_sk(tc->t_sock->sk); - sock = tc->t_sock; - if (sock) { - sock->ops->getname(sock, (struct sockaddr *)&sin, 0); - tsinfo.local_addr = sin.sin_addr.s_addr; - tsinfo.local_port = sin.sin_port; - sock->ops->getname(sock, (struct sockaddr *)&sin, 1); - tsinfo.peer_addr = sin.sin_addr.s_addr; - tsinfo.peer_port = sin.sin_port; - } + if (tc->t_cpath->cp_conn->c_isv6) + continue; + + tsinfo.local_addr = inet->inet_saddr; + tsinfo.local_port = inet->inet_sport; + tsinfo.peer_addr = inet->inet_daddr; + tsinfo.peer_port = inet->inet_dport; tsinfo.hdr_rem = tc->t_tinc_hdr_rem; tsinfo.data_rem = tc->t_tinc_data_rem; @@ -495,13 +504,18 @@ static __net_init int rds_tcp_init_net(struct net *net) err = -ENOMEM; goto fail; } - rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net); + rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, true); if (!rtn->rds_tcp_listen_sock) { - pr_warn("could not set up listen sock\n"); - unregister_net_sysctl_table(rtn->rds_tcp_sysctl); - rtn->rds_tcp_sysctl = NULL; - err = -EAFNOSUPPORT; - goto fail; + pr_warn("could not set up IPv6 listen sock\n"); + + /* Try IPv4 as some systems disable IPv6 */ + rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false); + if (!rtn->rds_tcp_listen_sock) { + unregister_net_sysctl_table(rtn->rds_tcp_sysctl); + rtn->rds_tcp_sysctl = NULL; + err = -EAFNOSUPPORT; + goto fail; + } } INIT_WORK(&rtn->rds_tcp_accept_w, 
rds_tcp_accept_worker); return 0; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index c6fa080..6a948c1 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -2,8 +2,6 @@ #ifndef _RDS_TCP_H #define _RDS_TCP_H -#define RDS_TCP_PORT 16385 - struct rds_tcp_incoming { struct rds_incoming ti_inc; struct sk_buff_head ti_skb_list; @@ -67,7 +65,7 @@ void rds_tcp_restore_callbacks(struct socket *sock, void rds_tcp_state_change(struct sock *sk); /* tcp_listen.c */ -struct socket *rds_tcp_listen_init(struct net *); +struct socket *rds_tcp_listen_init(struct net *net, bool isv6); void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 0101033..039bd04 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -89,9 +89,11 @@ void rds_tcp_state_change(struct sock *sk) int rds_tcp_conn_path_connect(struct rds_conn_path *cp) { struct socket *sock = NULL; + struct sockaddr_in6 sin6; struct sockaddr_in sin; struct sockaddr *addr; int addrlen; + bool isv6; int ret; struct rds_connection *conn = cp->cp_conn; struct rds_tcp_connection *tc = cp->cp_transport_data; @@ -108,18 +110,36 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) mutex_unlock(&tc->t_conn_path_lock); return 0; } - ret = sock_create_kern(rds_conn_net(conn), PF_INET, - SOCK_STREAM, IPPROTO_TCP, &sock); + if (ipv6_addr_v4mapped(&conn->c_laddr)) { + ret = sock_create_kern(rds_conn_net(conn), PF_INET, + SOCK_STREAM, IPPROTO_TCP, &sock); + isv6 = false; + } else { + ret = sock_create_kern(rds_conn_net(conn), PF_INET6, + SOCK_STREAM, IPPROTO_TCP, &sock); + isv6 = true; + } + if (ret < 0) goto out; rds_tcp_tune(sock); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = (__force u32)conn->c_laddr.s6_addr32[3]; - sin.sin_port = (__force u16)htons(0); - addr = (struct sockaddr *)&sin; - addrlen = sizeof(sin); + if (isv6) { + sin6.sin6_family = 
AF_INET6; + sin6.sin6_addr = conn->c_laddr; + sin6.sin6_port = 0; + sin6.sin6_flowinfo = 0; + sin6.sin6_scope_id = conn->c_dev_if; + addr = (struct sockaddr *)&sin6; + addrlen = sizeof(sin6); + } else { + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = (__force u32)conn->c_laddr.s6_addr32[3]; + sin.sin_port = (__force u16)htons(0); + addr = (struct sockaddr *)&sin; + addrlen = sizeof(sin); + } ret = sock->ops->bind(sock, addr, addrlen); if (ret) { @@ -128,11 +148,21 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) goto out; } - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = (__force u32)conn->c_faddr.s6_addr32[3]; - sin.sin_port = (__force u16)htons(RDS_TCP_PORT); - addr = (struct sockaddr *)&sin; - addrlen = sizeof(sin); + if (isv6) { + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = conn->c_faddr; + sin6.sin6_port = htons(RDS_TCP_PORT); + sin6.sin6_flowinfo = 0; + sin6.sin6_scope_id = conn->c_dev_if; + addr = (struct sockaddr *)&sin6; + addrlen = sizeof(sin6); + } else { + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = (__force u32)conn->c_faddr.s6_addr32[3]; + sin.sin_port = (__force u16)htons(RDS_TCP_PORT); + addr = (struct sockaddr *)&sin; + addrlen = sizeof(sin); + } /* * once we call connect() we can start getting callbacks and they diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 4fdf5b3..0f996e4 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -256,15 +256,22 @@ void rds_tcp_listen_data_ready(struct sock *sk) ready(sk); } -struct socket *rds_tcp_listen_init(struct net *net) +struct socket *rds_tcp_listen_init(struct net *net, bool isv6) { - struct sockaddr_in sin; struct socket *sock = NULL; + struct sockaddr_storage ss; + struct sockaddr_in6 *sin6; + struct sockaddr_in *sin; + int addr_len; int ret; - ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (ret < 0) + ret = sock_create_kern(net, isv6 ? 
PF_INET6 : PF_INET, SOCK_STREAM, + IPPROTO_TCP, &sock); + if (ret < 0) { + rdsdebug("could not create %s listener socket: %d\n", + isv6 ? "IPv6" : "IPv4", ret); goto out; + } sock->sk->sk_reuse = SK_CAN_REUSE; rds_tcp_nonagle(sock); @@ -274,13 +281,28 @@ struct socket *rds_tcp_listen_init(struct net *net) sock->sk->sk_data_ready = rds_tcp_listen_data_ready; write_unlock_bh(&sock->sk->sk_callback_lock); - sin.sin_family = PF_INET; - sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); - sin.sin_port = (__force u16)htons(RDS_TCP_PORT); + if (isv6) { + sin6 = (struct sockaddr_in6 *)&ss; + sin6->sin6_family = PF_INET6; + sin6->sin6_addr = in6addr_any; + sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT); + sin6->sin6_scope_id = 0; + sin6->sin6_flowinfo = 0; + addr_len = sizeof(*sin6); + } else { + sin = (struct sockaddr_in *)&ss; + sin->sin_family = PF_INET; + sin->sin_addr.s_addr = INADDR_ANY; + sin->sin_port = (__force u16)htons(RDS_TCP_PORT); + addr_len = sizeof(*sin); + } - ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); - if (ret < 0) + ret = sock->ops->bind(sock, (struct sockaddr *)&ss, addr_len); + if (ret < 0) { + rdsdebug("could not bind %s listener socket: %d\n", + isv6 ? "IPv6" : "IPv4", ret); goto out; + } ret = sock->ops->listen(sock, 64); if (ret < 0) -- 1.8.3.1