The RDS module sits on top of TCP (rds_tcp) and IB (rds_rdma), so messages arrive in the form of an skb (over TCP) or a scatterlist (over IB/RDMA). However, because socket filters only deal with skbs (e.g. struct skb as the bpf context), we can only use a socket filter for rds_tcp and not for rds_rdma.
Considering one filtering solution for RDS, it seems that the common denominator between sk_buff and scatterlist is scatterlist. Therefore, this patch converts skb to sgvec and invoke sg_filter_run for rds_tcp and simply invoke sg_filter_run for IB/rds_rdma. Signed-off-by: Tushar Dave <tushar.n.d...@oracle.com> Reviewed-by: Sowmini Varadhan <sowmini.varad...@oracle.com> --- net/rds/ib.c | 1 + net/rds/ib.h | 1 + net/rds/ib_recv.c | 12 ++++++++++++ net/rds/rds.h | 2 ++ net/rds/recv.c | 17 +++++++++++++++++ net/rds/tcp.c | 2 ++ net/rds/tcp.h | 2 ++ net/rds/tcp_recv.c | 38 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 75 insertions(+) diff --git a/net/rds/ib.c b/net/rds/ib.c index 89c6333..6ba1f75 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -532,6 +532,7 @@ struct rds_transport rds_ib_transport = { .conn_path_shutdown = rds_ib_conn_path_shutdown, .inc_copy_to_user = rds_ib_inc_copy_to_user, .inc_free = rds_ib_inc_free, + .inc_to_sg_get = rds_ib_inc_to_sg_get, .cm_initiate_connect = rds_ib_cm_initiate_connect, .cm_handle_connect = rds_ib_cm_handle_connect, .cm_connect_complete = rds_ib_cm_connect_complete, diff --git a/net/rds/ib.h b/net/rds/ib.h index 73427ff..0a12b41 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -404,6 +404,7 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, void rds_ib_recv_free_caches(struct rds_ib_connection *ic); void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp); void rds_ib_inc_free(struct rds_incoming *inc); +int rds_ib_inc_to_sg_get(struct rds_incoming *inc, struct scatterlist **sg); int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc, struct rds_ib_ack_state *state); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index d300186..2f76a91 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -219,6 +219,18 @@ void rds_ib_inc_free(struct rds_incoming *inc) 
rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs); } +int rds_ib_inc_to_sg_get(struct rds_incoming *inc, struct scatterlist **sg) +{ + struct rds_ib_incoming *ibinc; + struct rds_page_frag *frag; + + ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item); + *sg = &frag->f_sg; + + return 0; +} + static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, struct rds_ib_recv_work *recv) { diff --git a/net/rds/rds.h b/net/rds/rds.h index c4dcf65..abcd5ce 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -542,6 +542,8 @@ struct rds_transport { int (*recv_path)(struct rds_conn_path *cp); int (*inc_copy_to_user)(struct rds_incoming *inc, struct iov_iter *to); void (*inc_free)(struct rds_incoming *inc); + int (*inc_to_sg_get)(struct rds_incoming *inc, struct scatterlist **sg); + void (*inc_to_sg_put)(struct scatterlist **sg); int (*cm_handle_connect)(struct rdma_cm_id *cm_id, struct rdma_cm_event *event, bool isv6); diff --git a/net/rds/recv.c b/net/rds/recv.c index 504cd6b..261904c 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -292,6 +292,8 @@ void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, struct sock *sk; unsigned long flags; struct rds_conn_path *cp; + struct sk_filter *filter; + int result = __SOCKSG_PASS; inc->i_conn = conn; inc->i_rx_jiffies = jiffies; @@ -376,6 +378,21 @@ void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, /* We can be racing with rds_release() which marks the socket dead. 
*/ sk = rds_rs_to_sk(rs); + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); + if (filter) { + if (conn->c_trans->inc_to_sg_get) { + struct scatterlist *sg; + + if (conn->c_trans->inc_to_sg_get(inc, &sg) == 0) { + result = sg_filter_run(sk, sg); + if (conn->c_trans->inc_to_sg_put) + conn->c_trans->inc_to_sg_put(&sg); + } + } + } + rcu_read_unlock(); + /* serialize with rds_release -> sock_orphan */ write_lock_irqsave(&rs->rs_recv_lock, flags); if (!sock_flag(sk, SOCK_DEAD)) { diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 2c7b7c3..35454c7 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -465,6 +465,8 @@ struct rds_transport rds_tcp_transport = { .conn_path_shutdown = rds_tcp_conn_path_shutdown, .inc_copy_to_user = rds_tcp_inc_copy_to_user, .inc_free = rds_tcp_inc_free, + .inc_to_sg_get = rds_tcp_inc_to_sg_get, + .inc_to_sg_put = rds_tcp_inc_to_sg_put, .stats_info_copy = rds_tcp_stats_info_copy, .exit = rds_tcp_exit, .t_owner = THIS_MODULE, diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 3c69361..b2cc910 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -82,6 +82,8 @@ void rds_tcp_restore_callbacks(struct socket *sock, int rds_tcp_recv_path(struct rds_conn_path *cp); void rds_tcp_inc_free(struct rds_incoming *inc); int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); +int rds_tcp_inc_to_sg_get(struct rds_incoming *inc, struct scatterlist **sg); +void rds_tcp_inc_to_sg_put(struct scatterlist **sg); /* tcp_send.c */ void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp); diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 42c5ff1..b45e69b 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -56,6 +56,44 @@ void rds_tcp_inc_free(struct rds_incoming *inc) kmem_cache_free(rds_tcp_incoming_slab, tinc); } +#define MAX_SG MAX_SKB_FRAGS +int rds_tcp_inc_to_sg_get(struct rds_incoming *inc, struct scatterlist **sg) +{ + struct scatterlist *sg_list; + struct rds_tcp_incoming *tinc; + struct sk_buff *skb; + int num_sg = 0; + 
+ tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); + + /* For now we are assuming that the max sg elements we need is MAX_SG. + * To determine actual number of sg elements we need to traverse the + * skb queue e.g. + * + * skb_queue_walk(&tinc->ti_skb_list, skb) { + * num_sg += skb_shinfo(skb)->nr_frags + 1; + * } + */ + sg_list = kzalloc(sizeof(*sg_list) * MAX_SG, GFP_KERNEL); + if (!sg_list) + return -ENOMEM; + + sg_init_table(sg_list, MAX_SG); + skb_queue_walk(&tinc->ti_skb_list, skb) { + num_sg += skb_to_sgvec_nomark(skb, &sg_list[num_sg], 0, + skb->len); + } + sg_mark_end(&sg_list[num_sg - 1]); + *sg = sg_list; + + return 0; +} + +void rds_tcp_inc_to_sg_put(struct scatterlist **sg) +{ + kfree(*sg); +} + /* * this is pretty lame, but, whatever. */ -- 1.8.3.1