From: Paolo Abeni <pab...@redhat.com>

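Loop in mptcp_sendmsg() until all the data in the msghdr has been
sent or an error occurs, moving the per-fragment work into the new
mptcp_sendmsg_frag() helper. This makes mptcp sendmsg() behaviour
more consistent and improves xmit performance.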

Signed-off-by: Paolo Abeni <pab...@redhat.com>
---
 net/mptcp/protocol.c | 126 ++++++++++++++++++++++++-------------------
 1 file changed, 71 insertions(+), 55 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index da983ea4fb5e..758369256f9b 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -58,71 +58,37 @@ static struct sock *mptcp_subflow_get_ref(const struct mptcp_sock *msk)
        return NULL;
 }
 
-static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
+                             struct msghdr *msg, long *timeo)
 {
        int mss_now = 0, size_goal = 0, ret = 0;
        struct mptcp_sock *msk = mptcp_sk(sk);
-       struct socket *ssock;
-       struct sock *ssk;
        struct mptcp_ext *mpext = NULL;
        struct page_frag *pfrag;
        struct sk_buff *skb;
        size_t psize;
-       int poffset;
-       long timeo;
-
-       lock_sock(sk);
-       ssock = __mptcp_fallback_get_ref(msk);
-       if (ssock) {
-               release_sock(sk);
-               pr_debug("fallback passthrough");
-               ret = sock_sendmsg(ssock, msg);
-               sock_put(ssock->sk);
-               return ret;
-       }
-
-       ssk = mptcp_subflow_get_ref(msk);
-       if (!ssk) {
-               release_sock(sk);
-               return -ENOTCONN;
-       }
-
-       if (!msg_data_left(msg)) {
-               pr_debug("empty send");
-               ret = sock_sendmsg(ssk->sk_socket, msg);
-               goto put_out;
-       }
-
-       if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) {
-               ret = -ENOTSUPP;
-               goto put_out;
-       }
-
-       lock_sock(ssk);
-       timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
 
        /* use the mptcp page cache so that we can easily move the data
         * from one substream to another, but do per subflow memory accounting
         */
        pfrag = sk_page_frag(sk);
        while (!sk_page_frag_refill(ssk, pfrag)) {
-               ret = sk_stream_wait_memory(ssk, &timeo);
+               ret = sk_stream_wait_memory(ssk, timeo);
                if (ret)
-                       goto put_out;
+                       return ret;
        }
 
-       /* Copy to page */
-       poffset = pfrag->offset;
+       /* compute copy limit */
+       mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
+       psize = min_t(int, pfrag->size - pfrag->offset, size_goal);
+
        pr_debug("left=%zu", msg_data_left(msg));
-       psize = copy_page_from_iter(pfrag->page, poffset,
-                                   min_t(size_t, msg_data_left(msg),
-                                         pfrag->size - poffset),
+       psize = copy_page_from_iter(pfrag->page, pfrag->offset,
+                                   min_t(size_t, msg_data_left(msg), psize),
                                    &msg->msg_iter);
        pr_debug("left=%zu", msg_data_left(msg));
-       if (!psize) {
-               ret = -EINVAL;
-               goto put_out;
-       }
+       if (!psize)
+               return -EINVAL;
 
        /* Mark the end of the previous write so the beginning of the
         * next write (with its own mptcp skb extension data) is not
@@ -132,20 +98,15 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
        if (skb)
                TCP_SKB_CB(skb)->eor = 1;
 
-       mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
-       psize = min_t(int, size_goal, psize);
-       ret = do_tcp_sendpages(ssk, pfrag->page, poffset, psize,
+       ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize,
                               msg->msg_flags | MSG_SENDPAGE_NOTLAST);
        if (ret <= 0)
-               goto put_out;
-
-       if (skb == tcp_write_queue_tail(ssk))
-               pr_err("no new skb %p/%p", sk, ssk);
+               return ret;
+       if (unlikely(ret < psize))
+               iov_iter_revert(&msg->msg_iter, psize - ret);
 
        skb = tcp_write_queue_tail(ssk);
-
        mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
-
        if (mpext) {
                memset(mpext, 0, sizeof(*mpext));
                mpext->data_seq = msk->write_seq;
@@ -165,6 +126,61 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
        mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
 
        tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal);
+       return ret;
+}
+
+static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+       struct mptcp_sock *msk = mptcp_sk(sk);
+       struct socket *ssock;
+       size_t copied = 0;
+       struct sock *ssk;
+       int ret = 0;
+       long timeo;
+
+       lock_sock(sk);
+       ssock = __mptcp_fallback_get_ref(msk);
+       if (ssock) {
+               release_sock(sk);
+               pr_debug("fallback passthrough");
+               ret = sock_sendmsg(ssock, msg);
+               sock_put(ssock->sk);
+               return ret;
+       }
+
+       ssk = mptcp_subflow_get_ref(msk);
+       if (!ssk) {
+               release_sock(sk);
+               return -ENOTCONN;
+       }
+
+       if (!msg_data_left(msg)) {
+               pr_debug("empty send");
+               ret = sock_sendmsg(ssk->sk_socket, msg);
+               goto put_out;
+       }
+
+       pr_debug("conn_list->subflow=%p", ssk);
+
+       if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) {
+               ret = -ENOTSUPP;
+               goto put_out;
+       }
+
+       lock_sock(ssk);
+       timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+       while (msg_data_left(msg)) {
+               ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo);
+               if (ret < 0)
+                       break;
+
+               copied += ret;
+       }
+
+       if (copied > 0)
+               ret = copied;
+
+       release_sock(ssk);
 
 put_out:
        release_sock(sk);
-- 
2.23.0

Reply via email to