Since moving to memalloc_nofs_save/restore, SUNRPC has stopped setting the
GFP_NOIO flag on sk_allocation which the networking system uses to decide
when it is safe to use current->task_frag.  The results of this are
unexpected corruption in task_frag when SUNRPC is involved in memory
reclaim.

The corruption can be seen in crashes, but the root cause is often
difficult to ascertain as a crashing machine's stack trace will have no
evidence of being near NFS or SUNRPC code.  I believe this problem to
be much more pervasive than reports to the community may indicate.

Fix this by having kernel users of sockets that may corrupt task_frag due
to reclaim set sk_use_task_frag = false.  Preemptively correcting this
situation for users that still set sk_allocation allows them to convert to
memalloc_nofs_save/restore without the same unexpected corruptions that are
sure to follow, unlikely to show up in testing, and difficult to bisect.

CC: Philipp Reisner <[email protected]>
CC: Lars Ellenberg <[email protected]>
CC: "Christoph Böhmwalder" <[email protected]>
CC: Jens Axboe <[email protected]>
CC: Josef Bacik <[email protected]>
CC: Keith Busch <[email protected]>
CC: Christoph Hellwig <[email protected]>
CC: Sagi Grimberg <[email protected]>
CC: Lee Duncan <[email protected]>
CC: Chris Leech <[email protected]>
CC: Mike Christie <[email protected]>
CC: "James E.J. Bottomley" <[email protected]>
CC: "Martin K. Petersen" <[email protected]>
CC: Valentina Manea <[email protected]>
CC: Shuah Khan <[email protected]>
CC: Greg Kroah-Hartman <[email protected]>
CC: David Howells <[email protected]>
CC: Marc Dionne <[email protected]>
CC: Steve French <[email protected]>
CC: Christine Caulfield <[email protected]>
CC: David Teigland <[email protected]>
CC: Mark Fasheh <[email protected]>
CC: Joel Becker <[email protected]>
CC: Joseph Qi <[email protected]>
CC: Eric Van Hensbergen <[email protected]>
CC: Latchesar Ionkov <[email protected]>
CC: Dominique Martinet <[email protected]>
CC: "David S. Miller" <[email protected]>
CC: Eric Dumazet <[email protected]>
CC: Jakub Kicinski <[email protected]>
CC: Paolo Abeni <[email protected]>
CC: Ilya Dryomov <[email protected]>
CC: Xiubo Li <[email protected]>
CC: Chuck Lever <[email protected]>
CC: Jeff Layton <[email protected]>
CC: Trond Myklebust <[email protected]>
CC: Anna Schumaker <[email protected]>
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]
CC: [email protected]

Suggested-by: Guillaume Nault <[email protected]>
Signed-off-by: Benjamin Coddington <[email protected]>
---
 drivers/block/drbd/drbd_receiver.c | 3 +++
 drivers/block/nbd.c                | 1 +
 drivers/nvme/host/tcp.c            | 1 +
 drivers/scsi/iscsi_tcp.c           | 1 +
 drivers/usb/usbip/usbip_common.c   | 1 +
 fs/afs/rxrpc.c                     | 1 +
 fs/cifs/connect.c                  | 1 +
 fs/dlm/lowcomms.c                  | 2 ++
 fs/ocfs2/cluster/tcp.c             | 1 +
 net/9p/trans_fd.c                  | 1 +
 net/ceph/messenger.c               | 1 +
 net/sunrpc/xprtsock.c              | 3 +++
 12 files changed, 17 insertions(+)

diff --git a/drivers/block/drbd/drbd_receiver.c 
b/drivers/block/drbd/drbd_receiver.c
index af4c7d65490b..09ad8d82c200 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1030,6 +1030,9 @@ static int conn_connect(struct drbd_connection 
*connection)
        sock.socket->sk->sk_allocation = GFP_NOIO;
        msock.socket->sk->sk_allocation = GFP_NOIO;
 
+       sock.socket->sk->sk_use_task_frag = false;
+       msock.socket->sk->sk_use_task_frag = false;
+
        sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
        msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
 
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 2a709daefbc4..815ee631ed30 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -514,6 +514,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int 
send,
        noreclaim_flag = memalloc_noreclaim_save();
        do {
                sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
+               sock->sk->sk_use_task_frag = false;
                msg.msg_name = NULL;
                msg.msg_namelen = 0;
                msg.msg_control = NULL;
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index d5871fd6f769..e01d78858cb4 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1531,6 +1531,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
        queue->sock->sk->sk_rcvtimeo = 10 * HZ;
 
        queue->sock->sk->sk_allocation = GFP_ATOMIC;
+       queue->sock->sk->sk_use_task_frag = false;
        nvme_tcp_set_queue_io_cpu(queue);
        queue->request = NULL;
        queue->data_remaining = 0;
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 29b1bd755afe..733e540d0abf 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -733,6 +733,7 @@ iscsi_sw_tcp_conn_bind(struct iscsi_cls_session 
*cls_session,
        sk->sk_reuse = SK_CAN_REUSE;
        sk->sk_sndtimeo = 15 * HZ; /* FIXME: make it configurable */
        sk->sk_allocation = GFP_ATOMIC;
+       sk->sk_use_task_frag = false;
        sk_set_memalloc(sk);
        sock_no_linger(sk);
 
diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c
index 2ab99244bc31..76bfc6e43881 100644
--- a/drivers/usb/usbip/usbip_common.c
+++ b/drivers/usb/usbip/usbip_common.c
@@ -315,6 +315,7 @@ int usbip_recv(struct socket *sock, void *buf, int size)
 
        do {
                sock->sk->sk_allocation = GFP_NOIO;
+               sock->sk->sk_use_task_frag = false;
 
                result = sock_recvmsg(sock, &msg, MSG_WAITALL);
                if (result <= 0)
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index eccc3cd0cb70..ac75ad18db83 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -46,6 +46,7 @@ int afs_open_socket(struct afs_net *net)
                goto error_1;
 
        socket->sk->sk_allocation = GFP_NOFS;
+       socket->sk->sk_use_task_frag = false;
 
        /* bind the callback manager's address to make this a server socket */
        memset(&srx, 0, sizeof(srx));
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7ae6f2c08153..c2b0d6f59f79 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2935,6 +2935,7 @@ generic_ip_connect(struct TCP_Server_Info *server)
                cifs_dbg(FYI, "Socket created\n");
                server->ssocket = socket;
                socket->sk->sk_allocation = GFP_NOFS;
+               socket->sk->sk_use_task_frag = false;
                if (sfamily == AF_INET6)
                        cifs_reclassify_socket6(socket);
                else
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index a4e84e8d94c8..4cf29ac3c428 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -699,6 +699,7 @@ static void add_listen_sock(struct socket *sock, struct 
listen_connection *con)
 
        sk->sk_user_data = con;
        sk->sk_allocation = GFP_NOFS;
+       sk->sk_use_task_frag = false;
        /* Install a data_ready callback */
        sk->sk_data_ready = lowcomms_listen_data_ready;
        release_sock(sk);
@@ -718,6 +719,7 @@ static void add_sock(struct socket *sock, struct connection 
*con)
        sk->sk_write_space = lowcomms_write_space;
        sk->sk_state_change = lowcomms_state_change;
        sk->sk_allocation = GFP_NOFS;
+       sk->sk_use_task_frag = false;
        sk->sk_error_report = lowcomms_error_report;
        release_sock(sk);
 }
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index f660c0dbdb63..3eaafa5e5ec4 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1604,6 +1604,7 @@ static void o2net_start_connect(struct work_struct *work)
        sc->sc_sock = sock; /* freed by sc_kref_release */
 
        sock->sk->sk_allocation = GFP_ATOMIC;
+       sock->sk->sk_use_task_frag = false;
 
        myaddr.sin_family = AF_INET;
        myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index e758978b44be..96f803499323 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -851,6 +851,7 @@ static int p9_socket_open(struct p9_client *client, struct 
socket *csocket)
                return -ENOMEM;
 
        csocket->sk->sk_allocation = GFP_NOIO;
+       csocket->sk->sk_use_task_frag = false;
        file = sock_alloc_file(csocket, 0, NULL);
        if (IS_ERR(file)) {
                pr_err("%s (%d): failed to map fd\n",
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index d3bb656308b4..cad8e0ca8432 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -446,6 +446,7 @@ int ceph_tcp_connect(struct ceph_connection *con)
        if (ret)
                return ret;
        sock->sk->sk_allocation = GFP_NOFS;
+       sock->sk->sk_use_task_frag = false;
 
 #ifdef CONFIG_LOCKDEP
        lockdep_set_class(&sock->sk->sk_lock, &socket_class);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e976007f4fd0..d3170b753dfc 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1882,6 +1882,7 @@ static int xs_local_finish_connecting(struct rpc_xprt 
*xprt,
                sk->sk_write_space = xs_udp_write_space;
                sk->sk_state_change = xs_local_state_change;
                sk->sk_error_report = xs_error_report;
+               sk->sk_use_task_frag = false;
 
                xprt_clear_connected(xprt);
 
@@ -2083,6 +2084,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt 
*xprt, struct socket *sock)
                sk->sk_user_data = xprt;
                sk->sk_data_ready = xs_data_ready;
                sk->sk_write_space = xs_udp_write_space;
+               sk->sk_use_task_frag = false;
 
                xprt_set_connected(xprt);
 
@@ -2250,6 +2252,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt 
*xprt, struct socket *sock)
                sk->sk_state_change = xs_tcp_state_change;
                sk->sk_write_space = xs_tcp_write_space;
                sk->sk_error_report = xs_error_report;
+               sk->sk_use_task_frag = false;
 
                /* socket options */
                sock_reset_flag(sk, SOCK_LINGER);
-- 
2.31.1

Reply via email to