[Patch v6 03/22] CIFS: SMBD: Add rdma mount option

2017-11-04 Thread Long Li
From: Long Li 

Add "rdma" to CIFS mount options to connect to SMB Direct.
Add checks to validate this is used on SMB 3.X dialects.

To connect to SMBDirect, use "mount.cifs -o rdma,vers=3.x".
At the time of this patch, 3.x can be 3.0, 3.02 or 3.1.1.

Signed-off-by: Long Li 
---
 fs/cifs/cifs_debug.c |  2 ++
 fs/cifs/cifsfs.c     |  2 ++
 fs/cifs/cifsglob.h   |  7 +++++++
 fs/cifs/connect.c    | 15 ++++++++++++++-
 4 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 9727e1d..ba0870d 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -171,6 +171,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, 
void *v)
ses->ses_count, ses->serverOS, ses->serverNOS,
ses->capabilities, ses->status);
}
+   if (server->rdma)
+   seq_printf(m, "RDMA\n\t");
seq_printf(m, "TCP status: %d\n\tLocal Users To "
   "Server: %d SecMode: 0x%x Req On Wire: %d",
   server->tcpStatus, server->srv_count,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 180b335..e15fbf1 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -327,6 +327,8 @@ cifs_show_address(struct seq_file *s, struct 
TCP_Server_Info *server)
default:
seq_puts(s, "(unknown)");
}
+   if (server->rdma)
+   seq_puts(s, ",rdma");
 }
 
 static void
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 808486c..09f9a71 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -530,6 +530,7 @@ struct smb_vol {
bool nopersistent:1;
bool resilient:1; /* noresilient not required since not fored for CA */
bool domainauto:1;
+   bool rdma:1;
unsigned int rsize;
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
@@ -646,6 +647,12 @@ struct TCP_Server_Info {
bool sec_kerberos;   /* supports plain Kerberos */
bool sec_mskerberos; /* supports legacy MS Kerberos */
bool large_buf;  /* is current buffer large? */
+   /* use SMBD connection instead of socket */
+   bool rdma;
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   /* point to the SMBD connection if RDMA is used instead of socket */
+   struct smbd_connection *smbd_conn;
+#endif
struct delayed_work echo; /* echo ping workqueue job */
char *smallbuf;  /* pointer to current "small" buffer */
char *bigbuf; /* pointer to current "big" buffer */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 59647eb..b5a575f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -92,7 +92,7 @@ enum {
Opt_multiuser, Opt_sloppy, Opt_nosharesock,
Opt_persistent, Opt_nopersistent,
Opt_resilient, Opt_noresilient,
-   Opt_domainauto,
+   Opt_domainauto, Opt_rdma,
 
/* Mount options which take numeric value */
Opt_backupuid, Opt_backupgid, Opt_uid,
@@ -183,6 +183,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_resilient, "resilienthandles"},
{ Opt_noresilient, "noresilienthandles"},
{ Opt_domainauto, "domainauto"},
+   { Opt_rdma, "rdma"},
 
{ Opt_backupuid, "backupuid=%s" },
{ Opt_backupgid, "backupgid=%s" },
@@ -1538,6 +1539,9 @@ cifs_parse_mount_options(const char *mountdata, const 
char *devname,
case Opt_domainauto:
vol->domainauto = true;
break;
+   case Opt_rdma:
+   vol->rdma = true;
+   break;
 
/* Numeric Values */
case Opt_backupuid:
@@ -1928,6 +1932,11 @@ cifs_parse_mount_options(const char *mountdata, const 
char *devname,
goto cifs_parse_mount_err;
}
 
+   if (vol->rdma && vol->vals->protocol_id < SMB30_PROT_ID) {
+   cifs_dbg(VFS, "SMB Direct requires Version >=3.0\n");
+   goto cifs_parse_mount_err;
+   }
+
 #ifndef CONFIG_KEYS
/* Muliuser mounts require CONFIG_KEYS support */
if (vol->multiuser) {
@@ -2131,6 +2140,9 @@ static int match_server(struct TCP_Server_Info *server, 
struct smb_vol *vol)
if (server->echo_interval != vol->echo_interval * HZ)
return 0;
 
+   if (server->rdma != vol->rdma)
+   return 0;
+
return 1;
 }
 
@@ -2229,6 +2241,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
tcp_ses->noblocksnd = volume_info->noblocksnd;
tcp_ses->noautotune = volume_info->noautotune;
tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay;
+   tcp_ses->rdma = volume_info->rdma;
tcp_ses->in_flight = 0;
tcp_ses->credits = 1;

[Patch v6 03/22] CIFS: SMBD: Add rdma mount option

2017-11-04 Thread Long Li
From: Long Li 

Add "rdma" to CIFS mount options to connect to SMB Direct.
Add checks to validate this is used on SMB 3.X dialects.

To connect to SMBDirect, use "mount.cifs -o rdma,vers=3.x".
At the time of this patch, 3.x can be 3.0, 3.02 or 3.1.1.

Signed-off-by: Long Li 
---
 fs/cifs/cifs_debug.c |  2 ++
 fs/cifs/cifsfs.c     |  2 ++
 fs/cifs/cifsglob.h   |  7 +++++++
 fs/cifs/connect.c    | 15 ++++++++++++++-
 4 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 9727e1d..ba0870d 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -171,6 +171,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, 
void *v)
ses->ses_count, ses->serverOS, ses->serverNOS,
ses->capabilities, ses->status);
}
+   if (server->rdma)
+   seq_printf(m, "RDMA\n\t");
seq_printf(m, "TCP status: %d\n\tLocal Users To "
   "Server: %d SecMode: 0x%x Req On Wire: %d",
   server->tcpStatus, server->srv_count,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 180b335..e15fbf1 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -327,6 +327,8 @@ cifs_show_address(struct seq_file *s, struct 
TCP_Server_Info *server)
default:
seq_puts(s, "(unknown)");
}
+   if (server->rdma)
+   seq_puts(s, ",rdma");
 }
 
 static void
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 808486c..09f9a71 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -530,6 +530,7 @@ struct smb_vol {
bool nopersistent:1;
bool resilient:1; /* noresilient not required since not fored for CA */
bool domainauto:1;
+   bool rdma:1;
unsigned int rsize;
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
@@ -646,6 +647,12 @@ struct TCP_Server_Info {
bool sec_kerberos;   /* supports plain Kerberos */
bool sec_mskerberos; /* supports legacy MS Kerberos */
bool large_buf;  /* is current buffer large? */
+   /* use SMBD connection instead of socket */
+   bool rdma;
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   /* point to the SMBD connection if RDMA is used instead of socket */
+   struct smbd_connection *smbd_conn;
+#endif
struct delayed_work echo; /* echo ping workqueue job */
char *smallbuf;  /* pointer to current "small" buffer */
char *bigbuf; /* pointer to current "big" buffer */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 59647eb..b5a575f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -92,7 +92,7 @@ enum {
Opt_multiuser, Opt_sloppy, Opt_nosharesock,
Opt_persistent, Opt_nopersistent,
Opt_resilient, Opt_noresilient,
-   Opt_domainauto,
+   Opt_domainauto, Opt_rdma,
 
/* Mount options which take numeric value */
Opt_backupuid, Opt_backupgid, Opt_uid,
@@ -183,6 +183,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_resilient, "resilienthandles"},
{ Opt_noresilient, "noresilienthandles"},
{ Opt_domainauto, "domainauto"},
+   { Opt_rdma, "rdma"},
 
{ Opt_backupuid, "backupuid=%s" },
{ Opt_backupgid, "backupgid=%s" },
@@ -1538,6 +1539,9 @@ cifs_parse_mount_options(const char *mountdata, const 
char *devname,
case Opt_domainauto:
vol->domainauto = true;
break;
+   case Opt_rdma:
+   vol->rdma = true;
+   break;
 
/* Numeric Values */
case Opt_backupuid:
@@ -1928,6 +1932,11 @@ cifs_parse_mount_options(const char *mountdata, const 
char *devname,
goto cifs_parse_mount_err;
}
 
+   if (vol->rdma && vol->vals->protocol_id < SMB30_PROT_ID) {
+   cifs_dbg(VFS, "SMB Direct requires Version >=3.0\n");
+   goto cifs_parse_mount_err;
+   }
+
 #ifndef CONFIG_KEYS
/* Muliuser mounts require CONFIG_KEYS support */
if (vol->multiuser) {
@@ -2131,6 +2140,9 @@ static int match_server(struct TCP_Server_Info *server, 
struct smb_vol *vol)
if (server->echo_interval != vol->echo_interval * HZ)
return 0;
 
+   if (server->rdma != vol->rdma)
+   return 0;
+
return 1;
 }
 
@@ -2229,6 +2241,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
tcp_ses->noblocksnd = volume_info->noblocksnd;
tcp_ses->noautotune = volume_info->noautotune;
tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay;
+   tcp_ses->rdma = volume_info->rdma;
tcp_ses->in_flight = 0;
tcp_ses->credits = 1;
init_waitqueue_head(&tcp_ses->response_q);
-- 
2.7.4



[Patch v6 04/22] CIFS: SMBD: Add SMB Direct protocol initial values and constants

2017-11-04 Thread Long Li
From: Long Li 

To prepare for protocol implementation, add constants and user-configurable
values for the SMB Direct protocol.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 77 +
 fs/cifs/smbdirect.h | 21 +++
 2 files changed, 98 insertions(+)
 create mode 100644 fs/cifs/smbdirect.c
 create mode 100644 fs/cifs/smbdirect.h

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
new file mode 100644
index 000..d3c16f8
--- /dev/null
+++ b/fs/cifs/smbdirect.c
@@ -0,0 +1,77 @@
+/*
+ *   Copyright (C) 2017, Microsoft Corporation.
+ *
+ *   Author(s): Long Li 
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ */
+#include "smbdirect.h"
+
+/* SMBD version number */
+#define SMBD_V1    0x0100
+
+/* Port numbers for SMBD transport */
+#define SMB_PORT   445
+#define SMBD_PORT  5445
+
+/* Address lookup and resolve timeout in ms */
+#define RDMA_RESOLVE_TIMEOUT   5000
+
+/* SMBD negotiation timeout in seconds */
+#define SMBD_NEGOTIATE_TIMEOUT 120
+
+/* SMBD minimum receive size and fragmented sized defined in [MS-SMBD] */
+#define SMBD_MIN_RECEIVE_SIZE  128
+#define SMBD_MIN_FRAGMENTED_SIZE   131072
+
+/*
+ * Default maximum number of RDMA read/write outstanding on this connection
+ * This value is possibly decreased during QP creation on hardware limit
+ */
+#define SMBD_CM_RESPONDER_RESOURCES    32
+
+/* Maximum number of retries on data transfer operations */
+#define SMBD_CM_RETRY  6
+/* No need to retry on Receiver Not Ready since SMBD manages credits */
+#define SMBD_CM_RNR_RETRY  0
+
+/*
+ * User configurable initial values per SMBD transport connection
+ * as defined in [MS-SMBD] 3.1.1.1
+ * Those may change after a SMBD negotiation
+ */
+/* The local peer's maximum number of credits to grant to the peer */
+int smbd_receive_credit_max = 255;
+
+/* The remote peer's credit request of local peer */
+int smbd_send_credit_target = 255;
+
+/* The maximum single message size can be sent to remote peer */
+int smbd_max_send_size = 1364;
+
+/*  The maximum fragmented upper-layer payload receive size supported */
+int smbd_max_fragmented_recv_size = 1024 * 1024;
+
+/*  The maximum single-message size which can be received */
+int smbd_max_receive_size = 8192;
+
+/* The timeout to initiate send of a keepalive message on idle */
+int smbd_keep_alive_interval = 120;
+
+/*
+ * User configurable initial values for RDMA transport
+ * The actual values used may be lower and are limited to hardware capabilities
+ */
+/* Default maximum number of SGEs in a RDMA write/read */
+int smbd_max_frmr_depth = 2048;
+
+/* If payload is less than this byte, use RDMA send/recv not read/write */
+int rdma_readwrite_threshold = 4096;
diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h
new file mode 100644
index 000..c55f28b
--- /dev/null
+++ b/fs/cifs/smbdirect.h
@@ -0,0 +1,21 @@
+/*
+ *   Copyright (C) 2017, Microsoft Corporation.
+ *
+ *   Author(s): Long Li 
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ */
+#ifndef _SMBDIRECT_H
+#define _SMBDIRECT_H
+
+/* Default maximum number of SGEs in a RDMA send/recv */
+#define SMBDIRECT_MAX_SGE  16
+#endif
-- 
2.7.4



[Patch v6 21/22] CIFS: SMBD: Upper layer performs SMB read via RDMA write through memory registration

2017-11-04 Thread Long Li
From: Long Li 

If I/O size is larger than rdma_readwrite_threshold, use RDMA write for
SMB read by specifying channel SMB2_CHANNEL_RDMA_V1 or
SMB2_CHANNEL_RDMA_V1_INVALIDATE in the SMB packet, depending on SMB dialect
used. Append a smbd_buffer_descriptor_v1 to the end of the SMB packet and
fill in other values to indicate this SMB read uses RDMA write.

There is no need to read from the transport for incoming payload. At the
time SMB read response comes back, the data is already transferred and
placed in the pages by RDMA hardware.

When SMB read is finished, deregister the memory regions if RDMA write is
used for this SMB read. smbd_deregister_mr may need to do local
invalidation and sleep, if server remote invalidation is not used.

There are situations where the MID may not be created on I/O failure, under
which memory region is deregistered when read data context is released.

Signed-off-by: Long Li 
---
 fs/cifs/file.c    | 19 +++++++++++++++++--
 fs/cifs/smb2pdu.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0786f19..94479ef 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -42,7 +42,9 @@
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
 #include "fscache.h"
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#include "smbdirect.h"
+#endif
 
 static inline int cifs_convert_flags(unsigned int flags)
 {
@@ -2908,7 +2910,12 @@ cifs_readdata_release(struct kref *refcount)
 {
struct cifs_readdata *rdata = container_of(refcount,
struct cifs_readdata, refcount);
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (rdata->mr) {
+   smbd_deregister_mr(rdata->mr);
+   rdata->mr = NULL;
+   }
+#endif
if (rdata->cfile)
cifsFileInfo_put(rdata->cfile);
 
@@ -3037,6 +3044,10 @@ uncached_fill_pages(struct TCP_Server_Info *server,
}
if (iter)
result = copy_page_from_iter(page, 0, n, iter);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   else if (rdata->mr)
+   result = n;
+#endif
else
result = cifs_read_page_from_socket(server, page, n);
if (result < 0)
@@ -3606,6 +3617,10 @@ readpages_fill_pages(struct TCP_Server_Info *server,
 
if (iter)
result = copy_page_from_iter(page, 0, n, iter);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   else if (rdata->mr)
+   result = n;
+#endif
else
result = cifs_read_page_from_socket(server, page, n);
if (result < 0)
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 8ef4a2f..f07eb37 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2381,7 +2381,40 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
req->MinimumCount = 0;
req->Length = cpu_to_le32(io_parms->length);
req->Offset = cpu_to_le64(io_parms->offset);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   /*
+* If we want to do a RDMA write, fill in and append
+* smbd_buffer_descriptor_v1 to the end of read request
+*/
+   if (server->rdma && rdata &&
+   rdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) {
+
+   struct smbd_buffer_descriptor_v1 *v1;
+   bool need_invalidate =
+   io_parms->tcon->ses->server->dialect == SMB30_PROT_ID;
+
+   rdata->mr = smbd_register_mr(
+   server->smbd_conn, rdata->pages,
+   rdata->nr_pages, rdata->tailsz,
+   true, need_invalidate);
+   if (!rdata->mr)
+   return -ENOBUFS;
+
+   req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE;
+   if (need_invalidate)
+   req->Channel = SMB2_CHANNEL_RDMA_V1;
+   req->ReadChannelInfoOffset =
+   offsetof(struct smb2_read_plain_req, Buffer);
+   req->ReadChannelInfoLength =
+   sizeof(struct smbd_buffer_descriptor_v1);
+   v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0];
+   v1->offset = rdata->mr->mr->iova;
+   v1->token = rdata->mr->mr->rkey;
+   v1->length = rdata->mr->mr->length;
 
+   *total_len += sizeof(*v1) - 1;
+   }
+#endif
if (request_type & CHAINED_REQUEST) {
if (!(request_type & END_OF_CHAIN)) {
/* next 8-byte aligned request */
@@ -2460,7 +2493,17 @@ smb2_readv_callback(struct mid_q_entry *mid)
if (rdata->result != -ENODATA)
rdata->result = -EIO;
}
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   /*
+* If this rdata has a memory registered, the MR can be freed

[Patch v6 04/22] CIFS: SMBD: Add SMB Direct protocol initial values and constants

2017-11-04 Thread Long Li
From: Long Li 

To prepare for protocol implementation, add constants and user-configurable
values for the SMB Direct protocol.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 77 +
 fs/cifs/smbdirect.h | 21 +++
 2 files changed, 98 insertions(+)
 create mode 100644 fs/cifs/smbdirect.c
 create mode 100644 fs/cifs/smbdirect.h

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
new file mode 100644
index 000..d3c16f8
--- /dev/null
+++ b/fs/cifs/smbdirect.c
@@ -0,0 +1,77 @@
+/*
+ *   Copyright (C) 2017, Microsoft Corporation.
+ *
+ *   Author(s): Long Li 
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ */
+#include "smbdirect.h"
+
+/* SMBD version number */
+#define SMBD_V1    0x0100
+
+/* Port numbers for SMBD transport */
+#define SMB_PORT   445
+#define SMBD_PORT  5445
+
+/* Address lookup and resolve timeout in ms */
+#define RDMA_RESOLVE_TIMEOUT   5000
+
+/* SMBD negotiation timeout in seconds */
+#define SMBD_NEGOTIATE_TIMEOUT 120
+
+/* SMBD minimum receive size and fragmented sized defined in [MS-SMBD] */
+#define SMBD_MIN_RECEIVE_SIZE  128
+#define SMBD_MIN_FRAGMENTED_SIZE   131072
+
+/*
+ * Default maximum number of RDMA read/write outstanding on this connection
+ * This value is possibly decreased during QP creation on hardware limit
+ */
+#define SMBD_CM_RESPONDER_RESOURCES    32
+
+/* Maximum number of retries on data transfer operations */
+#define SMBD_CM_RETRY  6
+/* No need to retry on Receiver Not Ready since SMBD manages credits */
+#define SMBD_CM_RNR_RETRY  0
+
+/*
+ * User configurable initial values per SMBD transport connection
+ * as defined in [MS-SMBD] 3.1.1.1
+ * Those may change after a SMBD negotiation
+ */
+/* The local peer's maximum number of credits to grant to the peer */
+int smbd_receive_credit_max = 255;
+
+/* The remote peer's credit request of local peer */
+int smbd_send_credit_target = 255;
+
+/* The maximum single message size can be sent to remote peer */
+int smbd_max_send_size = 1364;
+
+/*  The maximum fragmented upper-layer payload receive size supported */
+int smbd_max_fragmented_recv_size = 1024 * 1024;
+
+/*  The maximum single-message size which can be received */
+int smbd_max_receive_size = 8192;
+
+/* The timeout to initiate send of a keepalive message on idle */
+int smbd_keep_alive_interval = 120;
+
+/*
+ * User configurable initial values for RDMA transport
+ * The actual values used may be lower and are limited to hardware capabilities
+ */
+/* Default maximum number of SGEs in a RDMA write/read */
+int smbd_max_frmr_depth = 2048;
+
+/* If payload is less than this byte, use RDMA send/recv not read/write */
+int rdma_readwrite_threshold = 4096;
diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h
new file mode 100644
index 000..c55f28b
--- /dev/null
+++ b/fs/cifs/smbdirect.h
@@ -0,0 +1,21 @@
+/*
+ *   Copyright (C) 2017, Microsoft Corporation.
+ *
+ *   Author(s): Long Li 
+ *
+ *   This program is free software;  you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU General Public License for more details.
+ */
+#ifndef _SMBDIRECT_H
+#define _SMBDIRECT_H
+
+/* Default maximum number of SGEs in a RDMA send/recv */
+#define SMBDIRECT_MAX_SGE  16
+#endif
-- 
2.7.4



[Patch v6 21/22] CIFS: SMBD: Upper layer performs SMB read via RDMA write through memory registration

2017-11-04 Thread Long Li
From: Long Li 

If I/O size is larger than rdma_readwrite_threshold, use RDMA write for
SMB read by specifying channel SMB2_CHANNEL_RDMA_V1 or
SMB2_CHANNEL_RDMA_V1_INVALIDATE in the SMB packet, depending on SMB dialect
used. Append a smbd_buffer_descriptor_v1 to the end of the SMB packet and
fill in other values to indicate this SMB read uses RDMA write.

There is no need to read from the transport for incoming payload. At the
time SMB read response comes back, the data is already transferred and
placed in the pages by RDMA hardware.

When SMB read is finished, deregister the memory regions if RDMA write is
used for this SMB read. smbd_deregister_mr may need to do local
invalidation and sleep, if server remote invalidation is not used.

There are situations where the MID may not be created on I/O failure, under
which memory region is deregistered when read data context is released.

Signed-off-by: Long Li 
---
 fs/cifs/file.c    | 19 +++++++++++++++++--
 fs/cifs/smb2pdu.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0786f19..94479ef 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -42,7 +42,9 @@
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
 #include "fscache.h"
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#include "smbdirect.h"
+#endif
 
 static inline int cifs_convert_flags(unsigned int flags)
 {
@@ -2908,7 +2910,12 @@ cifs_readdata_release(struct kref *refcount)
 {
struct cifs_readdata *rdata = container_of(refcount,
struct cifs_readdata, refcount);
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (rdata->mr) {
+   smbd_deregister_mr(rdata->mr);
+   rdata->mr = NULL;
+   }
+#endif
if (rdata->cfile)
cifsFileInfo_put(rdata->cfile);
 
@@ -3037,6 +3044,10 @@ uncached_fill_pages(struct TCP_Server_Info *server,
}
if (iter)
result = copy_page_from_iter(page, 0, n, iter);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   else if (rdata->mr)
+   result = n;
+#endif
else
result = cifs_read_page_from_socket(server, page, n);
if (result < 0)
@@ -3606,6 +3617,10 @@ readpages_fill_pages(struct TCP_Server_Info *server,
 
if (iter)
result = copy_page_from_iter(page, 0, n, iter);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   else if (rdata->mr)
+   result = n;
+#endif
else
result = cifs_read_page_from_socket(server, page, n);
if (result < 0)
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 8ef4a2f..f07eb37 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2381,7 +2381,40 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
req->MinimumCount = 0;
req->Length = cpu_to_le32(io_parms->length);
req->Offset = cpu_to_le64(io_parms->offset);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   /*
+* If we want to do a RDMA write, fill in and append
+* smbd_buffer_descriptor_v1 to the end of read request
+*/
+   if (server->rdma && rdata &&
+   rdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) {
+
+   struct smbd_buffer_descriptor_v1 *v1;
+   bool need_invalidate =
+   io_parms->tcon->ses->server->dialect == SMB30_PROT_ID;
+
+   rdata->mr = smbd_register_mr(
+   server->smbd_conn, rdata->pages,
+   rdata->nr_pages, rdata->tailsz,
+   true, need_invalidate);
+   if (!rdata->mr)
+   return -ENOBUFS;
+
+   req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE;
+   if (need_invalidate)
+   req->Channel = SMB2_CHANNEL_RDMA_V1;
+   req->ReadChannelInfoOffset =
+   offsetof(struct smb2_read_plain_req, Buffer);
+   req->ReadChannelInfoLength =
+   sizeof(struct smbd_buffer_descriptor_v1);
+   v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0];
+   v1->offset = rdata->mr->mr->iova;
+   v1->token = rdata->mr->mr->rkey;
+   v1->length = rdata->mr->mr->length;
 
+   *total_len += sizeof(*v1) - 1;
+   }
+#endif
if (request_type & CHAINED_REQUEST) {
if (!(request_type & END_OF_CHAIN)) {
/* next 8-byte aligned request */
@@ -2460,7 +2493,17 @@ smb2_readv_callback(struct mid_q_entry *mid)
if (rdata->result != -ENODATA)
rdata->result = -EIO;
}
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   /*
+* If this rdata has a memory registered, the MR can be freed
+* MR needs to be freed as soon as 

[Patch v6 13/22] CIFS: SMBD: Set SMB Direct maximum read or write size for I/O

2017-11-04 Thread Long Li
From: Long Li 

When connecting over SMB Direct, the transport negotiates its maximum I/O
sizes with the server and determines how to choose to do RDMA send/recv vs
read/write. Expose these maximum I/O sizes to upper layer so we will get
the correct sized payloads.

Signed-off-by: Long Li 
---
 fs/cifs/smb2ops.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index fb2934b..25028da 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -32,6 +32,9 @@
 #include "smb2status.h"
 #include "smb2glob.h"
 #include "cifs_ioctl.h"
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#include "smbdirect.h"
+#endif
 
 static int
 change_conf(struct TCP_Server_Info *server)
@@ -250,7 +253,11 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct 
smb_vol *volume_info)
/* start with specified wsize, or default */
wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
wsize = min_t(unsigned int, wsize, server->max_write);
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (server->rdma)
+   wsize = min_t(unsigned int,
+   wsize, server->smbd_conn->max_readwrite_size);
+#endif
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
 
@@ -266,6 +273,11 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct 
smb_vol *volume_info)
/* start with specified rsize, or default */
rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
rsize = min_t(unsigned int, rsize, server->max_read);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (server->rdma)
+   rsize = min_t(unsigned int,
+   rsize, server->smbd_conn->max_readwrite_size);
+#endif
 
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
-- 
2.7.4



[Patch v6 13/22] CIFS: SMBD: Set SMB Direct maximum read or write size for I/O

2017-11-04 Thread Long Li
From: Long Li 

When connecting over SMB Direct, the transport negotiates its maximum I/O
sizes with the server and determines how to choose to do RDMA send/recv vs
read/write. Expose these maximum I/O sizes to upper layer so we will get
the correct sized payloads.

Signed-off-by: Long Li 
---
 fs/cifs/smb2ops.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index fb2934b..25028da 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -32,6 +32,9 @@
 #include "smb2status.h"
 #include "smb2glob.h"
 #include "cifs_ioctl.h"
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#include "smbdirect.h"
+#endif
 
 static int
 change_conf(struct TCP_Server_Info *server)
@@ -250,7 +253,11 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct 
smb_vol *volume_info)
/* start with specified wsize, or default */
wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
wsize = min_t(unsigned int, wsize, server->max_write);
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (server->rdma)
+   wsize = min_t(unsigned int,
+   wsize, server->smbd_conn->max_readwrite_size);
+#endif
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
 
@@ -266,6 +273,11 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct 
smb_vol *volume_info)
/* start with specified rsize, or default */
rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
rsize = min_t(unsigned int, rsize, server->max_read);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (server->rdma)
+   rsize = min_t(unsigned int,
+   rsize, server->smbd_conn->max_readwrite_size);
+#endif
 
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
-- 
2.7.4



[Patch v6 07/22] CIFS: SMBD: Implement function to create a SMB Direct connection

2017-11-04 Thread Long Li
From: Long Li 

The upper layer calls this function to connect to peer through SMB Direct.
Each SMB Direct connection is based on a RDMA RC Queue Pair.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 7af49cd..47d999f 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1652,3 +1652,20 @@ struct smbd_connection *_smbd_get_connection(
kfree(info);
return NULL;
 }
+
+struct smbd_connection *smbd_get_connection(
+   struct TCP_Server_Info *server, struct sockaddr *dstaddr)
+{
+   struct smbd_connection *ret;
+   int port = SMBD_PORT;
+
+try_again:
+   ret = _smbd_get_connection(server, dstaddr, port);
+
+   /* Try SMB_PORT if SMBD_PORT doesn't work */
+   if (!ret && port == SMBD_PORT) {
+   port = SMB_PORT;
+   goto try_again;
+   }
+   return ret;
+}
-- 
2.7.4



[Patch v6 09/22] CIFS: SMBD: Implement function to reconnect to a SMB Direct transport

2017-11-04 Thread Long Li
From: Long Li 

Add function to implement a reconnect to SMB Direct. This involves tearing
down the current connection and establishing/negotiating a new connection.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 47d999f..f3ae3dc 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1393,6 +1393,42 @@ static void idle_connection_timer(struct work_struct 
*work)
info->keep_alive_interval*HZ);
 }
 
+/*
+ * Reconnect this SMBD connection, called from upper layer
+ * return value: 0 on success, or actual error code
+ */
+int smbd_reconnect(struct TCP_Server_Info *server)
+{
+   log_rdma_event(INFO, "reconnecting rdma session\n");
+
+   if (!server->smbd_conn) {
+   log_rdma_event(ERR, "rdma session already destroyed\n");
+   return -EINVAL;
+   }
+
+   /*
+* This is possible if transport is disconnected and we haven't received
+* notification from RDMA, but upper layer has detected timeout
+*/
+   if (server->smbd_conn->transport_status == SMBD_CONNECTED) {
+   log_rdma_event(INFO, "disconnecting transport\n");
+   smbd_disconnect_rdma_connection(server->smbd_conn);
+   }
+
+   /* wait until the transport is destroyed */
+   wait_event(server->smbd_conn->wait_destroy,
+   server->smbd_conn->transport_status == SMBD_DESTROYED);
+
+   destroy_workqueue(server->smbd_conn->workqueue);
+   kfree(server->smbd_conn);
+
+   log_rdma_event(INFO, "creating rdma session\n");
+   server->smbd_conn = smbd_get_connection(
+   server, (struct sockaddr *) &server->dstaddr);
+
+   return server->smbd_conn ? 0 : -ENOENT;
+}
+
 static void destroy_caches_and_workqueue(struct smbd_connection *info)
 {
destroy_receive_buffers(info);
-- 
2.7.4



[Patch v6 07/22] CIFS: SMBD: Implement function to create a SMB Direct connection

2017-11-04 Thread Long Li
From: Long Li 

The upper layer calls this function to connect to peer through SMB Direct.
Each SMB Direct connection is based on a RDMA RC Queue Pair.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 7af49cd..47d999f 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1652,3 +1652,20 @@ struct smbd_connection *_smbd_get_connection(
kfree(info);
return NULL;
 }
+
+struct smbd_connection *smbd_get_connection(
+   struct TCP_Server_Info *server, struct sockaddr *dstaddr)
+{
+   struct smbd_connection *ret;
+   int port = SMBD_PORT;
+
+try_again:
+   ret = _smbd_get_connection(server, dstaddr, port);
+
+   /* Try SMB_PORT if SMBD_PORT doesn't work */
+   if (!ret && port == SMBD_PORT) {
+   port = SMB_PORT;
+   goto try_again;
+   }
+   return ret;
+}
-- 
2.7.4



[Patch v6 09/22] CIFS: SMBD: Implement function to reconnect to a SMB Direct transport

2017-11-04 Thread Long Li
From: Long Li 

Add function to implement a reconnect to SMB Direct. This involves tearing
down the current connection and establishing/negotiating a new connection.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 36 
 1 file changed, 36 insertions(+)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 47d999f..f3ae3dc 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1393,6 +1393,42 @@ static void idle_connection_timer(struct work_struct 
*work)
info->keep_alive_interval*HZ);
 }
 
+/*
+ * Reconnect this SMBD connection, called from upper layer
+ * return value: 0 on success, or actual error code
+ */
+int smbd_reconnect(struct TCP_Server_Info *server)
+{
+   log_rdma_event(INFO, "reconnecting rdma session\n");
+
+   if (!server->smbd_conn) {
+   log_rdma_event(ERR, "rdma session already destroyed\n");
+   return -EINVAL;
+   }
+
+   /*
+* This is possible if transport is disconnected and we haven't received
+* notification from RDMA, but upper layer has detected timeout
+*/
+   if (server->smbd_conn->transport_status == SMBD_CONNECTED) {
+   log_rdma_event(INFO, "disconnecting transport\n");
+   smbd_disconnect_rdma_connection(server->smbd_conn);
+   }
+
+   /* wait until the transport is destroyed */
+   wait_event(server->smbd_conn->wait_destroy,
+   server->smbd_conn->transport_status == SMBD_DESTROYED);
+
+   destroy_workqueue(server->smbd_conn->workqueue);
+   kfree(server->smbd_conn);
+
+   log_rdma_event(INFO, "creating rdma session\n");
+   server->smbd_conn = smbd_get_connection(
+   server, (struct sockaddr *) >dstaddr);
+
+   return server->smbd_conn ? 0 : -ENOENT;
+}
+
 static void destroy_caches_and_workqueue(struct smbd_connection *info)
 {
destroy_receive_buffers(info);
-- 
2.7.4



[Patch v6 12/22] CIFS: SMBD: Upper layer destroys SMB Direct session on shutdown or umount

2017-11-04 Thread Long Li
From: Long Li 

When upper layer wants to umount, make it call shutdown on transport when
SMB Direct is used.

Signed-off-by: Long Li 
---
 fs/cifs/connect.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 8ca3c13..23f10d1 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -707,7 +707,12 @@ static void clean_demultiplex_info(struct TCP_Server_Info 
*server)
wake_up_all(>request_q);
/* give those requests time to exit */
msleep(125);
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (server->smbd_conn) {
+   smbd_destroy(server->smbd_conn);
+   server->smbd_conn = NULL;
+   }
+#endif
if (server->ssocket) {
sock_release(server->ssocket);
server->ssocket = NULL;
-- 
2.7.4



[Patch v6 11/22] CIFS: SMBD: Implement function to destroy a SMB Direct connection

2017-11-04 Thread Long Li
From: Long Li 

Add function to tear down a SMB Direct connection. This is used by upper
layer to free all SMB Direct connection and transport resources.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index f3ae3dc..5952276 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1393,6 +1393,22 @@ static void idle_connection_timer(struct work_struct 
*work)
info->keep_alive_interval*HZ);
 }
 
+/* Destroy this SMBD connection, called from upper layer */
+void smbd_destroy(struct smbd_connection *info)
+{
+   log_rdma_event(INFO, "destroying rdma session\n");
+
+   /* Kick off the disconnection process */
+   smbd_disconnect_rdma_connection(info);
+
+   log_rdma_event(INFO, "wait for transport being destroyed\n");
+   wait_event(info->wait_destroy,
+   info->transport_status == SMBD_DESTROYED);
+
+   destroy_workqueue(info->workqueue);
+   kfree(info);
+}
+
 /*
  * Reconnect this SMBD connection, called from upper layer
  * return value: 0 on success, or actual error code
-- 
2.7.4



[Patch v6 12/22] CIFS: SMBD: Upper layer destroys SMB Direct session on shutdown or umount

2017-11-04 Thread Long Li
From: Long Li 

When upper layer wants to umount, make it call shutdown on transport when
SMB Direct is used.

Signed-off-by: Long Li 
---
 fs/cifs/connect.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 8ca3c13..23f10d1 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -707,7 +707,12 @@ static void clean_demultiplex_info(struct TCP_Server_Info 
*server)
wake_up_all(>request_q);
/* give those requests time to exit */
msleep(125);
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (server->smbd_conn) {
+   smbd_destroy(server->smbd_conn);
+   server->smbd_conn = NULL;
+   }
+#endif
if (server->ssocket) {
sock_release(server->ssocket);
server->ssocket = NULL;
-- 
2.7.4



[Patch v6 11/22] CIFS: SMBD: Implement function to destroy a SMB Direct connection

2017-11-04 Thread Long Li
From: Long Li 

Add function to tear down a SMB Direct connection. This is used by upper
layer to free all SMB Direct connection and transport resources.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index f3ae3dc..5952276 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1393,6 +1393,22 @@ static void idle_connection_timer(struct work_struct 
*work)
info->keep_alive_interval*HZ);
 }
 
+/* Destroy this SMBD connection, called from upper layer */
+void smbd_destroy(struct smbd_connection *info)
+{
+   log_rdma_event(INFO, "destroying rdma session\n");
+
+   /* Kick off the disconnection process */
+   smbd_disconnect_rdma_connection(info);
+
+   log_rdma_event(INFO, "wait for transport being destroyed\n");
+   wait_event(info->wait_destroy,
+   info->transport_status == SMBD_DESTROYED);
+
+   destroy_workqueue(info->workqueue);
+   kfree(info);
+}
+
 /*
  * Reconnect this SMBD connection, called from upper layer
  * return value: 0 on success, or actual error code
-- 
2.7.4



[Patch v6 22/22] CIFS: SMBD: Add SMB Direct debug counters

2017-11-04 Thread Long Li
From: Long Li 

For debugging and troubleshooting, export SMBDirect debug counters to
/proc/fs/cifs/DebugData.

Signed-off-by: Long Li 
---
 fs/cifs/cifs_debug.c | 66 
 1 file changed, 66 insertions(+)

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 7025d8d..cd65759 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -155,6 +155,72 @@ static int cifs_debug_data_proc_show(struct seq_file *m, 
void *v)
list_for_each(tmp1, _tcp_ses_list) {
server = list_entry(tmp1, struct TCP_Server_Info,
tcp_ses_list);
+
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (!server->rdma)
+   goto skip_rdma;
+
+   seq_printf(m, "\nSMBDirect (in hex) protocol version: %x "
+   "transport status: %x",
+   server->smbd_conn->protocol,
+   server->smbd_conn->transport_status);
+   seq_printf(m, "\nConn receive_credit_max: %x "
+   "send_credit_target: %x max_send_size: %x",
+   server->smbd_conn->receive_credit_max,
+   server->smbd_conn->send_credit_target,
+   server->smbd_conn->max_send_size);
+   seq_printf(m, "\nConn max_fragmented_recv_size: %x "
+   "max_fragmented_send_size: %x max_receive_size:%x",
+   server->smbd_conn->max_fragmented_recv_size,
+   server->smbd_conn->max_fragmented_send_size,
+   server->smbd_conn->max_receive_size);
+   seq_printf(m, "\nConn keep_alive_interval: %x "
+   "max_readwrite_size: %x rdma_readwrite_threshold: %x",
+   server->smbd_conn->keep_alive_interval,
+   server->smbd_conn->max_readwrite_size,
+   server->smbd_conn->rdma_readwrite_threshold);
+   seq_printf(m, "\nDebug count_get_receive_buffer: %x "
+   "count_put_receive_buffer: %x count_send_empty: %x",
+   server->smbd_conn->count_get_receive_buffer,
+   server->smbd_conn->count_put_receive_buffer,
+   server->smbd_conn->count_send_empty);
+   seq_printf(m, "\nRead Queue count_reassembly_queue: %x "
+   "count_enqueue_reassembly_queue: %x "
+   "count_dequeue_reassembly_queue: %x "
+   "fragment_reassembly_remaining: %x "
+   "reassembly_data_length: %x "
+   "reassembly_queue_length: %x",
+   server->smbd_conn->count_reassembly_queue,
+   server->smbd_conn->count_enqueue_reassembly_queue,
+   server->smbd_conn->count_dequeue_reassembly_queue,
+   server->smbd_conn->fragment_reassembly_remaining,
+   server->smbd_conn->reassembly_data_length,
+   server->smbd_conn->reassembly_queue_length);
+   seq_printf(m, "\nCurrent Credits send_credits: %x "
+   "receive_credits: %x receive_credit_target: %x",
+   atomic_read(>smbd_conn->send_credits),
+   atomic_read(>smbd_conn->receive_credits),
+   server->smbd_conn->receive_credit_target);
+   seq_printf(m, "\nPending send_pending: %x send_payload_pending:"
+   " %x smbd_send_pending: %x smbd_recv_pending: %x",
+   atomic_read(>smbd_conn->send_pending),
+   atomic_read(>smbd_conn->send_payload_pending),
+   server->smbd_conn->smbd_send_pending,
+   server->smbd_conn->smbd_recv_pending);
+   seq_printf(m, "\nReceive buffers count_receive_queue: %x "
+   "count_empty_packet_queue: %x",
+   server->smbd_conn->count_receive_queue,
+   server->smbd_conn->count_empty_packet_queue);
+   seq_printf(m, "\nMR responder_resources: %x "
+   "max_frmr_depth: %x mr_type: %x",
+   server->smbd_conn->responder_resources,
+   server->smbd_conn->max_frmr_depth,
+   server->smbd_conn->mr_type);
+   seq_printf(m, "\nMR mr_ready_count: %x mr_used_count: %x",
+   atomic_read(>smbd_conn->mr_ready_count),
+   atomic_read(>smbd_conn->mr_used_count));
+skip_rdma:
+#endif
seq_printf(m, "\nNumber of credits: %d", server->credits);
i++;
list_for_each(tmp2, >smb_ses_list) {
-- 
2.7.4



[Patch v6 00/22] CIFS: Implement SMB Direct protocol

2017-11-04 Thread Long Li
From: Long Li 

Starting with SMB2 dialect 3.0, Microsoft introduced the SMB Direct transport
protocol for transferring upper layer (SMB2) payload over RDMA via InfiniBand,
RoCE or iWARP. The protocol is published in [MS-SMBD]
(https://msdn.microsoft.com/en-us/library/hh536346.aspx).

Patch v2 added RDMA read/write via memory registration, and addressed
feedback on v1.

Patch v3 improved performance by introducing an additional queue for handling
empty packets and reducing lock contention on IRQ path. Also added
lightweight profiling by reading TSC and addressed feedback on v2.

Patch v4 fixed connectivity issues with iWARP devices and addressed comments.

Patch v5 fixed compile errors on ia64, i386 and when INFINIBAND is not
configured, and addressed comments. Profiling is removed and will be
introduced in a separate patch.

Patch v6 addressed comments.

Long Li (22):
  CIFS: SMBD: Add parameter rdata to smb2_new_read_req
  CIFS: SMBD: Introduce kernel config option CONFIG_CIFS_SMB_DIRECT
  CIFS: SMBD: Add rdma mount option
  CIFS: SMBD: Add SMB Direct protocol initial values and constants
  CIFS: SMBD: Establish SMB Direct connection
  CIFS: SMBD: export protocol initial values
  CIFS: SMBD: Implement function to create a SMB Direct connection
  CIFS: SMBD: Upper layer connects to SMBDirect session
  CIFS: SMBD: Implement function to reconnect to a SMB Direct transport
  CIFS: SMBD: Upper layer reconnects to SMB Direct session
  CIFS: SMBD: Implement function to destroy a SMB Direct connection
  CIFS: SMBD: Upper layer destroys SMB Direct session on shutdown or
umount
  CIFS: SMBD: Set SMB Direct maximum read or write size for I/O
  CIFS: SMBD: Implement function to receive data via RDMA receive
  CIFS: SMBD: Upper layer receives data via RDMA receive
  CIFS: SMBD: Implement function to send data via RDMA send
  CIFS: SMBD: Upper layer sends data via RDMA send
  CIFS: SMBD: Implement RDMA memory registration
  CIFS: SMBD: Upper layer performs SMB write via RDMA read through
memory registration
  CIFS: SMBD: Read correct returned data length for RDMA write (SMB
read) I/O
  CIFS: SMBD: Upper layer performs SMB read via RDMA write through
memory registration
  CIFS: SMBD: Add SMB Direct debug counters

 fs/cifs/Kconfig  |8 +
 fs/cifs/Makefile |2 +
 fs/cifs/cifs_debug.c |  147 +++
 fs/cifs/cifsfs.c |2 +
 fs/cifs/cifsglob.h   |   23 +-
 fs/cifs/cifssmb.c|   16 +-
 fs/cifs/connect.c|   64 +-
 fs/cifs/file.c   |   19 +-
 fs/cifs/smb1ops.c|4 +-
 fs/cifs/smb2ops.c|   26 +-
 fs/cifs/smb2pdu.c|  129 ++-
 fs/cifs/smbdirect.c  | 2618 ++
 fs/cifs/smbdirect.h  |  315 ++
 fs/cifs/transport.c  |   14 +-
 14 files changed, 3359 insertions(+), 28 deletions(-)
 create mode 100644 fs/cifs/smbdirect.c
 create mode 100644 fs/cifs/smbdirect.h

-- 
2.7.4



[Patch v6 01/22] CIFS: SMBD: Add parameter rdata to smb2_new_read_req

2017-11-04 Thread Long Li
From: Long Li 

This patch is for preparing upper layer for doing SMB read via RDMA write.

When the SMB read packet header is assembled, SMB Direct code needs to know
the I/O layout if this request is to use a RDMA write. rdata has all the
information of I/O layout for memory registration.

Add rdata to smb2_new_read_req.

Signed-off-by: Long Li 
---
 fs/cifs/smb2pdu.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index bab3da6..32ad590 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2350,18 +2350,21 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon 
*tcon, u64 persistent_fid,
  */
 static int
 smb2_new_read_req(void **buf, unsigned int *total_len,
- struct cifs_io_parms *io_parms, unsigned int remaining_bytes,
- int request_type)
+   struct cifs_io_parms *io_parms, struct cifs_readdata *rdata,
+   unsigned int remaining_bytes, int request_type)
 {
int rc = -EACCES;
struct smb2_read_plain_req *req = NULL;
struct smb2_sync_hdr *shdr;
+   struct TCP_Server_Info *server;
 
rc = smb2_plain_req_init(SMB2_READ, io_parms->tcon, (void **) ,
 total_len);
if (rc)
return rc;
-   if (io_parms->tcon->ses->server == NULL)
+
+   server = io_parms->tcon->ses->server;
+   if (server == NULL)
return -ECONNABORTED;
 
shdr = >sync_hdr;
@@ -2489,7 +2492,8 @@ smb2_async_readv(struct cifs_readdata *rdata)
 
server = io_parms.tcon->ses->server;
 
-   rc = smb2_new_read_req((void **) , _len, _parms, 0, 0);
+   rc = smb2_new_read_req(
+   (void **) , _len, _parms, rdata, 0, 0);
if (rc) {
if (rc == -EAGAIN && rdata->credits) {
/* credits was reset by reconnect */
@@ -2557,7 +2561,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms 
*io_parms,
struct cifs_ses *ses = io_parms->tcon->ses;
 
*nbytes = 0;
-   rc = smb2_new_read_req((void **), _len, io_parms, 0, 0);
+   rc = smb2_new_read_req((void **), _len, io_parms, NULL, 0, 0);
if (rc)
return rc;
 
-- 
2.7.4



[Patch v6 17/22] CIFS: SMBD: Upper layer sends data via RDMA send

2017-11-04 Thread Long Li
From: Long Li 

With SMB Direct connected, use it for sending data via RDMA send.

Signed-off-by: Long Li 
---
 fs/cifs/transport.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 7efbab0..1b51d08 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -37,6 +37,9 @@
 #include "cifsglob.h"
 #include "cifsproto.h"
 #include "cifs_debug.h"
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#include "smbdirect.h"
+#endif
 
 void
 cifs_wake_up_task(struct mid_q_entry *mid)
@@ -229,7 +232,12 @@ __smb_send_rqst(struct TCP_Server_Info *server, struct 
smb_rqst *rqst)
struct socket *ssocket = server->ssocket;
struct msghdr smb_msg;
int val = 1;
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (server->smbd_conn) {
+   rc = smbd_send(server->smbd_conn, rqst);
+   goto smbd_done;
+   }
+#endif
if (ssocket == NULL)
return -ENOTSOCK;
 
@@ -298,7 +306,9 @@ __smb_send_rqst(struct TCP_Server_Info *server, struct 
smb_rqst *rqst)
 */
server->tcpStatus = CifsNeedReconnect;
}
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+smbd_done:
+#endif
if (rc < 0 && rc != -EINTR)
cifs_dbg(VFS, "Error %d sending data on socket to server\n",
 rc);
-- 
2.7.4



[Patch v6 22/22] CIFS: SMBD: Add SMB Direct debug counters

2017-11-04 Thread Long Li
From: Long Li 

For debugging and troubleshooting, export SMBDirect debug counters to
/proc/fs/cifs/DebugData.

Signed-off-by: Long Li 
---
 fs/cifs/cifs_debug.c | 66 
 1 file changed, 66 insertions(+)

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 7025d8d..cd65759 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -155,6 +155,72 @@ static int cifs_debug_data_proc_show(struct seq_file *m, 
void *v)
list_for_each(tmp1, _tcp_ses_list) {
server = list_entry(tmp1, struct TCP_Server_Info,
tcp_ses_list);
+
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (!server->rdma)
+   goto skip_rdma;
+
+   seq_printf(m, "\nSMBDirect (in hex) protocol version: %x "
+   "transport status: %x",
+   server->smbd_conn->protocol,
+   server->smbd_conn->transport_status);
+   seq_printf(m, "\nConn receive_credit_max: %x "
+   "send_credit_target: %x max_send_size: %x",
+   server->smbd_conn->receive_credit_max,
+   server->smbd_conn->send_credit_target,
+   server->smbd_conn->max_send_size);
+   seq_printf(m, "\nConn max_fragmented_recv_size: %x "
+   "max_fragmented_send_size: %x max_receive_size:%x",
+   server->smbd_conn->max_fragmented_recv_size,
+   server->smbd_conn->max_fragmented_send_size,
+   server->smbd_conn->max_receive_size);
+   seq_printf(m, "\nConn keep_alive_interval: %x "
+   "max_readwrite_size: %x rdma_readwrite_threshold: %x",
+   server->smbd_conn->keep_alive_interval,
+   server->smbd_conn->max_readwrite_size,
+   server->smbd_conn->rdma_readwrite_threshold);
+   seq_printf(m, "\nDebug count_get_receive_buffer: %x "
+   "count_put_receive_buffer: %x count_send_empty: %x",
+   server->smbd_conn->count_get_receive_buffer,
+   server->smbd_conn->count_put_receive_buffer,
+   server->smbd_conn->count_send_empty);
+   seq_printf(m, "\nRead Queue count_reassembly_queue: %x "
+   "count_enqueue_reassembly_queue: %x "
+   "count_dequeue_reassembly_queue: %x "
+   "fragment_reassembly_remaining: %x "
+   "reassembly_data_length: %x "
+   "reassembly_queue_length: %x",
+   server->smbd_conn->count_reassembly_queue,
+   server->smbd_conn->count_enqueue_reassembly_queue,
+   server->smbd_conn->count_dequeue_reassembly_queue,
+   server->smbd_conn->fragment_reassembly_remaining,
+   server->smbd_conn->reassembly_data_length,
+   server->smbd_conn->reassembly_queue_length);
+   seq_printf(m, "\nCurrent Credits send_credits: %x "
+   "receive_credits: %x receive_credit_target: %x",
+   atomic_read(>smbd_conn->send_credits),
+   atomic_read(>smbd_conn->receive_credits),
+   server->smbd_conn->receive_credit_target);
+   seq_printf(m, "\nPending send_pending: %x send_payload_pending:"
+   " %x smbd_send_pending: %x smbd_recv_pending: %x",
+   atomic_read(>smbd_conn->send_pending),
+   atomic_read(>smbd_conn->send_payload_pending),
+   server->smbd_conn->smbd_send_pending,
+   server->smbd_conn->smbd_recv_pending);
+   seq_printf(m, "\nReceive buffers count_receive_queue: %x "
+   "count_empty_packet_queue: %x",
+   server->smbd_conn->count_receive_queue,
+   server->smbd_conn->count_empty_packet_queue);
+   seq_printf(m, "\nMR responder_resources: %x "
+   "max_frmr_depth: %x mr_type: %x",
+   server->smbd_conn->responder_resources,
+   server->smbd_conn->max_frmr_depth,
+   server->smbd_conn->mr_type);
+   seq_printf(m, "\nMR mr_ready_count: %x mr_used_count: %x",
+   atomic_read(>smbd_conn->mr_ready_count),
+   atomic_read(>smbd_conn->mr_used_count));
+skip_rdma:
+#endif
seq_printf(m, "\nNumber of credits: %d", server->credits);
i++;
list_for_each(tmp2, >smb_ses_list) {
-- 
2.7.4



[Patch v6 00/22] CIFS: Implement SMB Direct protocol

2017-11-04 Thread Long Li
From: Long Li 

Starting with SMB2 dialect 3.0, Microsoft introduced the SMB Direct transport
protocol for transferring upper layer (SMB2) payload over RDMA via InfiniBand,
RoCE or iWARP. The protocol is published in [MS-SMBD]
(https://msdn.microsoft.com/en-us/library/hh536346.aspx).

Patch v2 added RDMA read/write via memory registration, and addressed
feedback on v1.

Patch v3 improved performance by introducing an additional queue for handling
empty packets and reducing lock contention on IRQ path. Also added
lightweight profiling by reading TSC and addressed feedback on v2.

Patch v4 fixed connectivity issues with iWARP devices and addressed comments.

Patch v5 fixed compile errors on ia64, i386 and when INFINIBAND is not
configured, and addressed comments. Profiling is removed and will be
introduced in a separate patch.

Patch v6 addressed comments.

Long Li (22):
  CIFS: SMBD: Add parameter rdata to smb2_new_read_req
  CIFS: SMBD: Introduce kernel config option CONFIG_CIFS_SMB_DIRECT
  CIFS: SMBD: Add rdma mount option
  CIFS: SMBD: Add SMB Direct protocol initial values and constants
  CIFS: SMBD: Establish SMB Direct connection
  CIFS: SMBD: export protocol initial values
  CIFS: SMBD: Implement function to create a SMB Direct connection
  CIFS: SMBD: Upper layer connects to SMBDirect session
  CIFS: SMBD: Implement function to reconnect to a SMB Direct transport
  CIFS: SMBD: Upper layer reconnects to SMB Direct session
  CIFS: SMBD: Implement function to destroy a SMB Direct connection
  CIFS: SMBD: Upper layer destroys SMB Direct session on shutdown or
umount
  CIFS: SMBD: Set SMB Direct maximum read or write size for I/O
  CIFS: SMBD: Implement function to receive data via RDMA receive
  CIFS: SMBD: Upper layer receives data via RDMA receive
  CIFS: SMBD: Implement function to send data via RDMA send
  CIFS: SMBD: Upper layer sends data via RDMA send
  CIFS: SMBD: Implement RDMA memory registration
  CIFS: SMBD: Upper layer performs SMB write via RDMA read through
memory registration
  CIFS: SMBD: Read correct returned data length for RDMA write (SMB
read) I/O
  CIFS: SMBD: Upper layer performs SMB read via RDMA write through
memory registration
  CIFS: SMBD: Add SMB Direct debug counters

 fs/cifs/Kconfig  |8 +
 fs/cifs/Makefile |2 +
 fs/cifs/cifs_debug.c |  147 +++
 fs/cifs/cifsfs.c |2 +
 fs/cifs/cifsglob.h   |   23 +-
 fs/cifs/cifssmb.c|   16 +-
 fs/cifs/connect.c|   64 +-
 fs/cifs/file.c   |   19 +-
 fs/cifs/smb1ops.c|4 +-
 fs/cifs/smb2ops.c|   26 +-
 fs/cifs/smb2pdu.c|  129 ++-
 fs/cifs/smbdirect.c  | 2618 ++
 fs/cifs/smbdirect.h  |  315 ++
 fs/cifs/transport.c  |   14 +-
 14 files changed, 3359 insertions(+), 28 deletions(-)
 create mode 100644 fs/cifs/smbdirect.c
 create mode 100644 fs/cifs/smbdirect.h

-- 
2.7.4



[Patch v6 01/22] CIFS: SMBD: Add parameter rdata to smb2_new_read_req

2017-11-04 Thread Long Li
From: Long Li 

This patch is for preparing upper layer for doing SMB read via RDMA write.

When the SMB read packet header is assembled, SMB Direct code needs to know
the I/O layout if this request is to use a RDMA write. rdata has all the
information of I/O layout for memory registration.

Add rdata to smb2_new_read_req.

Signed-off-by: Long Li 
---
 fs/cifs/smb2pdu.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index bab3da6..32ad590 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2350,18 +2350,21 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon 
*tcon, u64 persistent_fid,
  */
 static int
 smb2_new_read_req(void **buf, unsigned int *total_len,
- struct cifs_io_parms *io_parms, unsigned int remaining_bytes,
- int request_type)
+   struct cifs_io_parms *io_parms, struct cifs_readdata *rdata,
+   unsigned int remaining_bytes, int request_type)
 {
int rc = -EACCES;
struct smb2_read_plain_req *req = NULL;
struct smb2_sync_hdr *shdr;
+   struct TCP_Server_Info *server;
 
rc = smb2_plain_req_init(SMB2_READ, io_parms->tcon, (void **) ,
 total_len);
if (rc)
return rc;
-   if (io_parms->tcon->ses->server == NULL)
+
+   server = io_parms->tcon->ses->server;
+   if (server == NULL)
return -ECONNABORTED;
 
shdr = >sync_hdr;
@@ -2489,7 +2492,8 @@ smb2_async_readv(struct cifs_readdata *rdata)
 
server = io_parms.tcon->ses->server;
 
-   rc = smb2_new_read_req((void **) , _len, _parms, 0, 0);
+   rc = smb2_new_read_req(
+   (void **) , _len, _parms, rdata, 0, 0);
if (rc) {
if (rc == -EAGAIN && rdata->credits) {
/* credits was reset by reconnect */
@@ -2557,7 +2561,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms 
*io_parms,
struct cifs_ses *ses = io_parms->tcon->ses;
 
*nbytes = 0;
-   rc = smb2_new_read_req((void **), _len, io_parms, 0, 0);
+   rc = smb2_new_read_req((void **), _len, io_parms, NULL, 0, 0);
if (rc)
return rc;
 
-- 
2.7.4



[Patch v6 17/22] CIFS: SMBD: Upper layer sends data via RDMA send

2017-11-04 Thread Long Li
From: Long Li 

With SMB Direct connected, use it for sending data via RDMA send.

Signed-off-by: Long Li 
---
 fs/cifs/transport.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 7efbab0..1b51d08 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -37,6 +37,9 @@
 #include "cifsglob.h"
 #include "cifsproto.h"
 #include "cifs_debug.h"
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#include "smbdirect.h"
+#endif
 
 void
 cifs_wake_up_task(struct mid_q_entry *mid)
@@ -229,7 +232,12 @@ __smb_send_rqst(struct TCP_Server_Info *server, struct 
smb_rqst *rqst)
struct socket *ssocket = server->ssocket;
struct msghdr smb_msg;
int val = 1;
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (server->smbd_conn) {
+   rc = smbd_send(server->smbd_conn, rqst);
+   goto smbd_done;
+   }
+#endif
if (ssocket == NULL)
return -ENOTSOCK;
 
@@ -298,7 +306,9 @@ __smb_send_rqst(struct TCP_Server_Info *server, struct 
smb_rqst *rqst)
 */
server->tcpStatus = CifsNeedReconnect;
}
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+smbd_done:
+#endif
if (rc < 0 && rc != -EINTR)
cifs_dbg(VFS, "Error %d sending data on socket to server\n",
 rc);
-- 
2.7.4



[Patch v6 08/22] CIFS: SMBD: Upper layer connects to SMBDirect session

2017-11-04 Thread Long Li
From: Long Li 

When "rdma" is specified in the mount option, make CIFS connect to
SMB Direct.

Signed-off-by: Long Li 
---
 fs/cifs/connect.c | 27 ---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index b5a575f..2c0b34a 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -44,7 +44,6 @@
 #include 
 #include 
 #include 
-
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
@@ -56,6 +55,9 @@
 #include "rfc1002pdu.h"
 #include "fscache.h"
 #include "smb2proto.h"
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#include "smbdirect.h"
+#endif
 
 #define CIFS_PORT 445
 #define RFC1001_PORT 139
@@ -2279,13 +2281,32 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
tcp_ses->echo_interval = volume_info->echo_interval * HZ;
else
tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ;
-
+   if (tcp_ses->rdma) {
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   tcp_ses->smbd_conn = smbd_get_connection(
+   tcp_ses, (struct sockaddr *)_info->dstaddr);
+   if (tcp_ses->smbd_conn) {
+   cifs_dbg(VFS, "RDMA transport established\n");
+   rc = 0;
+   goto smbd_connected;
+   } else {
+   rc = -ENOENT;
+   goto out_err_crypto_release;
+   }
+#else
+   cifs_dbg(VFS, "CONFIG_CIFS_SMB_DIRECT is not enabled\n");
+   rc = -ENOENT;
+   goto out_err_crypto_release;
+#endif
+   }
rc = ip_connect(tcp_ses);
if (rc < 0) {
cifs_dbg(VFS, "Error connecting to socket. Aborting 
operation.\n");
goto out_err_crypto_release;
}
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+smbd_connected:
+#endif
/*
 * since we're in a cifs function already, we know that
 * this will succeed. No need for try_module_get().
-- 
2.7.4



[Patch v6 08/22] CIFS: SMBD: Upper layer connects to SMBDirect session

2017-11-04 Thread Long Li
From: Long Li 

When "rdma" is specified in the mount option, make CIFS connect to
SMB Direct.

Signed-off-by: Long Li 
---
 fs/cifs/connect.c | 27 ---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index b5a575f..2c0b34a 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -44,7 +44,6 @@
 #include 
 #include 
 #include 
-
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
@@ -56,6 +55,9 @@
 #include "rfc1002pdu.h"
 #include "fscache.h"
 #include "smb2proto.h"
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#include "smbdirect.h"
+#endif
 
 #define CIFS_PORT 445
 #define RFC1001_PORT 139
@@ -2279,13 +2281,32 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
tcp_ses->echo_interval = volume_info->echo_interval * HZ;
else
tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ;
-
+   if (tcp_ses->rdma) {
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   tcp_ses->smbd_conn = smbd_get_connection(
+   tcp_ses, (struct sockaddr *)_info->dstaddr);
+   if (tcp_ses->smbd_conn) {
+   cifs_dbg(VFS, "RDMA transport established\n");
+   rc = 0;
+   goto smbd_connected;
+   } else {
+   rc = -ENOENT;
+   goto out_err_crypto_release;
+   }
+#else
+   cifs_dbg(VFS, "CONFIG_CIFS_SMB_DIRECT is not enabled\n");
+   rc = -ENOENT;
+   goto out_err_crypto_release;
+#endif
+   }
rc = ip_connect(tcp_ses);
if (rc < 0) {
cifs_dbg(VFS, "Error connecting to socket. Aborting 
operation.\n");
goto out_err_crypto_release;
}
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+smbd_connected:
+#endif
/*
 * since we're in a cifs function already, we know that
 * this will succeed. No need for try_module_get().
-- 
2.7.4



[Patch v6 15/22] CIFS: SMBD: Upper layer receives data via RDMA receive

2017-11-04 Thread Long Li
From: Long Li 

With SMB Direct connected, use it for receiving data via RDMA receive.

Signed-off-by: Long Li 
---
 fs/cifs/connect.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 23f10d1..6325062 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -545,8 +545,14 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, 
struct msghdr *smb_msg)
 
if (server_unresponsive(server))
return -ECONNABORTED;
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (server->smbd_conn)
+   length = smbd_recv(server->smbd_conn, smb_msg);
+   else
+   length = sock_recvmsg(server->ssocket, smb_msg, 0);
+#else
length = sock_recvmsg(server->ssocket, smb_msg, 0);
+#endif
 
if (server->tcpStatus == CifsExiting)
return -ESHUTDOWN;
-- 
2.7.4



[Patch v6 15/22] CIFS: SMBD: Upper layer receives data via RDMA receive

2017-11-04 Thread Long Li
From: Long Li 

With SMB Direct connected, use it for receiving data via RDMA receive.

Signed-off-by: Long Li 
---
 fs/cifs/connect.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 23f10d1..6325062 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -545,8 +545,14 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, 
struct msghdr *smb_msg)
 
if (server_unresponsive(server))
return -ECONNABORTED;
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (server->smbd_conn)
+   length = smbd_recv(server->smbd_conn, smb_msg);
+   else
+   length = sock_recvmsg(server->ssocket, smb_msg, 0);
+#else
length = sock_recvmsg(server->ssocket, smb_msg, 0);
+#endif
 
if (server->tcpStatus == CifsExiting)
return -ESHUTDOWN;
-- 
2.7.4



[Patch v6 16/22] CIFS: SMBD: Implement function to send data via RDMA send

2017-11-04 Thread Long Li
From: Long Li 

The transport doesn't maintain send buffers or send queue for transferring
payload via RDMA send. There is no data copy in the transport on send.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 246 
 fs/cifs/smbdirect.h |   3 +
 2 files changed, 249 insertions(+)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 1e7f5df..6089ae7 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -42,6 +42,12 @@ static int smbd_post_recv(
struct smbd_response *response);
 
 static int smbd_post_send_empty(struct smbd_connection *info);
+static int smbd_post_send_data(
+   struct smbd_connection *info,
+   struct kvec *iov, int n_vec, int remaining_data_length);
+static int smbd_post_send_page(struct smbd_connection *info,
+   struct page *page, unsigned long offset,
+   size_t size, int remaining_data_length);
 
 /* SMBD version number */
 #define SMBD_V10x0100
@@ -178,6 +184,10 @@ static void smbd_destroy_rdma_work(struct work_struct 
*work)
log_rdma_event(INFO, "cancelling send immediate work\n");
cancel_delayed_work_sync(>send_immediate_work);
 
+   log_rdma_event(INFO, "wait for all send to finish\n");
+   wait_event(info->wait_smbd_send_pending,
+   info->smbd_send_pending == 0);
+
log_rdma_event(INFO, "wait for all recv to finish\n");
wake_up_interruptible(>wait_reassembly_queue);
wait_event(info->wait_smbd_recv_pending,
@@ -1081,6 +1091,24 @@ static int smbd_post_send_sgl(struct smbd_connection 
*info,
 }
 
 /*
+ * Send a page
+ * page: the page to send
+ * offset: offset in the page to send
+ * size: length in the page to send
+ * remaining_data_length: remaining data to send in this payload
+ */
+static int smbd_post_send_page(struct smbd_connection *info, struct page *page,
+   unsigned long offset, size_t size, int remaining_data_length)
+{
+   struct scatterlist sgl;
+
+   sg_init_table(&sgl, 1);
+   sg_set_page(&sgl, page, size, offset);
+
+   return smbd_post_send_sgl(info, &sgl, size, remaining_data_length);
+}
+
+/*
  * Send an empty message
  * Empty message is used to extend credits to peer to for keep live
  * while there is no upper layer payload to send at the time
@@ -1092,6 +1120,35 @@ static int smbd_post_send_empty(struct smbd_connection 
*info)
 }
 
 /*
+ * Send a data buffer
+ * iov: the iov array describing the data buffers
+ * n_vec: number of entries in the iov array
+ * remaining_data_length: remaining data to send following this packet
+ * in segmented SMBD packet
+ */
+static int smbd_post_send_data(
+   struct smbd_connection *info, struct kvec *iov, int n_vec,
+   int remaining_data_length)
+{
+   int i;
+   u32 data_length = 0;
+   struct scatterlist sgl[SMBDIRECT_MAX_SGE];
+
+   if (n_vec > SMBDIRECT_MAX_SGE) {
+   cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec);
+   return -ENOMEM;
+   }
+
+   sg_init_table(sgl, n_vec);
+   for (i = 0; i < n_vec; i++) {
+   data_length += iov[i].iov_len;
+   sg_set_buf(&sgl[i], iov[i].iov_base, iov[i].iov_len);
+   }
+
+   return smbd_post_send_sgl(info, sgl, data_length, 
remaining_data_length);
+}
+
+/*
  * Post a receive request to the transport
  * The remote peer can only send data when a receive request is posted
  * The interaction is controlled by send/receive credit system
@@ -1658,6 +1715,9 @@ struct smbd_connection *_smbd_get_connection(
queue_delayed_work(info->workqueue, >idle_timer_work,
info->keep_alive_interval*HZ);
 
+   init_waitqueue_head(&info->wait_smbd_send_pending);
+   info->smbd_send_pending = 0;
+
init_waitqueue_head(>wait_smbd_recv_pending);
info->smbd_recv_pending = 0;
 
@@ -1949,3 +2009,189 @@ int smbd_recv(struct smbd_connection *info, struct 
msghdr *msg)
msg->msg_iter.count = 0;
return rc;
 }
+
+/*
+ * Send data to transport
+ * Each rqst is transported as a SMBDirect payload
+ * rqst: the data to write
+ * return value: 0 if the write succeeds, otherwise an error code
+ */
+int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst)
+{
+   struct kvec vec;
+   int nvecs;
+   int size;
+   int buflen = 0, remaining_data_length;
+   int start, i, j;
+   int max_iov_size =
+   info->max_send_size - sizeof(struct smbd_data_transfer);
+   struct kvec iov[SMBDIRECT_MAX_SGE];
+   int rc;
+
+   info->smbd_send_pending++;
+   if (info->transport_status != SMBD_CONNECTED) {
+   rc = -ENODEV;
+   goto done;
+   }
+
+   /*
+* This usually means a configuration error
+* We use RDMA read/write for packet size > rdma_readwrite_threshold
+* as long as it's properly configured we should never get 

[Patch v6 16/22] CIFS: SMBD: Implement function to send data via RDMA send

2017-11-04 Thread Long Li
From: Long Li 

The transport doesn't maintain send buffers or send queue for transferring
payload via RDMA send. There is no data copy in the transport on send.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 246 
 fs/cifs/smbdirect.h |   3 +
 2 files changed, 249 insertions(+)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 1e7f5df..6089ae7 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -42,6 +42,12 @@ static int smbd_post_recv(
struct smbd_response *response);
 
 static int smbd_post_send_empty(struct smbd_connection *info);
+static int smbd_post_send_data(
+   struct smbd_connection *info,
+   struct kvec *iov, int n_vec, int remaining_data_length);
+static int smbd_post_send_page(struct smbd_connection *info,
+   struct page *page, unsigned long offset,
+   size_t size, int remaining_data_length);
 
 /* SMBD version number */
 #define SMBD_V10x0100
@@ -178,6 +184,10 @@ static void smbd_destroy_rdma_work(struct work_struct 
*work)
log_rdma_event(INFO, "cancelling send immediate work\n");
cancel_delayed_work_sync(>send_immediate_work);
 
+   log_rdma_event(INFO, "wait for all send to finish\n");
+   wait_event(info->wait_smbd_send_pending,
+   info->smbd_send_pending == 0);
+
log_rdma_event(INFO, "wait for all recv to finish\n");
wake_up_interruptible(>wait_reassembly_queue);
wait_event(info->wait_smbd_recv_pending,
@@ -1081,6 +1091,24 @@ static int smbd_post_send_sgl(struct smbd_connection 
*info,
 }
 
 /*
+ * Send a page
+ * page: the page to send
+ * offset: offset in the page to send
+ * size: length in the page to send
+ * remaining_data_length: remaining data to send in this payload
+ */
+static int smbd_post_send_page(struct smbd_connection *info, struct page *page,
+   unsigned long offset, size_t size, int remaining_data_length)
+{
+   struct scatterlist sgl;
+
+   sg_init_table(&sgl, 1);
+   sg_set_page(&sgl, page, size, offset);
+
+   return smbd_post_send_sgl(info, &sgl, size, remaining_data_length);
+}
+
+/*
  * Send an empty message
  * Empty message is used to extend credits to peer to for keep live
  * while there is no upper layer payload to send at the time
@@ -1092,6 +1120,35 @@ static int smbd_post_send_empty(struct smbd_connection 
*info)
 }
 
 /*
+ * Send a data buffer
+ * iov: the iov array describing the data buffers
+ * n_vec: number of entries in the iov array
+ * remaining_data_length: remaining data to send following this packet
+ * in segmented SMBD packet
+ */
+static int smbd_post_send_data(
+   struct smbd_connection *info, struct kvec *iov, int n_vec,
+   int remaining_data_length)
+{
+   int i;
+   u32 data_length = 0;
+   struct scatterlist sgl[SMBDIRECT_MAX_SGE];
+
+   if (n_vec > SMBDIRECT_MAX_SGE) {
+   cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec);
+   return -ENOMEM;
+   }
+
+   sg_init_table(sgl, n_vec);
+   for (i = 0; i < n_vec; i++) {
+   data_length += iov[i].iov_len;
+   sg_set_buf(&sgl[i], iov[i].iov_base, iov[i].iov_len);
+   }
+
+   return smbd_post_send_sgl(info, sgl, data_length, 
remaining_data_length);
+}
+
+/*
  * Post a receive request to the transport
  * The remote peer can only send data when a receive request is posted
  * The interaction is controlled by send/receive credit system
@@ -1658,6 +1715,9 @@ struct smbd_connection *_smbd_get_connection(
queue_delayed_work(info->workqueue, >idle_timer_work,
info->keep_alive_interval*HZ);
 
+   init_waitqueue_head(&info->wait_smbd_send_pending);
+   info->smbd_send_pending = 0;
+
init_waitqueue_head(>wait_smbd_recv_pending);
info->smbd_recv_pending = 0;
 
@@ -1949,3 +2009,189 @@ int smbd_recv(struct smbd_connection *info, struct 
msghdr *msg)
msg->msg_iter.count = 0;
return rc;
 }
+
+/*
+ * Send data to transport
+ * Each rqst is transported as a SMBDirect payload
+ * rqst: the data to write
+ * return value: 0 if the write succeeds, otherwise an error code
+ */
+int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst)
+{
+   struct kvec vec;
+   int nvecs;
+   int size;
+   int buflen = 0, remaining_data_length;
+   int start, i, j;
+   int max_iov_size =
+   info->max_send_size - sizeof(struct smbd_data_transfer);
+   struct kvec iov[SMBDIRECT_MAX_SGE];
+   int rc;
+
+   info->smbd_send_pending++;
+   if (info->transport_status != SMBD_CONNECTED) {
+   rc = -ENODEV;
+   goto done;
+   }
+
+   /*
+* This usually means a configuration error
+* We use RDMA read/write for packet size > rdma_readwrite_threshold
+* as long as it's properly configured we should never get into this
+* situation
+*/
+   

[Patch v6 18/22] CIFS: SMBD: Implement RDMA memory registration

2017-11-04 Thread Long Li
From: Long Li 

Memory registration is used for transferring payload via RDMA read or
write.

After I/O is done, memory registrations are recovered and reused. This
process can be time consuming and is done in a work queue at the same time
or after I/O is returned to upper layer.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 421 
 fs/cifs/smbdirect.h |  23 +++
 2 files changed, 444 insertions(+)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 6089ae7..8441a5e 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -49,6 +49,9 @@ static int smbd_post_send_page(struct smbd_connection *info,
struct page *page, unsigned long offset,
size_t size, int remaining_data_length);
 
+static void destroy_mr_list(struct smbd_connection *info);
+static int allocate_mr_list(struct smbd_connection *info);
+
 /* SMBD version number */
 #define SMBD_V10x0100
 
@@ -199,6 +202,12 @@ static void smbd_destroy_rdma_work(struct work_struct 
*work)
wait_event(info->wait_send_payload_pending,
atomic_read(>send_payload_pending) == 0);
 
+   log_rdma_event(INFO, "freeing mr list\n");
+   wake_up_interruptible_all(&info->wait_mr);
+   wait_event(info->wait_for_mr_cleanup,
+   atomic_read(&info->mr_used_count) == 0);
+   destroy_mr_list(info);
+
/* It's not posssible for upper layer to get to reassembly */
log_rdma_event(INFO, "drain the reassembly queue\n");
do {
@@ -456,6 +465,16 @@ static bool process_negotiation_response(
}
info->max_fragmented_send_size =
le32_to_cpu(packet->max_fragmented_size);
+   info->rdma_readwrite_threshold =
+   rdma_readwrite_threshold > info->max_fragmented_send_size ?
+   info->max_fragmented_send_size :
+   rdma_readwrite_threshold;
+
+
+   info->max_readwrite_size = min_t(u32,
+   le32_to_cpu(packet->max_readwrite_size),
+   info->max_frmr_depth * PAGE_SIZE);
+   info->max_frmr_depth = info->max_readwrite_size / PAGE_SIZE;
 
return true;
 }
@@ -751,6 +770,12 @@ static int smbd_ia_open(
rc = -EPROTONOSUPPORT;
goto out2;
}
+   info->max_frmr_depth = min_t(int,
+   smbd_max_frmr_depth,
+   info->id->device->attrs.max_fast_reg_page_list_len);
+   info->mr_type = IB_MR_TYPE_MEM_REG;
+   if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)
+   info->mr_type = IB_MR_TYPE_SG_GAPS;
 
info->pd = ib_alloc_pd(info->id->device, 0);
if (IS_ERR(info->pd)) {
@@ -1588,6 +1613,8 @@ struct smbd_connection *_smbd_get_connection(
struct rdma_conn_param conn_param;
struct ib_qp_init_attr qp_attr;
struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
+   struct ib_port_immutable port_immutable;
+   u32 ird_ord_hdr[2];
 
info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
if (!info)
@@ -1676,6 +1703,28 @@ struct smbd_connection *_smbd_get_connection(
memset(_param, 0, sizeof(conn_param));
conn_param.initiator_depth = 0;
 
+   conn_param.responder_resources =
+   info->id->device->attrs.max_qp_rd_atom
+   < SMBD_CM_RESPONDER_RESOURCES ?
+   info->id->device->attrs.max_qp_rd_atom :
+   SMBD_CM_RESPONDER_RESOURCES;
+   info->responder_resources = conn_param.responder_resources;
+   log_rdma_mr(INFO, "responder_resources=%d\n",
+   info->responder_resources);
+
+   /* Need to send IRD/ORD in private data for iWARP */
+   info->id->device->get_port_immutable(
+   info->id->device, info->id->port_num, &port_immutable);
+   if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
+   ird_ord_hdr[0] = info->responder_resources;
+   ird_ord_hdr[1] = 1;
+   conn_param.private_data = ird_ord_hdr;
+   conn_param.private_data_len = sizeof(ird_ord_hdr);
+   } else {
+   conn_param.private_data = NULL;
+   conn_param.private_data_len = 0;
+   }
+
conn_param.retry_count = SMBD_CM_RETRY;
conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY;
conn_param.flow_control = 0;
@@ -1740,8 +1789,19 @@ struct smbd_connection *_smbd_get_connection(
goto negotiation_failed;
}
 
+   rc = allocate_mr_list(info);
+   if (rc) {
+   log_rdma_mr(ERR, "memory registration allocation failed\n");
+   goto allocate_mr_failed;
+   }
+
return info;
 
+allocate_mr_failed:
+   /* At this point, need to do a full transport shutdown */
+   smbd_destroy(info);
+   return NULL;
+
 negotiation_failed:
cancel_delayed_work_sync(>idle_timer_work);
 

[Patch v6 05/22] CIFS: SMBD: Establish SMB Direct connection

2017-11-04 Thread Long Li
From: Long Li 

Add code to implement the core functions to establish a SMB Direct
connection.

1. Establish an RDMA connection to SMB server.
2. Negotiate and setup SMB Direct protocol.
3. Implement idle connection timer and credit management.

SMB Direct is enabled by setting CONFIG_CIFS_SMB_DIRECT=y.

Add to Makefile to enable building SMB Direct.

Signed-off-by: Long Li 
---
 fs/cifs/Makefile|2 +
 fs/cifs/smbdirect.c | 1577 +++
 fs/cifs/smbdirect.h |  265 +
 3 files changed, 1844 insertions(+)

diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 5e853a3..ad00873 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -18,3 +18,5 @@ cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
 cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
 
 cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
+
+cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index d3c16f8..7af49cd 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -13,7 +13,34 @@
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  */
+#include 
 #include "smbdirect.h"
+#include "cifs_debug.h"
+
+static struct smbd_response *get_empty_queue_buffer(
+   struct smbd_connection *info);
+static struct smbd_response *get_receive_buffer(
+   struct smbd_connection *info);
+static void put_receive_buffer(
+   struct smbd_connection *info,
+   struct smbd_response *response,
+   bool lock);
+static int allocate_receive_buffers(struct smbd_connection *info, int num_buf);
+static void destroy_receive_buffers(struct smbd_connection *info);
+
+static void put_empty_packet(
+   struct smbd_connection *info, struct smbd_response *response);
+static void enqueue_reassembly(
+   struct smbd_connection *info,
+   struct smbd_response *response, int data_length);
+static struct smbd_response *_get_first_reassembly(
+   struct smbd_connection *info);
+
+static int smbd_post_recv(
+   struct smbd_connection *info,
+   struct smbd_response *response);
+
+static int smbd_post_send_empty(struct smbd_connection *info);
 
 /* SMBD version number */
 #define SMBD_V10x0100
@@ -75,3 +102,1553 @@ int smbd_max_frmr_depth = 2048;
 
 /* If payload is less than this byte, use RDMA send/recv not read/write */
 int rdma_readwrite_threshold = 4096;
+
+/* Transport logging functions
+ * Logging is divided into classes. The class bits can be OR'ed together to
+ * select what is logged via the module parameter smbd_logging_class,
+ * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
+ * log_rdma_event() messages
+ */
+#define LOG_OUTGOING   0x1
+#define LOG_INCOMING   0x2
+#define LOG_READ   0x4
+#define LOG_WRITE  0x8
+#define LOG_RDMA_SEND  0x10
+#define LOG_RDMA_RECV  0x20
+#define LOG_KEEP_ALIVE 0x40
+#define LOG_RDMA_EVENT 0x80
+#define LOG_RDMA_MR0x100
+static unsigned int smbd_logging_class;
+module_param(smbd_logging_class, uint, 0644);
+MODULE_PARM_DESC(smbd_logging_class,
+   "Logging class for SMBD transport 0x0 to 0x100");
+
+#define ERR0x0
+#define INFO   0x1
+static unsigned int smbd_logging_level;
+module_param(smbd_logging_level, uint, 0644);
+MODULE_PARM_DESC(smbd_logging_level,
+   "Logging level for SMBD transport, 0 (default): error, 1: info");
+
+#define log_rdma(level, class, fmt, args...)   \
+do {   \
+   if (level <= smbd_logging_level || class & smbd_logging_class)  \
+   cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
+} while (0)
+
+#define log_outgoing(level, fmt, args...) \
+   log_rdma(level, LOG_OUTGOING, fmt, ##args)
+#define log_incoming(level, fmt, args...) \
+   log_rdma(level, LOG_INCOMING, fmt, ##args)
+#define log_read(level, fmt, args...)  log_rdma(level, LOG_READ, fmt, ##args)
+#define log_write(level, fmt, args...) log_rdma(level, LOG_WRITE, fmt, ##args)
+#define log_rdma_send(level, fmt, args...) \
+   log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
+#define log_rdma_recv(level, fmt, args...) \
+   log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
+#define log_keep_alive(level, fmt, args...) \
+   log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
+#define log_rdma_event(level, fmt, args...) \
+   log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
+#define log_rdma_mr(level, fmt, args...) \
+   log_rdma(level, LOG_RDMA_MR, fmt, ##args)
+
+/*
+ * Destroy the transport and related RDMA and memory resources
+ * Need to go through 

[Patch v6 18/22] CIFS: SMBD: Implement RDMA memory registration

2017-11-04 Thread Long Li
From: Long Li 

Memory registration is used for transferring payload via RDMA read or
write.

After I/O is done, memory registrations are recovered and reused. This
process can be time consuming and is done in a work queue at the same time
or after I/O is returned to upper layer.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 421 
 fs/cifs/smbdirect.h |  23 +++
 2 files changed, 444 insertions(+)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 6089ae7..8441a5e 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -49,6 +49,9 @@ static int smbd_post_send_page(struct smbd_connection *info,
struct page *page, unsigned long offset,
size_t size, int remaining_data_length);
 
+static void destroy_mr_list(struct smbd_connection *info);
+static int allocate_mr_list(struct smbd_connection *info);
+
 /* SMBD version number */
 #define SMBD_V10x0100
 
@@ -199,6 +202,12 @@ static void smbd_destroy_rdma_work(struct work_struct 
*work)
wait_event(info->wait_send_payload_pending,
atomic_read(>send_payload_pending) == 0);
 
+   log_rdma_event(INFO, "freeing mr list\n");
+   wake_up_interruptible_all(&info->wait_mr);
+   wait_event(info->wait_for_mr_cleanup,
+   atomic_read(&info->mr_used_count) == 0);
+   destroy_mr_list(info);
+
/* It's not posssible for upper layer to get to reassembly */
log_rdma_event(INFO, "drain the reassembly queue\n");
do {
@@ -456,6 +465,16 @@ static bool process_negotiation_response(
}
info->max_fragmented_send_size =
le32_to_cpu(packet->max_fragmented_size);
+   info->rdma_readwrite_threshold =
+   rdma_readwrite_threshold > info->max_fragmented_send_size ?
+   info->max_fragmented_send_size :
+   rdma_readwrite_threshold;
+
+
+   info->max_readwrite_size = min_t(u32,
+   le32_to_cpu(packet->max_readwrite_size),
+   info->max_frmr_depth * PAGE_SIZE);
+   info->max_frmr_depth = info->max_readwrite_size / PAGE_SIZE;
 
return true;
 }
@@ -751,6 +770,12 @@ static int smbd_ia_open(
rc = -EPROTONOSUPPORT;
goto out2;
}
+   info->max_frmr_depth = min_t(int,
+   smbd_max_frmr_depth,
+   info->id->device->attrs.max_fast_reg_page_list_len);
+   info->mr_type = IB_MR_TYPE_MEM_REG;
+   if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)
+   info->mr_type = IB_MR_TYPE_SG_GAPS;
 
info->pd = ib_alloc_pd(info->id->device, 0);
if (IS_ERR(info->pd)) {
@@ -1588,6 +1613,8 @@ struct smbd_connection *_smbd_get_connection(
struct rdma_conn_param conn_param;
struct ib_qp_init_attr qp_attr;
struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
+   struct ib_port_immutable port_immutable;
+   u32 ird_ord_hdr[2];
 
info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
if (!info)
@@ -1676,6 +1703,28 @@ struct smbd_connection *_smbd_get_connection(
memset(_param, 0, sizeof(conn_param));
conn_param.initiator_depth = 0;
 
+   conn_param.responder_resources =
+   info->id->device->attrs.max_qp_rd_atom
+   < SMBD_CM_RESPONDER_RESOURCES ?
+   info->id->device->attrs.max_qp_rd_atom :
+   SMBD_CM_RESPONDER_RESOURCES;
+   info->responder_resources = conn_param.responder_resources;
+   log_rdma_mr(INFO, "responder_resources=%d\n",
+   info->responder_resources);
+
+   /* Need to send IRD/ORD in private data for iWARP */
+   info->id->device->get_port_immutable(
+   info->id->device, info->id->port_num, &port_immutable);
+   if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
+   ird_ord_hdr[0] = info->responder_resources;
+   ird_ord_hdr[1] = 1;
+   conn_param.private_data = ird_ord_hdr;
+   conn_param.private_data_len = sizeof(ird_ord_hdr);
+   } else {
+   conn_param.private_data = NULL;
+   conn_param.private_data_len = 0;
+   }
+
conn_param.retry_count = SMBD_CM_RETRY;
conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY;
conn_param.flow_control = 0;
@@ -1740,8 +1789,19 @@ struct smbd_connection *_smbd_get_connection(
goto negotiation_failed;
}
 
+   rc = allocate_mr_list(info);
+   if (rc) {
+   log_rdma_mr(ERR, "memory registration allocation failed\n");
+   goto allocate_mr_failed;
+   }
+
return info;
 
+allocate_mr_failed:
+   /* At this point, need to do a full transport shutdown */
+   smbd_destroy(info);
+   return NULL;
+
 negotiation_failed:
cancel_delayed_work_sync(>idle_timer_work);
destroy_caches_and_workqueue(info);
@@ 

[Patch v6 05/22] CIFS: SMBD: Establish SMB Direct connection

2017-11-04 Thread Long Li
From: Long Li 

Add code to implement the core functions to establish a SMB Direct
connection.

1. Establish an RDMA connection to SMB server.
2. Negotiate and setup SMB Direct protocol.
3. Implement idle connection timer and credit management.

SMB Direct is enabled by setting CONFIG_CIFS_SMB_DIRECT=y.

Add to Makefile to enable building SMB Direct.

Signed-off-by: Long Li 
---
 fs/cifs/Makefile|2 +
 fs/cifs/smbdirect.c | 1577 +++
 fs/cifs/smbdirect.h |  265 +
 3 files changed, 1844 insertions(+)

diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 5e853a3..ad00873 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -18,3 +18,5 @@ cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
 cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
 
 cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
+
+cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index d3c16f8..7af49cd 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -13,7 +13,34 @@
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  */
+#include 
 #include "smbdirect.h"
+#include "cifs_debug.h"
+
+static struct smbd_response *get_empty_queue_buffer(
+   struct smbd_connection *info);
+static struct smbd_response *get_receive_buffer(
+   struct smbd_connection *info);
+static void put_receive_buffer(
+   struct smbd_connection *info,
+   struct smbd_response *response,
+   bool lock);
+static int allocate_receive_buffers(struct smbd_connection *info, int num_buf);
+static void destroy_receive_buffers(struct smbd_connection *info);
+
+static void put_empty_packet(
+   struct smbd_connection *info, struct smbd_response *response);
+static void enqueue_reassembly(
+   struct smbd_connection *info,
+   struct smbd_response *response, int data_length);
+static struct smbd_response *_get_first_reassembly(
+   struct smbd_connection *info);
+
+static int smbd_post_recv(
+   struct smbd_connection *info,
+   struct smbd_response *response);
+
+static int smbd_post_send_empty(struct smbd_connection *info);
 
 /* SMBD version number */
 #define SMBD_V10x0100
@@ -75,3 +102,1553 @@ int smbd_max_frmr_depth = 2048;
 
 /* If payload is less than this byte, use RDMA send/recv not read/write */
 int rdma_readwrite_threshold = 4096;
+
+/* Transport logging functions
+ * Logging is divided into classes. The class bits can be OR'ed together to
+ * select what is logged via the module parameter smbd_logging_class,
+ * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
+ * log_rdma_event() messages
+ */
+#define LOG_OUTGOING   0x1
+#define LOG_INCOMING   0x2
+#define LOG_READ   0x4
+#define LOG_WRITE  0x8
+#define LOG_RDMA_SEND  0x10
+#define LOG_RDMA_RECV  0x20
+#define LOG_KEEP_ALIVE 0x40
+#define LOG_RDMA_EVENT 0x80
+#define LOG_RDMA_MR0x100
+static unsigned int smbd_logging_class;
+module_param(smbd_logging_class, uint, 0644);
+MODULE_PARM_DESC(smbd_logging_class,
+   "Logging class for SMBD transport 0x0 to 0x100");
+
+#define ERR0x0
+#define INFO   0x1
+static unsigned int smbd_logging_level;
+module_param(smbd_logging_level, uint, 0644);
+MODULE_PARM_DESC(smbd_logging_level,
+   "Logging level for SMBD transport, 0 (default): error, 1: info");
+
+#define log_rdma(level, class, fmt, args...)   \
+do {   \
+   if (level <= smbd_logging_level || class & smbd_logging_class)  \
+   cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
+} while (0)
+
+#define log_outgoing(level, fmt, args...) \
+   log_rdma(level, LOG_OUTGOING, fmt, ##args)
+#define log_incoming(level, fmt, args...) \
+   log_rdma(level, LOG_INCOMING, fmt, ##args)
+#define log_read(level, fmt, args...)  log_rdma(level, LOG_READ, fmt, ##args)
+#define log_write(level, fmt, args...) log_rdma(level, LOG_WRITE, fmt, ##args)
+#define log_rdma_send(level, fmt, args...) \
+   log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
+#define log_rdma_recv(level, fmt, args...) \
+   log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
+#define log_keep_alive(level, fmt, args...) \
+   log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
+#define log_rdma_event(level, fmt, args...) \
+   log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
+#define log_rdma_mr(level, fmt, args...) \
+   log_rdma(level, LOG_RDMA_MR, fmt, ##args)
+
+/*
+ * Destroy the transport and related RDMA and memory resources
+ * Need to go through all the pending counters and make sure on 

[Patch v6 20/22] CIFS: SMBD: Read correct returned data length for RDMA write (SMB read) I/O

2017-11-04 Thread Long Li
From: Long Li 

This patch is for preparing upper layer doing SMB read via RDMA write.

When RDMA write is used for SMB read, the returned data length is in
DataRemaining in the response packet. Read it properly by adding a
parameter to specify where the returned data length is.

Add the definition for memory registration to rdata and return the correct
length based on whether RDMA write is used.

Signed-off-by: Long Li 
---
 fs/cifs/cifsglob.h | 13 +++--
 fs/cifs/cifssmb.c  |  7 ++-
 fs/cifs/smb1ops.c  |  4 +++-
 fs/cifs/smb2ops.c  | 12 ++--
 4 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 2ae7d02..ddf83d8 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -228,8 +228,14 @@ struct smb_version_operations {
__u64 (*get_next_mid)(struct TCP_Server_Info *);
/* data offset from read response message */
unsigned int (*read_data_offset)(char *);
-   /* data length from read response message */
-   unsigned int (*read_data_length)(char *);
+   /*
+* Data length from read response message
+* When in_remaining is true, the returned data length is in
+* message field DataRemaining for out-of-band data read (e.g through
+* Memory Registration RDMA write in SMBD).
+* Otherwise, the returned data length is in message field DataLength.
+*/
+   unsigned int (*read_data_length)(char *, bool in_remaining);
/* map smb to linux error */
int (*map_error)(char *, bool);
/* find mid corresponding to the response message */
@@ -1148,6 +1154,9 @@ struct cifs_readdata {
struct cifs_readdata *rdata,
struct iov_iter *iter);
struct kvec iov[2];
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   struct smbd_mr  *mr;
+#endif
unsigned intpagesz;
unsigned inttailsz;
unsigned intcredits;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 08ff56a..a343db4 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1533,8 +1533,13 @@ cifs_readv_receive(struct TCP_Server_Info *server, 
struct mid_q_entry *mid)
 rdata->iov[0].iov_base, server->total_read);
 
/* how much data is in the response? */
-   data_len = server->ops->read_data_length(buf);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   data_len = server->ops->read_data_length(buf, rdata->mr);
+   if (!rdata->mr && (data_offset + data_len > buflen)) {
+#else
+   data_len = server->ops->read_data_length(buf, false);
if (data_offset + data_len > buflen) {
+#endif
/* data_len is corrupt -- discard frame */
rdata->result = -EIO;
return cifs_readv_discard(server, mid);
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index a723df3..3d495e4 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -87,9 +87,11 @@ cifs_read_data_offset(char *buf)
 }
 
 static unsigned int
-cifs_read_data_length(char *buf)
+cifs_read_data_length(char *buf, bool in_remaining)
 {
READ_RSP *rsp = (READ_RSP *)buf;
+   /* It's a bug reading remaining data for SMB1 packets */
+   WARN_ON(in_remaining);
return (le16_to_cpu(rsp->DataLengthHigh) << 16) +
   le16_to_cpu(rsp->DataLength);
 }
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 25028da..1bbd248 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -938,9 +938,13 @@ smb2_read_data_offset(char *buf)
 }
 
 static unsigned int
-smb2_read_data_length(char *buf)
+smb2_read_data_length(char *buf, bool in_remaining)
 {
struct smb2_read_rsp *rsp = (struct smb2_read_rsp *)buf;
+
+   if (in_remaining)
+   return le32_to_cpu(rsp->DataRemaining);
+
return le32_to_cpu(rsp->DataLength);
 }
 
@@ -2449,7 +2453,11 @@ handle_read_data(struct TCP_Server_Info *server, struct 
mid_q_entry *mid,
}
 
data_offset = server->ops->read_data_offset(buf) + 4;
-   data_len = server->ops->read_data_length(buf);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   data_len = server->ops->read_data_length(buf, rdata->mr);
+#else
+   data_len = server->ops->read_data_length(buf, false);
+#endif
 
if (data_offset < server->vals->read_rsp_size) {
/*
-- 
2.7.4



[Patch v6 06/22] CIFS: SMBD: export protocol initial values

2017-11-04 Thread Long Li
From: Long Li 

For user-configurable SMB Direct protocol values, export them to
/proc/fs/cifs.

Signed-off-by: Long Li 
---
 fs/cifs/cifs_debug.c | 79 
 1 file changed, 79 insertions(+)

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index ba0870d..7025d8d 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -30,6 +30,9 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifsfs.h"
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#include "smbdirect.h"
+#endif
 
 void
 cifs_dump_mem(char *label, void *data, int length)
@@ -371,6 +374,54 @@ static const struct file_operations cifs_stats_proc_fops = 
{
 };
 #endif /* STATS */
 
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#define PROC_FILE_DEFINE(name) \
+static ssize_t name##_write(struct file *file, const char __user *buffer, \
+   size_t count, loff_t *ppos) \
+{ \
+   int rc; \
+   rc = kstrtoint_from_user(buffer, count, 10, & name ); \
+   if (rc) \
+   return rc; \
+   return count; \
+} \
+static int name##_proc_show(struct seq_file *m, void *v) \
+{ \
+   seq_printf(m, "%d\n", name ); \
+   return 0; \
+} \
+static int name##_open(struct inode *inode, struct file *file) \
+{ \
+   return single_open(file, name##_proc_show, NULL); \
+} \
+\
+static const struct file_operations cifs_##name##_proc_fops = { \
+   .open   = name##_open, \
+   .read   = seq_read, \
+   .llseek = seq_lseek, \
+   .release= single_release, \
+   .write  = name##_write, \
+}
+
+extern int rdma_readwrite_threshold;
+extern int smbd_max_frmr_depth;
+extern int smbd_keep_alive_interval;
+extern int smbd_max_receive_size;
+extern int smbd_max_fragmented_recv_size;
+extern int smbd_max_send_size;
+extern int smbd_send_credit_target;
+extern int smbd_receive_credit_max;
+
+PROC_FILE_DEFINE(rdma_readwrite_threshold);
+PROC_FILE_DEFINE(smbd_max_frmr_depth);
+PROC_FILE_DEFINE(smbd_keep_alive_interval);
+PROC_FILE_DEFINE(smbd_max_receive_size);
+PROC_FILE_DEFINE(smbd_max_fragmented_recv_size);
+PROC_FILE_DEFINE(smbd_max_send_size);
+PROC_FILE_DEFINE(smbd_send_credit_target);
+PROC_FILE_DEFINE(smbd_receive_credit_max);
+#endif
+
 static struct proc_dir_entry *proc_fs_cifs;
 static const struct file_operations cifsFYI_proc_fops;
 static const struct file_operations cifs_lookup_cache_proc_fops;
@@ -398,6 +449,24 @@ cifs_proc_init(void)
_security_flags_proc_fops);
proc_create("LookupCacheEnabled", 0, proc_fs_cifs,
_lookup_cache_proc_fops);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   proc_create("rdma_readwrite_threshold", 0, proc_fs_cifs,
+   _rdma_readwrite_threshold_proc_fops);
+   proc_create("smbd_max_frmr_depth", 0, proc_fs_cifs,
+   _smbd_max_frmr_depth_proc_fops);
+   proc_create("smbd_keep_alive_interval", 0, proc_fs_cifs,
+   _smbd_keep_alive_interval_proc_fops);
+   proc_create("smbd_max_receive_size", 0, proc_fs_cifs,
+   _smbd_max_receive_size_proc_fops);
+   proc_create("smbd_max_fragmented_recv_size", 0, proc_fs_cifs,
+   _smbd_max_fragmented_recv_size_proc_fops);
+   proc_create("smbd_max_send_size", 0, proc_fs_cifs,
+   _smbd_max_send_size_proc_fops);
+   proc_create("smbd_send_credit_target", 0, proc_fs_cifs,
+   _smbd_send_credit_target_proc_fops);
+   proc_create("smbd_receive_credit_max", 0, proc_fs_cifs,
+   _smbd_receive_credit_max_proc_fops);
+#endif
 }
 
 void
@@ -415,6 +484,16 @@ cifs_proc_clean(void)
remove_proc_entry("SecurityFlags", proc_fs_cifs);
remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs);
remove_proc_entry("LookupCacheEnabled", proc_fs_cifs);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   remove_proc_entry("rdma_readwrite_threshold", proc_fs_cifs);
+   remove_proc_entry("smbd_max_frmr_depth", proc_fs_cifs);
+   remove_proc_entry("smbd_keep_alive_interval", proc_fs_cifs);
+   remove_proc_entry("smbd_max_receive_size", proc_fs_cifs);
+   remove_proc_entry("smbd_max_fragmented_recv_size", proc_fs_cifs);
+   remove_proc_entry("smbd_max_send_size", proc_fs_cifs);
+   remove_proc_entry("smbd_send_credit_target", proc_fs_cifs);
+   remove_proc_entry("smbd_receive_credit_max", proc_fs_cifs);
+#endif
remove_proc_entry("fs/cifs", NULL);
 }
 
-- 
2.7.4



[Patch v6 20/22] CIFS: SMBD: Read correct returned data length for RDMA write (SMB read) I/O

2017-11-04 Thread Long Li
From: Long Li 

This patch is for preparing upper layer doing SMB read via RDMA write.

When RDMA write is used for SMB read, the returned data length is in
DataRemaining in the response packet. Read it properly by adding a
parameter to specify where the returned data length is.

Add the definition for memory registration to rdata and return the correct
length based on if RDMA write is used.

Signed-off-by: Long Li 
---
 fs/cifs/cifsglob.h | 13 +++--
 fs/cifs/cifssmb.c  |  7 ++-
 fs/cifs/smb1ops.c  |  4 +++-
 fs/cifs/smb2ops.c  | 12 ++--
 4 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 2ae7d02..ddf83d8 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -228,8 +228,14 @@ struct smb_version_operations {
__u64 (*get_next_mid)(struct TCP_Server_Info *);
/* data offset from read response message */
unsigned int (*read_data_offset)(char *);
-   /* data length from read response message */
-   unsigned int (*read_data_length)(char *);
+   /*
+* Data length from read response message
+* When in_remaining is true, the returned data length is in
+* message field DataRemaining for out-of-band data read (e.g through
+* Memory Registration RDMA write in SMBD).
+* Otherwise, the returned data length is in message field DataLength.
+*/
+   unsigned int (*read_data_length)(char *, bool in_remaining);
/* map smb to linux error */
int (*map_error)(char *, bool);
/* find mid corresponding to the response message */
@@ -1148,6 +1154,9 @@ struct cifs_readdata {
struct cifs_readdata *rdata,
struct iov_iter *iter);
struct kvec iov[2];
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   struct smbd_mr  *mr;
+#endif
unsigned intpagesz;
unsigned inttailsz;
unsigned intcredits;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 08ff56a..a343db4 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1533,8 +1533,13 @@ cifs_readv_receive(struct TCP_Server_Info *server, 
struct mid_q_entry *mid)
 rdata->iov[0].iov_base, server->total_read);
 
/* how much data is in the response? */
-   data_len = server->ops->read_data_length(buf);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   data_len = server->ops->read_data_length(buf, rdata->mr);
+   if (!rdata->mr && (data_offset + data_len > buflen)) {
+#else
+   data_len = server->ops->read_data_length(buf, false);
if (data_offset + data_len > buflen) {
+#endif
/* data_len is corrupt -- discard frame */
rdata->result = -EIO;
return cifs_readv_discard(server, mid);
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index a723df3..3d495e4 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -87,9 +87,11 @@ cifs_read_data_offset(char *buf)
 }
 
 static unsigned int
-cifs_read_data_length(char *buf)
+cifs_read_data_length(char *buf, bool in_remaining)
 {
READ_RSP *rsp = (READ_RSP *)buf;
+   /* It's a bug reading remaining data for SMB1 packets */
+   WARN_ON(in_remaining);
return (le16_to_cpu(rsp->DataLengthHigh) << 16) +
   le16_to_cpu(rsp->DataLength);
 }
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 25028da..1bbd248 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -938,9 +938,13 @@ smb2_read_data_offset(char *buf)
 }
 
 static unsigned int
-smb2_read_data_length(char *buf)
+smb2_read_data_length(char *buf, bool in_remaining)
 {
struct smb2_read_rsp *rsp = (struct smb2_read_rsp *)buf;
+
+   if (in_remaining)
+   return le32_to_cpu(rsp->DataRemaining);
+
return le32_to_cpu(rsp->DataLength);
 }
 
@@ -2449,7 +2453,11 @@ handle_read_data(struct TCP_Server_Info *server, struct 
mid_q_entry *mid,
}
 
data_offset = server->ops->read_data_offset(buf) + 4;
-   data_len = server->ops->read_data_length(buf);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   data_len = server->ops->read_data_length(buf, rdata->mr);
+#else
+   data_len = server->ops->read_data_length(buf, false);
+#endif
 
if (data_offset < server->vals->read_rsp_size) {
/*
-- 
2.7.4



[Patch v6 06/22] CIFS: SMBD: export protocol initial values

2017-11-04 Thread Long Li
From: Long Li 

For user-configurable SMB Direct protocol values, export them to
/proc/fs/cifs.

Signed-off-by: Long Li 
---
 fs/cifs/cifs_debug.c | 79 
 1 file changed, 79 insertions(+)

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index ba0870d..7025d8d 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -30,6 +30,9 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifsfs.h"
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#include "smbdirect.h"
+#endif
 
 void
 cifs_dump_mem(char *label, void *data, int length)
@@ -371,6 +374,54 @@ static const struct file_operations cifs_stats_proc_fops = 
{
 };
 #endif /* STATS */
 
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#define PROC_FILE_DEFINE(name) \
+static ssize_t name##_write(struct file *file, const char __user *buffer, \
+   size_t count, loff_t *ppos) \
+{ \
+   int rc; \
+   rc = kstrtoint_from_user(buffer, count, 10, & name ); \
+   if (rc) \
+   return rc; \
+   return count; \
+} \
+static int name##_proc_show(struct seq_file *m, void *v) \
+{ \
+   seq_printf(m, "%d\n", name ); \
+   return 0; \
+} \
+static int name##_open(struct inode *inode, struct file *file) \
+{ \
+   return single_open(file, name##_proc_show, NULL); \
+} \
+\
+static const struct file_operations cifs_##name##_proc_fops = { \
+   .open   = name##_open, \
+   .read   = seq_read, \
+   .llseek = seq_lseek, \
+   .release= single_release, \
+   .write  = name##_write, \
+}
+
+extern int rdma_readwrite_threshold;
+extern int smbd_max_frmr_depth;
+extern int smbd_keep_alive_interval;
+extern int smbd_max_receive_size;
+extern int smbd_max_fragmented_recv_size;
+extern int smbd_max_send_size;
+extern int smbd_send_credit_target;
+extern int smbd_receive_credit_max;
+
+PROC_FILE_DEFINE(rdma_readwrite_threshold);
+PROC_FILE_DEFINE(smbd_max_frmr_depth);
+PROC_FILE_DEFINE(smbd_keep_alive_interval);
+PROC_FILE_DEFINE(smbd_max_receive_size);
+PROC_FILE_DEFINE(smbd_max_fragmented_recv_size);
+PROC_FILE_DEFINE(smbd_max_send_size);
+PROC_FILE_DEFINE(smbd_send_credit_target);
+PROC_FILE_DEFINE(smbd_receive_credit_max);
+#endif
+
 static struct proc_dir_entry *proc_fs_cifs;
 static const struct file_operations cifsFYI_proc_fops;
 static const struct file_operations cifs_lookup_cache_proc_fops;
@@ -398,6 +449,24 @@ cifs_proc_init(void)
_security_flags_proc_fops);
proc_create("LookupCacheEnabled", 0, proc_fs_cifs,
_lookup_cache_proc_fops);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   proc_create("rdma_readwrite_threshold", 0, proc_fs_cifs,
+   _rdma_readwrite_threshold_proc_fops);
+   proc_create("smbd_max_frmr_depth", 0, proc_fs_cifs,
+   _smbd_max_frmr_depth_proc_fops);
+   proc_create("smbd_keep_alive_interval", 0, proc_fs_cifs,
+   _smbd_keep_alive_interval_proc_fops);
+   proc_create("smbd_max_receive_size", 0, proc_fs_cifs,
+   _smbd_max_receive_size_proc_fops);
+   proc_create("smbd_max_fragmented_recv_size", 0, proc_fs_cifs,
+   _smbd_max_fragmented_recv_size_proc_fops);
+   proc_create("smbd_max_send_size", 0, proc_fs_cifs,
+   _smbd_max_send_size_proc_fops);
+   proc_create("smbd_send_credit_target", 0, proc_fs_cifs,
+   _smbd_send_credit_target_proc_fops);
+   proc_create("smbd_receive_credit_max", 0, proc_fs_cifs,
+   _smbd_receive_credit_max_proc_fops);
+#endif
 }
 
 void
@@ -415,6 +484,16 @@ cifs_proc_clean(void)
remove_proc_entry("SecurityFlags", proc_fs_cifs);
remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs);
remove_proc_entry("LookupCacheEnabled", proc_fs_cifs);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   remove_proc_entry("rdma_readwrite_threshold", proc_fs_cifs);
+   remove_proc_entry("smbd_max_frmr_depth", proc_fs_cifs);
+   remove_proc_entry("smbd_keep_alive_interval", proc_fs_cifs);
+   remove_proc_entry("smbd_max_receive_size", proc_fs_cifs);
+   remove_proc_entry("smbd_max_fragmented_recv_size", proc_fs_cifs);
+   remove_proc_entry("smbd_max_send_size", proc_fs_cifs);
+   remove_proc_entry("smbd_send_credit_target", proc_fs_cifs);
+   remove_proc_entry("smbd_receive_credit_max", proc_fs_cifs);
+#endif
remove_proc_entry("fs/cifs", NULL);
 }
 
-- 
2.7.4



[Patch v6 02/22] CIFS: SMBD: Introduce kernel config option CONFIG_CIFS_SMB_DIRECT

2017-11-04 Thread Long Li
From: Long Li 

SMB Direct is a protocol for transferring SMB packets over RDMA. It was
introduced with Windows Server 2012 and SMB 3.0.

With CONFIG_CIFS_SMB_DIRECT=y, SMB Direct code is built as part of CIFS.

Signed-off-by: Long Li 
---
 fs/cifs/Kconfig | 8 
 1 file changed, 8 insertions(+)

diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index f724361..8d05fff 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -191,6 +191,14 @@ config CIFS_SMB311
  This dialect includes improved security negotiation features.
  If unsure, say N
 
+config CIFS_SMB_DIRECT
+   bool "SMB Direct support (Experimental)"
+   depends on CIFS && INFINIBAND
+   help
+ Enables SMB Direct experimental support for SMB 3.0, 3.02 and 3.1.1.
+ SMB Direct allows transferring SMB packets over RDMA. If unsure,
+ say N.
+
 config CIFS_FSCACHE
  bool "Provide CIFS client caching support"
  depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
-- 
2.7.4



[Patch v6 19/22] CIFS: SMBD: Upper layer performs SMB write via RDMA read through memory registration

2017-11-04 Thread Long Li
From: Long Li 

When sending I/O, if size is larger than rdma_readwrite_threshold we
prepare to send SMB write packet for a RDMA read via memory registration.
The actual I/O is done by remote peer through local RDMA hardware. Modify
the relevant fields in the packet accordingly, and append a
smbd_buffer_descriptor_v1 to the end of the SMB write packet.

On write I/O finish, deregister the memory region if this was for a RDMA
read. If remote invalidation is not used, the call to smbd_deregister_mr
will do local invalidation and possibly wait. Memory region is normally
deregistered in MID callback as soon as it's used. There are situations
where the MID may not be created on I/O failure, under which memory region
is deregistered when write data context is released.

Signed-off-by: Long Li 
---
 fs/cifs/cifsglob.h |  3 +++
 fs/cifs/cifssmb.c  |  9 
 fs/cifs/smb2pdu.c  | 68 ++
 3 files changed, 76 insertions(+), 4 deletions(-)

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 09f9a71..2ae7d02 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1170,6 +1170,9 @@ struct cifs_writedata {
pid_t   pid;
unsigned intbytes;
int result;
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   struct smbd_mr  *mr;
+#endif
unsigned intpagesz;
unsigned inttailsz;
unsigned intcredits;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 5857009..08ff56a 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -43,6 +43,9 @@
 #include "cifs_unicode.h"
 #include "cifs_debug.h"
 #include "fscache.h"
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#include "smbdirect.h"
+#endif
 
 #ifdef CONFIG_CIFS_POSIX
 static struct {
@@ -1911,6 +1914,12 @@ cifs_writedata_release(struct kref *refcount)
 {
struct cifs_writedata *wdata = container_of(refcount,
struct cifs_writedata, refcount);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (wdata->mr) {
+   smbd_deregister_mr(wdata->mr);
+   wdata->mr = NULL;
+   }
+#endif
 
if (wdata->cfile)
cifsFileInfo_put(wdata->cfile);
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 32ad590..8ef4a2f 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -48,6 +48,9 @@
 #include "smb2glob.h"
 #include "cifspdu.h"
 #include "cifs_spnego.h"
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#include "smbdirect.h"
+#endif
 
 /*
  *  The following table defines the expected "StructureSize" of SMB2 requests
@@ -2656,7 +2659,19 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->result = -EIO;
break;
}
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   /*
+* If this wdata has a memory registered, the MR can be freed
+* The number of MRs available is limited, it's important to recover
+* used MR as soon as I/O is finished. Hold MR longer in the later
+* I/O process can possibly result in I/O deadlock due to lack of MR
+* to send request on I/O retry
+*/
+   if (wdata->mr) {
+   smbd_deregister_mr(wdata->mr);
+   wdata->mr = NULL;
+   }
+#endif
if (wdata->result)
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
 
@@ -2707,7 +2722,42 @@ smb2_async_writev(struct cifs_writedata *wdata,
req->DataOffset = cpu_to_le16(
offsetof(struct smb2_write_req, Buffer) - 4);
req->RemainingBytes = 0;
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   /*
+* If we want to do a server RDMA read, fill in and append
+* smbd_buffer_descriptor_v1 to the end of write request
+*/
+   if (server->rdma && wdata->bytes >=
+   server->smbd_conn->rdma_readwrite_threshold) {
+
+   struct smbd_buffer_descriptor_v1 *v1;
+   bool need_invalidate = server->dialect == SMB30_PROT_ID;
+
+   wdata->mr = smbd_register_mr(
+   server->smbd_conn, wdata->pages,
+   wdata->nr_pages, wdata->tailsz,
+   false, need_invalidate);
+   if (!wdata->mr) {
+   rc = -ENOBUFS;
+   goto async_writev_out;
+   }
+   req->Length = 0;
+   req->DataOffset = 0;
+   req->RemainingBytes =
+   (wdata->nr_pages-1)*PAGE_SIZE + wdata->tailsz;
+   req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE;
+   if (need_invalidate)
+   req->Channel = SMB2_CHANNEL_RDMA_V1;
+   req->WriteChannelInfoOffset =
+   offsetof(struct smb2_write_req, Buffer) - 4;
+   req->WriteChannelInfoLength =
+ 

[Patch v6 02/22] CIFS: SMBD: Introduce kernel config option CONFIG_CIFS_SMB_DIRECT

2017-11-04 Thread Long Li
From: Long Li 

SMB Direct is a protocol for transferring SMB packets over RDMA. It was
introduced with Windows Server 2012 and SMB 3.0.

With CONFIG_CIFS_SMB_DIRECT=y, SMB Direct code is built as part of CIFS.

Signed-off-by: Long Li 
---
 fs/cifs/Kconfig | 8 
 1 file changed, 8 insertions(+)

diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index f724361..8d05fff 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -191,6 +191,14 @@ config CIFS_SMB311
  This dialect includes improved security negotiation features.
  If unsure, say N
 
+config CIFS_SMB_DIRECT
+   bool "SMB Direct support (Experimental)"
+   depends on CIFS && INFINIBAND
+   help
+ Enables SMB Direct experimental support for SMB 3.0, 3.02 and 3.1.1.
+ SMB Direct allows transferring SMB packets over RDMA. If unsure,
+ say N.
+
 config CIFS_FSCACHE
  bool "Provide CIFS client caching support"
  depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
-- 
2.7.4



[Patch v6 19/22] CIFS: SMBD: Upper layer performs SMB write via RDMA read through memory registration

2017-11-04 Thread Long Li
From: Long Li 

When sending I/O, if size is larger than rdma_readwrite_threshold we
prepare to send SMB write packet for a RDMA read via memory registration.
The actual I/O is done by remote peer through local RDMA hardware. Modify
the relevant fields in the packet accordingly, and append a
smbd_buffer_descriptor_v1 to the end of the SMB write packet.

On write I/O finish, deregister the memory region if this was for a RDMA
read. If remote invalidation is not used, the call to smbd_deregister_mr
will do local invalidation and possibly wait. Memory region is normally
deregistered in MID callback as soon as it's used. There are situations
where the MID may not be created on I/O failure, under which memory region
is deregistered when write data context is released.

Signed-off-by: Long Li 
---
 fs/cifs/cifsglob.h |  3 +++
 fs/cifs/cifssmb.c  |  9 
 fs/cifs/smb2pdu.c  | 68 ++
 3 files changed, 76 insertions(+), 4 deletions(-)

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 09f9a71..2ae7d02 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1170,6 +1170,9 @@ struct cifs_writedata {
pid_t   pid;
unsigned intbytes;
int result;
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   struct smbd_mr  *mr;
+#endif
unsigned intpagesz;
unsigned inttailsz;
unsigned intcredits;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 5857009..08ff56a 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -43,6 +43,9 @@
 #include "cifs_unicode.h"
 #include "cifs_debug.h"
 #include "fscache.h"
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#include "smbdirect.h"
+#endif
 
 #ifdef CONFIG_CIFS_POSIX
 static struct {
@@ -1911,6 +1914,12 @@ cifs_writedata_release(struct kref *refcount)
 {
struct cifs_writedata *wdata = container_of(refcount,
struct cifs_writedata, refcount);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (wdata->mr) {
+   smbd_deregister_mr(wdata->mr);
+   wdata->mr = NULL;
+   }
+#endif
 
if (wdata->cfile)
cifsFileInfo_put(wdata->cfile);
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 32ad590..8ef4a2f 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -48,6 +48,9 @@
 #include "smb2glob.h"
 #include "cifspdu.h"
 #include "cifs_spnego.h"
+#ifdef CONFIG_CIFS_SMB_DIRECT
+#include "smbdirect.h"
+#endif
 
 /*
  *  The following table defines the expected "StructureSize" of SMB2 requests
@@ -2656,7 +2659,19 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->result = -EIO;
break;
}
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   /*
+* If this wdata has a memory registered, the MR can be freed
+* The number of MRs available is limited, it's important to recover
+* used MR as soon as I/O is finished. Hold MR longer in the later
+* I/O process can possibly result in I/O deadlock due to lack of MR
+* to send request on I/O retry
+*/
+   if (wdata->mr) {
+   smbd_deregister_mr(wdata->mr);
+   wdata->mr = NULL;
+   }
+#endif
if (wdata->result)
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
 
@@ -2707,7 +2722,42 @@ smb2_async_writev(struct cifs_writedata *wdata,
req->DataOffset = cpu_to_le16(
offsetof(struct smb2_write_req, Buffer) - 4);
req->RemainingBytes = 0;
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   /*
+* If we want to do a server RDMA read, fill in and append
+* smbd_buffer_descriptor_v1 to the end of write request
+*/
+   if (server->rdma && wdata->bytes >=
+   server->smbd_conn->rdma_readwrite_threshold) {
+
+   struct smbd_buffer_descriptor_v1 *v1;
+   bool need_invalidate = server->dialect == SMB30_PROT_ID;
+
+   wdata->mr = smbd_register_mr(
+   server->smbd_conn, wdata->pages,
+   wdata->nr_pages, wdata->tailsz,
+   false, need_invalidate);
+   if (!wdata->mr) {
+   rc = -ENOBUFS;
+   goto async_writev_out;
+   }
+   req->Length = 0;
+   req->DataOffset = 0;
+   req->RemainingBytes =
+   (wdata->nr_pages-1)*PAGE_SIZE + wdata->tailsz;
+   req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE;
+   if (need_invalidate)
+   req->Channel = SMB2_CHANNEL_RDMA_V1;
+   req->WriteChannelInfoOffset =
+   offsetof(struct smb2_write_req, Buffer) - 4;
+   req->WriteChannelInfoLength =
+   sizeof(struct 

[Patch v6 10/22] CIFS: SMBD: Upper layer reconnects to SMB Direct session

2017-11-04 Thread Long Li
From: Long Li 

Do a reconnect on SMB Direct when it is used as the connection. Reconnect
can happen for many reasons and it's mostly the decision of SMB2 upper
layer.

Signed-off-by: Long Li 
---
 fs/cifs/connect.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2c0b34a..8ca3c13 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -406,7 +406,14 @@ cifs_reconnect(struct TCP_Server_Info *server)
 
/* we should try only the port we connected to before */
mutex_lock(>srv_mutex);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (server->rdma)
+   rc = smbd_reconnect(server);
+   else
+   rc = generic_ip_connect(server);
+#else
rc = generic_ip_connect(server);
+#endif
if (rc) {
cifs_dbg(FYI, "reconnect error %d\n", rc);
mutex_unlock(>srv_mutex);
-- 
2.7.4



[Patch v6 14/22] CIFS: SMBD: Implement function to receive data via RDMA receive

2017-11-04 Thread Long Li
From: Long Li 

On the receive path, the transport maintains receive buffers and a
reassembly queue for transferring payload via RDMA recv. There is data
copy in the transport on recv when it copies the payload to upper layer.

The transport recognizes the RFC1002 header length used in the SMB upper
layer payloads in CIFS. Because this length is mainly used for TCP and
not applicable to RDMA, it is handled as out-of-band information and is
never sent over the wire, and the transport behaves like TCP to upper layer
by processing and exposing the length correctly on data payloads.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 228 
 fs/cifs/smbdirect.h |   3 +
 2 files changed, 231 insertions(+)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 5952276..1e7f5df 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -14,6 +14,7 @@
  *   the GNU General Public License for more details.
  */
 #include 
+#include 
 #include "smbdirect.h"
 #include "cifs_debug.h"
 
@@ -179,6 +180,8 @@ static void smbd_destroy_rdma_work(struct work_struct *work)
 
log_rdma_event(INFO, "wait for all recv to finish\n");
wake_up_interruptible(>wait_reassembly_queue);
+   wait_event(info->wait_smbd_recv_pending,
+   info->smbd_recv_pending == 0);
 
log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
wait_event(info->wait_send_pending,
@@ -1655,6 +1658,9 @@ struct smbd_connection *_smbd_get_connection(
queue_delayed_work(info->workqueue, >idle_timer_work,
info->keep_alive_interval*HZ);
 
+   init_waitqueue_head(>wait_smbd_recv_pending);
+   info->smbd_recv_pending = 0;
+
init_waitqueue_head(>wait_send_pending);
atomic_set(>send_pending, 0);
 
@@ -1721,3 +1727,225 @@ struct smbd_connection *smbd_get_connection(
}
return ret;
 }
+
+/*
+ * Receive data from receive reassembly queue
+ * All the incoming data packets are placed in reassembly queue
+ * buf: the buffer to read data into
+ * size: the length of data to read
+ * return value: actual data read
+ * Note: this implementation copies the data from reassebmly queue to receive
+ * buffers used by upper layer. This is not the optimal code path. A better way
+ * to do it is to not have upper layer allocate its receive buffers but rather
+ * borrow the buffer from reassembly queue, and return it after data is
+ * consumed. But this will require more changes to upper layer code, and also
+ * need to consider packet boundaries while they still being reassembled.
+ */
+int smbd_recv_buf(struct smbd_connection *info, char *buf, unsigned int size)
+{
+   struct smbd_response *response;
+   struct smbd_data_transfer *data_transfer;
+   int to_copy, to_read, data_read, offset;
+   u32 data_length, remaining_data_length, data_offset;
+   int rc;
+   unsigned long flags;
+
+again:
+   if (info->transport_status != SMBD_CONNECTED) {
+   log_read(ERR, "disconnected\n");
+   return -ENODEV;
+   }
+
+   /*
+* No need to hold the reassembly queue lock all the time as we are
+* the only one reading from the front of the queue. The transport
+* may add more entries to the back of the queeu at the same time
+*/
+   log_read(INFO, "size=%d info->reassembly_data_length=%d\n", size,
+   info->reassembly_data_length);
+   if (info->reassembly_data_length >= size) {
+   int queue_length;
+   int queue_removed = 0;
+
+   /*
+* Need to make sure reassembly_data_length is read before
+* reading reassembly_queue_length and calling
+* _get_first_reassembly. This call is lock free
+* as we never read at the end of the queue which are being
+* updated in SOFTIRQ as more data is received
+*/
+   virt_rmb();
+   queue_length = info->reassembly_queue_length;
+   data_read = 0;
+   to_read = size;
+   offset = info->first_entry_offset;
+   while (data_read < size) {
+   response = _get_first_reassembly(info);
+   data_transfer = smbd_response_payload(response);
+   data_length = le32_to_cpu(data_transfer->data_length);
+   remaining_data_length =
+   le32_to_cpu(
+   data_transfer->remaining_data_length);
+   data_offset = le32_to_cpu(data_transfer->data_offset);
+
+   /*
+* The upper layer expects RFC1002 length at the
+* beginning of the payload. Return it to indicate
+* the total length of the packet. This 

[Patch v6 10/22] CIFS: SMBD: Upper layer reconnects to SMB Direct session

2017-11-04 Thread Long Li
From: Long Li 

Do a reconnect on SMB Direct when it is used as the connection. Reconnect
can happen for many reasons and it's mostly the decision of SMB2 upper
layer.

Signed-off-by: Long Li 
---
 fs/cifs/connect.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2c0b34a..8ca3c13 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -406,7 +406,14 @@ cifs_reconnect(struct TCP_Server_Info *server)
 
/* we should try only the port we connected to before */
mutex_lock(>srv_mutex);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+   if (server->rdma)
+   rc = smbd_reconnect(server);
+   else
+   rc = generic_ip_connect(server);
+#else
rc = generic_ip_connect(server);
+#endif
if (rc) {
cifs_dbg(FYI, "reconnect error %d\n", rc);
mutex_unlock(>srv_mutex);
-- 
2.7.4



[Patch v6 14/22] CIFS: SMBD: Implement function to receive data via RDMA receive

2017-11-04 Thread Long Li
From: Long Li 

On the receive path, the transport maintains receive buffers and a
reassembly queue for transferring payload via RDMA recv. There is data
copy in the transport on recv when it copies the payload to upper layer.

The transport recognizes the RFC1002 header length used in the SMB upper
layer payloads in CIFS. Because this length is mainly used for TCP and
not applicable to RDMA, it is handled as out-of-band information and is
never sent over the wire, and the transport behaves like TCP to upper layer
by processing and exposing the length correctly on data payloads.

Signed-off-by: Long Li 
---
 fs/cifs/smbdirect.c | 228 
 fs/cifs/smbdirect.h |   3 +
 2 files changed, 231 insertions(+)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 5952276..1e7f5df 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -14,6 +14,7 @@
  *   the GNU General Public License for more details.
  */
 #include 
+#include 
 #include "smbdirect.h"
 #include "cifs_debug.h"
 
@@ -179,6 +180,8 @@ static void smbd_destroy_rdma_work(struct work_struct *work)
 
log_rdma_event(INFO, "wait for all recv to finish\n");
wake_up_interruptible(>wait_reassembly_queue);
+   wait_event(info->wait_smbd_recv_pending,
+   info->smbd_recv_pending == 0);
 
log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
wait_event(info->wait_send_pending,
@@ -1655,6 +1658,9 @@ struct smbd_connection *_smbd_get_connection(
queue_delayed_work(info->workqueue, >idle_timer_work,
info->keep_alive_interval*HZ);
 
+   init_waitqueue_head(>wait_smbd_recv_pending);
+   info->smbd_recv_pending = 0;
+
init_waitqueue_head(>wait_send_pending);
atomic_set(>send_pending, 0);
 
@@ -1721,3 +1727,225 @@ struct smbd_connection *smbd_get_connection(
}
return ret;
 }
+
+/*
+ * Receive data from receive reassembly queue
+ * All the incoming data packets are placed in reassembly queue
+ * buf: the buffer to read data into
+ * size: the length of data to read
+ * return value: actual data read
+ * Note: this implementation copies the data from reassembly queue to receive
+ * buffers used by upper layer. This is not the optimal code path. A better way
+ * to do it is to not have upper layer allocate its receive buffers but rather
+ * borrow the buffer from reassembly queue, and return it after data is
+ * consumed. But this will require more changes to upper layer code, and also
+ * need to consider packet boundaries while they are still being reassembled.
+ */
+int smbd_recv_buf(struct smbd_connection *info, char *buf, unsigned int size)
+{
+   struct smbd_response *response;
+   struct smbd_data_transfer *data_transfer;
+   int to_copy, to_read, data_read, offset;
+   u32 data_length, remaining_data_length, data_offset;
+   int rc;
+   unsigned long flags;
+
+again:
+   if (info->transport_status != SMBD_CONNECTED) {
+   log_read(ERR, "disconnected\n");
+   return -ENODEV;
+   }
+
+   /*
+* No need to hold the reassembly queue lock all the time as we are
+* the only one reading from the front of the queue. The transport
+* may add more entries to the back of the queeu at the same time
+*/
+   log_read(INFO, "size=%d info->reassembly_data_length=%d\n", size,
+   info->reassembly_data_length);
+   if (info->reassembly_data_length >= size) {
+   int queue_length;
+   int queue_removed = 0;
+
+   /*
+* Need to make sure reassembly_data_length is read before
+* reading reassembly_queue_length and calling
+* _get_first_reassembly. This call is lock free
+* as we never read at the end of the queue which are being
+* updated in SOFTIRQ as more data is received
+*/
+   virt_rmb();
+   queue_length = info->reassembly_queue_length;
+   data_read = 0;
+   to_read = size;
+   offset = info->first_entry_offset;
+   while (data_read < size) {
+   response = _get_first_reassembly(info);
+   data_transfer = smbd_response_payload(response);
+   data_length = le32_to_cpu(data_transfer->data_length);
+   remaining_data_length =
+   le32_to_cpu(
+   data_transfer->remaining_data_length);
+   data_offset = le32_to_cpu(data_transfer->data_offset);
+
+   /*
+* The upper layer expects RFC1002 length at the
+* beginning of the payload. Return it to indicate
+* the total length of the packet. This minimize the
+* change 

[PATCH 0/2] kbuild: move dtb-y and CONFIG_OF_ALL_DTBS to Kbuild core to fix some issues

2017-11-04 Thread Masahiro Yamada

This series applies on top of my previous cleanup patch:
https://patchwork.kernel.org/patch/10038167/

This series must go to the same branch, so I am sending this to DT ML.



Masahiro Yamada (2):
  MIPS: dts: remove bogus bcm96358nb4ser.dtb from dtb-y entry
  kbuild: handle dtb-y and CONFIG_OF_ALL_DTBS natively in Makefile.lib

 arch/arc/boot/dts/Makefile   |  7 ++-
 arch/arm/boot/dts/Makefile   |  5 --
 arch/arm64/boot/dts/Makefile | 58 ++--
 arch/arm64/boot/dts/actions/Makefile |  3 --
 arch/arm64/boot/dts/al/Makefile  |  3 --
 arch/arm64/boot/dts/allwinner/Makefile   |  3 --
 arch/arm64/boot/dts/altera/Makefile  |  3 --
 arch/arm64/boot/dts/amd/Makefile |  3 --
 arch/arm64/boot/dts/amlogic/Makefile |  3 --
 arch/arm64/boot/dts/apm/Makefile |  3 --
 arch/arm64/boot/dts/arm/Makefile |  3 --
 arch/arm64/boot/dts/broadcom/Makefile|  6 +--
 arch/arm64/boot/dts/broadcom/northstar2/Makefile |  3 --
 arch/arm64/boot/dts/broadcom/stingray/Makefile   |  3 --
 arch/arm64/boot/dts/cavium/Makefile  |  3 --
 arch/arm64/boot/dts/exynos/Makefile  |  3 --
 arch/arm64/boot/dts/freescale/Makefile   |  3 --
 arch/arm64/boot/dts/hisilicon/Makefile   |  3 --
 arch/arm64/boot/dts/lg/Makefile  |  3 --
 arch/arm64/boot/dts/marvell/Makefile |  3 --
 arch/arm64/boot/dts/mediatek/Makefile|  3 --
 arch/arm64/boot/dts/nvidia/Makefile  |  2 -
 arch/arm64/boot/dts/qcom/Makefile|  3 --
 arch/arm64/boot/dts/realtek/Makefile |  3 --
 arch/arm64/boot/dts/renesas/Makefile |  2 -
 arch/arm64/boot/dts/rockchip/Makefile|  3 --
 arch/arm64/boot/dts/socionext/Makefile   |  2 -
 arch/arm64/boot/dts/sprd/Makefile|  3 --
 arch/arm64/boot/dts/xilinx/Makefile  |  3 --
 arch/arm64/boot/dts/zte/Makefile |  3 --
 arch/h8300/boot/dts/Makefile |  5 --
 arch/metag/boot/dts/Makefile |  5 --
 arch/mips/boot/dts/Makefile  | 32 ++---
 arch/mips/boot/dts/brcm/Makefile |  3 --
 arch/mips/boot/dts/cavium-octeon/Makefile|  2 -
 arch/mips/boot/dts/img/Makefile  |  2 -
 arch/mips/boot/dts/ingenic/Makefile  |  2 -
 arch/mips/boot/dts/lantiq/Makefile   |  2 -
 arch/mips/boot/dts/mti/Makefile  |  2 -
 arch/mips/boot/dts/netlogic/Makefile |  2 -
 arch/mips/boot/dts/ni/Makefile   |  2 -
 arch/mips/boot/dts/pic32/Makefile|  2 -
 arch/mips/boot/dts/qca/Makefile  |  2 -
 arch/mips/boot/dts/ralink/Makefile   |  2 -
 arch/mips/boot/dts/xilfpga/Makefile  |  2 -
 arch/xtensa/boot/dts/Makefile|  7 ++-
 scripts/Makefile.dtbinst |  6 +--
 scripts/Makefile.lib |  5 ++
 48 files changed, 53 insertions(+), 183 deletions(-)

-- 
2.7.4



[PATCH 2/2] kbuild: handle dtb-y and CONFIG_OF_ALL_DTBS natively in Makefile.lib

2017-11-04 Thread Masahiro Yamada
If CONFIG_OF_ALL_DTBS is enabled, "make ARCH=arm64 dtbs" compiles each
DTB twice; one from arch/arm64/boot/dts/*/Makefile and the other from
the dtb-$(CONFIG_OF_ALL_DTBS) line in arch/arm64/boot/dts/Makefile.
It could be a race problem when building DTBS in parallel.

Another minor issue is CONFIG_OF_ALL_DTBS covers only *.dts in vendor
sub-directories, so this broke when Broadcom added one more hierarchy
in arch/arm64/boot/dts/broadcom/<soc>/.

One idea to fix the issues in a clean way is to move DTB handling
to Kbuild core scripts.  Makefile.dtbinst already recognizes dtb-y
natively, so it should not hurt to do so.

Add $(dtb-y) to extra-y, and $(dtb-) as well if CONFIG_OF_ALL_DTBS is
enabled.  All the clutter in the Makefiles goes away.

As a bonus clean-up, I also removed dts-dirs.  Just use subdir-y
directly to traverse sub-directories.

Signed-off-by: Masahiro Yamada 
---

 arch/arc/boot/dts/Makefile   |  7 ++-
 arch/arm/boot/dts/Makefile   |  5 --
 arch/arm64/boot/dts/Makefile | 58 ++--
 arch/arm64/boot/dts/actions/Makefile |  3 --
 arch/arm64/boot/dts/al/Makefile  |  3 --
 arch/arm64/boot/dts/allwinner/Makefile   |  3 --
 arch/arm64/boot/dts/altera/Makefile  |  3 --
 arch/arm64/boot/dts/amd/Makefile |  3 --
 arch/arm64/boot/dts/amlogic/Makefile |  3 --
 arch/arm64/boot/dts/apm/Makefile |  3 --
 arch/arm64/boot/dts/arm/Makefile |  3 --
 arch/arm64/boot/dts/broadcom/Makefile|  6 +--
 arch/arm64/boot/dts/broadcom/northstar2/Makefile |  3 --
 arch/arm64/boot/dts/broadcom/stingray/Makefile   |  3 --
 arch/arm64/boot/dts/cavium/Makefile  |  3 --
 arch/arm64/boot/dts/exynos/Makefile  |  3 --
 arch/arm64/boot/dts/freescale/Makefile   |  3 --
 arch/arm64/boot/dts/hisilicon/Makefile   |  3 --
 arch/arm64/boot/dts/lg/Makefile  |  3 --
 arch/arm64/boot/dts/marvell/Makefile |  3 --
 arch/arm64/boot/dts/mediatek/Makefile|  3 --
 arch/arm64/boot/dts/nvidia/Makefile  |  2 -
 arch/arm64/boot/dts/qcom/Makefile|  3 --
 arch/arm64/boot/dts/realtek/Makefile |  3 --
 arch/arm64/boot/dts/renesas/Makefile |  2 -
 arch/arm64/boot/dts/rockchip/Makefile|  3 --
 arch/arm64/boot/dts/socionext/Makefile   |  2 -
 arch/arm64/boot/dts/sprd/Makefile|  3 --
 arch/arm64/boot/dts/xilinx/Makefile  |  3 --
 arch/arm64/boot/dts/zte/Makefile |  3 --
 arch/h8300/boot/dts/Makefile |  5 --
 arch/metag/boot/dts/Makefile |  5 --
 arch/mips/boot/dts/Makefile  | 32 ++---
 arch/mips/boot/dts/brcm/Makefile |  2 -
 arch/mips/boot/dts/cavium-octeon/Makefile|  2 -
 arch/mips/boot/dts/img/Makefile  |  2 -
 arch/mips/boot/dts/ingenic/Makefile  |  2 -
 arch/mips/boot/dts/lantiq/Makefile   |  2 -
 arch/mips/boot/dts/mti/Makefile  |  2 -
 arch/mips/boot/dts/netlogic/Makefile |  2 -
 arch/mips/boot/dts/ni/Makefile   |  2 -
 arch/mips/boot/dts/pic32/Makefile|  2 -
 arch/mips/boot/dts/qca/Makefile  |  2 -
 arch/mips/boot/dts/ralink/Makefile   |  2 -
 arch/mips/boot/dts/xilfpga/Makefile  |  2 -
 arch/xtensa/boot/dts/Makefile|  7 ++-
 scripts/Makefile.dtbinst |  6 +--
 scripts/Makefile.lib |  5 ++
 48 files changed, 53 insertions(+), 182 deletions(-)

diff --git a/arch/arc/boot/dts/Makefile b/arch/arc/boot/dts/Makefile
index 1257db1..9ece28b 100644
--- a/arch/arc/boot/dts/Makefile
+++ b/arch/arc/boot/dts/Makefile
@@ -10,7 +10,6 @@ dtb-y := $(builtindtb-y).dtb
 
 .SECONDARY: $(obj)/$(builtindtb-y).dtb.S
 
-dtstree:= $(srctree)/$(src)
-dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard 
$(dtstree)/*.dts))
-
-always := $(dtb-y)
+# for CONFIG_OF_ALL_DTBS test
+dtstree:= $(srctree)/$(src)
+dtb-   := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard $(dtstree)/*.dts))
diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
index 5eeefbc..4b650d6 100644
--- a/arch/arm/boot/dts/Makefile
+++ b/arch/arm/boot/dts/Makefile
@@ -1069,8 +1069,3 @@ dtb-$(CONFIG_ARCH_ASPEED) += aspeed-bmc-opp-palmetto.dtb \
aspeed-bmc-opp-romulus.dtb \
aspeed-ast2500-evb.dtb
 endif
-
-dtstree:= $(srctree)/$(src)
-dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard 
$(dtstree)/*.dts))
-
-always := $(dtb-y)
diff --git a/arch/arm64/boot/dts/Makefile b/arch/arm64/boot/dts/Makefile
index 8e19512..a7ecb42 100644
--- a/arch/arm64/boot/dts/Makefile
+++ 

[PATCH 1/2] MIPS: dts: remove bogus bcm96358nb4ser.dtb from dtb-y entry

2017-11-04 Thread Masahiro Yamada
arch/mips/boot/dts/brcm/bcm96358nb4ser.dts does not exist, so
we cannot build bcm96358nb4ser.dtb .

Signed-off-by: Masahiro Yamada 
---

 arch/mips/boot/dts/brcm/Makefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/mips/boot/dts/brcm/Makefile b/arch/mips/boot/dts/brcm/Makefile
index 69a69d1..ad76130 100644
--- a/arch/mips/boot/dts/brcm/Makefile
+++ b/arch/mips/boot/dts/brcm/Makefile
@@ -22,7 +22,6 @@ dtb-$(CONFIG_DT_NONE) += \
bcm63268-comtrend-vr-3032u.dtb \
bcm93384wvg.dtb \
bcm93384wvg_viper.dtb \
-   bcm96358nb4ser.dtb \
bcm96368mvwg.dtb \
bcm9ejtagprb.dtb \
bcm97125cbmb.dtb \
-- 
2.7.4



[PATCH 0/2] kbuild: move dtb-y and CONFIG_OF_ALL_DTBS to Kbuild core to fix some issues

2017-11-04 Thread Masahiro Yamada

This series applies on top of my previous cleanup patch:
https://patchwork.kernel.org/patch/10038167/

This series must go to the same branch, so I am sending this to DT ML.



Masahiro Yamada (2):
  MIPS: dts: remove bogus bcm96358nb4ser.dtb from dtb-y entry
  kbuild: handle dtb-y and CONFIG_OF_ALL_DTBS natively in Makefile.lib

 arch/arc/boot/dts/Makefile   |  7 ++-
 arch/arm/boot/dts/Makefile   |  5 --
 arch/arm64/boot/dts/Makefile | 58 ++--
 arch/arm64/boot/dts/actions/Makefile |  3 --
 arch/arm64/boot/dts/al/Makefile  |  3 --
 arch/arm64/boot/dts/allwinner/Makefile   |  3 --
 arch/arm64/boot/dts/altera/Makefile  |  3 --
 arch/arm64/boot/dts/amd/Makefile |  3 --
 arch/arm64/boot/dts/amlogic/Makefile |  3 --
 arch/arm64/boot/dts/apm/Makefile |  3 --
 arch/arm64/boot/dts/arm/Makefile |  3 --
 arch/arm64/boot/dts/broadcom/Makefile|  6 +--
 arch/arm64/boot/dts/broadcom/northstar2/Makefile |  3 --
 arch/arm64/boot/dts/broadcom/stingray/Makefile   |  3 --
 arch/arm64/boot/dts/cavium/Makefile  |  3 --
 arch/arm64/boot/dts/exynos/Makefile  |  3 --
 arch/arm64/boot/dts/freescale/Makefile   |  3 --
 arch/arm64/boot/dts/hisilicon/Makefile   |  3 --
 arch/arm64/boot/dts/lg/Makefile  |  3 --
 arch/arm64/boot/dts/marvell/Makefile |  3 --
 arch/arm64/boot/dts/mediatek/Makefile|  3 --
 arch/arm64/boot/dts/nvidia/Makefile  |  2 -
 arch/arm64/boot/dts/qcom/Makefile|  3 --
 arch/arm64/boot/dts/realtek/Makefile |  3 --
 arch/arm64/boot/dts/renesas/Makefile |  2 -
 arch/arm64/boot/dts/rockchip/Makefile|  3 --
 arch/arm64/boot/dts/socionext/Makefile   |  2 -
 arch/arm64/boot/dts/sprd/Makefile|  3 --
 arch/arm64/boot/dts/xilinx/Makefile  |  3 --
 arch/arm64/boot/dts/zte/Makefile |  3 --
 arch/h8300/boot/dts/Makefile |  5 --
 arch/metag/boot/dts/Makefile |  5 --
 arch/mips/boot/dts/Makefile  | 32 ++---
 arch/mips/boot/dts/brcm/Makefile |  3 --
 arch/mips/boot/dts/cavium-octeon/Makefile|  2 -
 arch/mips/boot/dts/img/Makefile  |  2 -
 arch/mips/boot/dts/ingenic/Makefile  |  2 -
 arch/mips/boot/dts/lantiq/Makefile   |  2 -
 arch/mips/boot/dts/mti/Makefile  |  2 -
 arch/mips/boot/dts/netlogic/Makefile |  2 -
 arch/mips/boot/dts/ni/Makefile   |  2 -
 arch/mips/boot/dts/pic32/Makefile|  2 -
 arch/mips/boot/dts/qca/Makefile  |  2 -
 arch/mips/boot/dts/ralink/Makefile   |  2 -
 arch/mips/boot/dts/xilfpga/Makefile  |  2 -
 arch/xtensa/boot/dts/Makefile|  7 ++-
 scripts/Makefile.dtbinst |  6 +--
 scripts/Makefile.lib |  5 ++
 48 files changed, 53 insertions(+), 183 deletions(-)

-- 
2.7.4



[PATCH 2/2] kbuild: handle dtb-y and CONFIG_OF_ALL_DTBS natively in Makefile.lib

2017-11-04 Thread Masahiro Yamada
If CONFIG_OF_ALL_DTBS is enabled, "make ARCH=arm64 dtbs" compiles each
DTB twice; one from arch/arm64/boot/dts/*/Makefile and the other from
the dtb-$(CONFIG_OF_ALL_DTBS) line in arch/arm64/boot/dts/Makefile.
It could be a race problem when building DTBS in parallel.

Another minor issue is CONFIG_OF_ALL_DTBS covers only *.dts in vendor
sub-directories, so this broke when Broadcom added one more hierarchy
in arch/arm64/boot/dts/broadcom/<soc>/.

One idea to fix the issues in a clean way is to move DTB handling
to Kbuild core scripts.  Makefile.dtbinst already recognizes dtb-y
natively, so it should not hurt to do so.

Add $(dtb-y) to extra-y, and $(dtb-) as well if CONFIG_OF_ALL_DTBS is
enabled.  All the clutter in the Makefiles goes away.

As a bonus clean-up, I also removed dts-dirs.  Just use subdir-y
directly to traverse sub-directories.

Signed-off-by: Masahiro Yamada 
---

 arch/arc/boot/dts/Makefile   |  7 ++-
 arch/arm/boot/dts/Makefile   |  5 --
 arch/arm64/boot/dts/Makefile | 58 ++--
 arch/arm64/boot/dts/actions/Makefile |  3 --
 arch/arm64/boot/dts/al/Makefile  |  3 --
 arch/arm64/boot/dts/allwinner/Makefile   |  3 --
 arch/arm64/boot/dts/altera/Makefile  |  3 --
 arch/arm64/boot/dts/amd/Makefile |  3 --
 arch/arm64/boot/dts/amlogic/Makefile |  3 --
 arch/arm64/boot/dts/apm/Makefile |  3 --
 arch/arm64/boot/dts/arm/Makefile |  3 --
 arch/arm64/boot/dts/broadcom/Makefile|  6 +--
 arch/arm64/boot/dts/broadcom/northstar2/Makefile |  3 --
 arch/arm64/boot/dts/broadcom/stingray/Makefile   |  3 --
 arch/arm64/boot/dts/cavium/Makefile  |  3 --
 arch/arm64/boot/dts/exynos/Makefile  |  3 --
 arch/arm64/boot/dts/freescale/Makefile   |  3 --
 arch/arm64/boot/dts/hisilicon/Makefile   |  3 --
 arch/arm64/boot/dts/lg/Makefile  |  3 --
 arch/arm64/boot/dts/marvell/Makefile |  3 --
 arch/arm64/boot/dts/mediatek/Makefile|  3 --
 arch/arm64/boot/dts/nvidia/Makefile  |  2 -
 arch/arm64/boot/dts/qcom/Makefile|  3 --
 arch/arm64/boot/dts/realtek/Makefile |  3 --
 arch/arm64/boot/dts/renesas/Makefile |  2 -
 arch/arm64/boot/dts/rockchip/Makefile|  3 --
 arch/arm64/boot/dts/socionext/Makefile   |  2 -
 arch/arm64/boot/dts/sprd/Makefile|  3 --
 arch/arm64/boot/dts/xilinx/Makefile  |  3 --
 arch/arm64/boot/dts/zte/Makefile |  3 --
 arch/h8300/boot/dts/Makefile |  5 --
 arch/metag/boot/dts/Makefile |  5 --
 arch/mips/boot/dts/Makefile  | 32 ++---
 arch/mips/boot/dts/brcm/Makefile |  2 -
 arch/mips/boot/dts/cavium-octeon/Makefile|  2 -
 arch/mips/boot/dts/img/Makefile  |  2 -
 arch/mips/boot/dts/ingenic/Makefile  |  2 -
 arch/mips/boot/dts/lantiq/Makefile   |  2 -
 arch/mips/boot/dts/mti/Makefile  |  2 -
 arch/mips/boot/dts/netlogic/Makefile |  2 -
 arch/mips/boot/dts/ni/Makefile   |  2 -
 arch/mips/boot/dts/pic32/Makefile|  2 -
 arch/mips/boot/dts/qca/Makefile  |  2 -
 arch/mips/boot/dts/ralink/Makefile   |  2 -
 arch/mips/boot/dts/xilfpga/Makefile  |  2 -
 arch/xtensa/boot/dts/Makefile|  7 ++-
 scripts/Makefile.dtbinst |  6 +--
 scripts/Makefile.lib |  5 ++
 48 files changed, 53 insertions(+), 182 deletions(-)

diff --git a/arch/arc/boot/dts/Makefile b/arch/arc/boot/dts/Makefile
index 1257db1..9ece28b 100644
--- a/arch/arc/boot/dts/Makefile
+++ b/arch/arc/boot/dts/Makefile
@@ -10,7 +10,6 @@ dtb-y := $(builtindtb-y).dtb
 
 .SECONDARY: $(obj)/$(builtindtb-y).dtb.S
 
-dtstree:= $(srctree)/$(src)
-dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard 
$(dtstree)/*.dts))
-
-always := $(dtb-y)
+# for CONFIG_OF_ALL_DTBS test
+dtstree:= $(srctree)/$(src)
+dtb-   := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard $(dtstree)/*.dts))
diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
index 5eeefbc..4b650d6 100644
--- a/arch/arm/boot/dts/Makefile
+++ b/arch/arm/boot/dts/Makefile
@@ -1069,8 +1069,3 @@ dtb-$(CONFIG_ARCH_ASPEED) += aspeed-bmc-opp-palmetto.dtb \
aspeed-bmc-opp-romulus.dtb \
aspeed-ast2500-evb.dtb
 endif
-
-dtstree:= $(srctree)/$(src)
-dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard 
$(dtstree)/*.dts))
-
-always := $(dtb-y)
diff --git a/arch/arm64/boot/dts/Makefile b/arch/arm64/boot/dts/Makefile
index 8e19512..a7ecb42 100644
--- a/arch/arm64/boot/dts/Makefile
+++ b/arch/arm64/boot/dts/Makefile
@@ -1,33 +1,25 @@

[PATCH 1/2] MIPS: dts: remove bogus bcm96358nb4ser.dtb from dtb-y entry

2017-11-04 Thread Masahiro Yamada
arch/mips/boot/dts/brcm/bcm96358nb4ser.dts does not exist, so
we cannot build bcm96358nb4ser.dtb .

Signed-off-by: Masahiro Yamada 
---

 arch/mips/boot/dts/brcm/Makefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/mips/boot/dts/brcm/Makefile b/arch/mips/boot/dts/brcm/Makefile
index 69a69d1..ad76130 100644
--- a/arch/mips/boot/dts/brcm/Makefile
+++ b/arch/mips/boot/dts/brcm/Makefile
@@ -22,7 +22,6 @@ dtb-$(CONFIG_DT_NONE) += \
bcm63268-comtrend-vr-3032u.dtb \
bcm93384wvg.dtb \
bcm93384wvg_viper.dtb \
-   bcm96358nb4ser.dtb \
bcm96368mvwg.dtb \
bcm9ejtagprb.dtb \
bcm97125cbmb.dtb \
-- 
2.7.4



[PATCH 2/5] tty: serial: jsm: add blank line after declarations

2017-11-04 Thread Gimcuan Hui
This patch fixes checkpatch.pl warning:

Missing a blank line after declarations.

Signed-off-by: Gimcuan Hui 
---
 drivers/tty/serial/jsm/jsm_tty.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/tty/serial/jsm/jsm_tty.c b/drivers/tty/serial/jsm/jsm_tty.c
index 729d2a083a74..ed58dfc3d40e 100644
--- a/drivers/tty/serial/jsm/jsm_tty.c
+++ b/drivers/tty/serial/jsm/jsm_tty.c
@@ -124,6 +124,7 @@ static void jsm_tty_set_mctrl(struct uart_port *port, 
unsigned int mctrl)
 static void jsm_tty_write(struct uart_port *port)
 {
struct jsm_channel *channel;
+
channel = container_of(port, struct jsm_channel, uart_port);
channel->ch_bd->bd_ops->copy_data_from_queue_to_uart(channel);
 }
-- 
2.11.0



[PATCH 2/5] tty: serial: jsm: add blank line after declarations

2017-11-04 Thread Gimcuan Hui
This patch fixes checkpatch.pl warning:

Missing a blank line after declarations.

Signed-off-by: Gimcuan Hui 
---
 drivers/tty/serial/jsm/jsm_tty.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/tty/serial/jsm/jsm_tty.c b/drivers/tty/serial/jsm/jsm_tty.c
index 729d2a083a74..ed58dfc3d40e 100644
--- a/drivers/tty/serial/jsm/jsm_tty.c
+++ b/drivers/tty/serial/jsm/jsm_tty.c
@@ -124,6 +124,7 @@ static void jsm_tty_set_mctrl(struct uart_port *port, 
unsigned int mctrl)
 static void jsm_tty_write(struct uart_port *port)
 {
struct jsm_channel *channel;
+
channel = container_of(port, struct jsm_channel, uart_port);
channel->ch_bd->bd_ops->copy_data_from_queue_to_uart(channel);
 }
-- 
2.11.0



[PATCH 4/5] tty: serial: jsm: fix coding style

2017-11-04 Thread Gimcuan Hui
This patch fixes the checkpatch.pl complaint:

ERROR: else should follow close brace '}'.

Signed-off-by: Gimcuan Hui 
---
 drivers/tty/serial/jsm/jsm_tty.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/tty/serial/jsm/jsm_tty.c b/drivers/tty/serial/jsm/jsm_tty.c
index 53c3f53da241..3853bfa5aa46 100644
--- a/drivers/tty/serial/jsm/jsm_tty.c
+++ b/drivers/tty/serial/jsm/jsm_tty.c
@@ -478,8 +478,7 @@ int jsm_uart_port_init(struct jsm_board *brd)
if (rc) {
printk(KERN_INFO "jsm: Port %d failed. Aborting...\n", 
i);
return rc;
-   }
-   else
+   } else
printk(KERN_INFO "jsm: Port %d added\n", i);
}
 
-- 
2.11.0



[PATCH 5/5] tty: serial: jsm: add space before the open parenthesis '('

2017-11-04 Thread Gimcuan Hui
This patch fixes the checkpatch.pl complaints:

space required before the open parenthesis '('.

Signed-off-by: Gimcuan Hui 
---
 drivers/tty/serial/jsm/jsm_tty.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/tty/serial/jsm/jsm_tty.c b/drivers/tty/serial/jsm/jsm_tty.c
index 3853bfa5aa46..cbbadafe61fb 100644
--- a/drivers/tty/serial/jsm/jsm_tty.c
+++ b/drivers/tty/serial/jsm/jsm_tty.c
@@ -541,7 +541,7 @@ void jsm_input(struct jsm_channel *ch)
tp = port->tty;
 
bd = ch->ch_bd;
-   if(!bd)
+   if (!bd)
return;
 
spin_lock_irqsave(>ch_lock, lock_flags);
@@ -781,7 +781,7 @@ void jsm_check_queue_flow_control(struct jsm_channel *ch)
if (qleft < 256) {
/* HWFLOW */
if (ch->ch_c_cflag & CRTSCTS) {
-   if(!(ch->ch_flags & CH_RECEIVER_OFF)) {
+   if (!(ch->ch_flags & CH_RECEIVER_OFF)) {
bd_ops->disable_receiver(ch);
ch->ch_flags |= (CH_RECEIVER_OFF);
jsm_dbg(READ, >ch_bd->pci_dev,
-- 
2.11.0



[PATCH 4/5] tty: serial: jsm: fix coding style

2017-11-04 Thread Gimcuan Hui
This patch fixes the checkpatch.pl complaint:

ERROR: else should follow close brace '}'.

Signed-off-by: Gimcuan Hui 
---
 drivers/tty/serial/jsm/jsm_tty.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/tty/serial/jsm/jsm_tty.c b/drivers/tty/serial/jsm/jsm_tty.c
index 53c3f53da241..3853bfa5aa46 100644
--- a/drivers/tty/serial/jsm/jsm_tty.c
+++ b/drivers/tty/serial/jsm/jsm_tty.c
@@ -478,8 +478,7 @@ int jsm_uart_port_init(struct jsm_board *brd)
if (rc) {
printk(KERN_INFO "jsm: Port %d failed. Aborting...\n", 
i);
return rc;
-   }
-   else
+   } else
printk(KERN_INFO "jsm: Port %d added\n", i);
}
 
-- 
2.11.0



[PATCH 5/5] tty: serial: jsm: add space before the open parenthesis '('

2017-11-04 Thread Gimcuan Hui
This patch fixes the checkpatch.pl complaints:

space required before the open parenthesis '('.

Signed-off-by: Gimcuan Hui 
---
 drivers/tty/serial/jsm/jsm_tty.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/tty/serial/jsm/jsm_tty.c b/drivers/tty/serial/jsm/jsm_tty.c
index 3853bfa5aa46..cbbadafe61fb 100644
--- a/drivers/tty/serial/jsm/jsm_tty.c
+++ b/drivers/tty/serial/jsm/jsm_tty.c
@@ -541,7 +541,7 @@ void jsm_input(struct jsm_channel *ch)
tp = port->tty;
 
bd = ch->ch_bd;
-   if(!bd)
+   if (!bd)
return;
 
spin_lock_irqsave(>ch_lock, lock_flags);
@@ -781,7 +781,7 @@ void jsm_check_queue_flow_control(struct jsm_channel *ch)
if (qleft < 256) {
/* HWFLOW */
if (ch->ch_c_cflag & CRTSCTS) {
-   if(!(ch->ch_flags & CH_RECEIVER_OFF)) {
+   if (!(ch->ch_flags & CH_RECEIVER_OFF)) {
bd_ops->disable_receiver(ch);
ch->ch_flags |= (CH_RECEIVER_OFF);
jsm_dbg(READ, >ch_bd->pci_dev,
-- 
2.11.0



[PATCH 3/5] tty: serial: jsm: delete space between function name and '('

2017-11-04 Thread Gimcuan Hui
This patch fixes checkpatch.pl warning:

space prohibited between function name and open parenthesis '('.

Signed-off-by: Gimcuan Hui 
---
 drivers/tty/serial/jsm/jsm_tty.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/tty/serial/jsm/jsm_tty.c b/drivers/tty/serial/jsm/jsm_tty.c
index ed58dfc3d40e..53c3f53da241 100644
--- a/drivers/tty/serial/jsm/jsm_tty.c
+++ b/drivers/tty/serial/jsm/jsm_tty.c
@@ -474,7 +474,7 @@ int jsm_uart_port_init(struct jsm_board *brd)
} else
set_bit(line, linemap);
brd->channels[i]->uart_port.line = line;
-   rc = uart_add_one_port (_uart_driver, 
>channels[i]->uart_port);
+   rc = uart_add_one_port(_uart_driver, 
>channels[i]->uart_port);
if (rc) {
printk(KERN_INFO "jsm: Port %d failed. Aborting...\n", 
i);
return rc;
-- 
2.11.0



[PATCH 0/5] TTY/JSM coding style fixes

2017-11-04 Thread Gimcuan Hui
This patch set fixes several warnings and errors reported
by checkpatch.pl.

Gimcuan Hui (5):
  tty: serial: jsm: change the type of local variable
  tty: serial: jsm: add blank line after declarations
  tty: serial: jsm: delete space between function name and '('
  tty: serial: jsm: fix coding style
  tty: serial: jsm: add space before the open parenthesis '('

 drivers/tty/serial/jsm/jsm_tty.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

-- 
2.11.0



[PATCH 1/5] tty: serial: jsm: change the type of local variable

2017-11-04 Thread Gimcuan Hui
The return type of jsm_get_mstat is int, so the local variable result,
which holds the return value, should be int as well. Make the change.

This patch fixes the checkpatch.pl warning:
Prefer 'unsigned int' to bare use of 'unsigned'.

Signed-off-by: Gimcuan Hui 
---
 drivers/tty/serial/jsm/jsm_tty.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/tty/serial/jsm/jsm_tty.c b/drivers/tty/serial/jsm/jsm_tty.c
index e69227cc3827..729d2a083a74 100644
--- a/drivers/tty/serial/jsm/jsm_tty.c
+++ b/drivers/tty/serial/jsm/jsm_tty.c
@@ -36,7 +36,7 @@ static void jsm_carrier(struct jsm_channel *ch);
 static inline int jsm_get_mstat(struct jsm_channel *ch)
 {
unsigned char mstat;
-   unsigned result;
+   int result;
 
jsm_dbg(IOCTL, >ch_bd->pci_dev, "start\n");
 
-- 
2.11.0



[PATCH 3/5] tty: serial: jsm: delete space between function name and '('

2017-11-04 Thread Gimcuan Hui
This patch fixes checkpatch.pl warning:

space prohibited between function name and open parenthesis '('.

Signed-off-by: Gimcuan Hui 
---
 drivers/tty/serial/jsm/jsm_tty.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/tty/serial/jsm/jsm_tty.c b/drivers/tty/serial/jsm/jsm_tty.c
index ed58dfc3d40e..53c3f53da241 100644
--- a/drivers/tty/serial/jsm/jsm_tty.c
+++ b/drivers/tty/serial/jsm/jsm_tty.c
@@ -474,7 +474,7 @@ int jsm_uart_port_init(struct jsm_board *brd)
} else
set_bit(line, linemap);
brd->channels[i]->uart_port.line = line;
-   rc = uart_add_one_port (_uart_driver, 
>channels[i]->uart_port);
+   rc = uart_add_one_port(_uart_driver, 
>channels[i]->uart_port);
if (rc) {
printk(KERN_INFO "jsm: Port %d failed. Aborting...\n", 
i);
return rc;
-- 
2.11.0



[PATCH 0/5] TTY/JSM coding style fixes

2017-11-04 Thread Gimcuan Hui
This patch set fixes several warnings and errors reported
by checkpatch.pl.

Gimcuan Hui (5):
  tty: serial: jsm: change the type of local variable
  tty: serial: jsm: add blank line after declarations
  tty: serial: jsm: delete space between function name and '('
  tty: serial: jsm: fix coding style
  tty: serial: jsm: add space before the open parenthesis '('

 drivers/tty/serial/jsm/jsm_tty.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

-- 
2.11.0



[PATCH 1/5] tty: serial: jsm: change the type of local variable

2017-11-04 Thread Gimcuan Hui
The return type of jsm_get_mstat is int, so the local variable result,
which holds the return value, should be int as well. Make the change.

This patch fixes the checkpatch.pl warning:
Prefer 'unsigned int' to bare use of 'unsigned'.

Signed-off-by: Gimcuan Hui 
---
 drivers/tty/serial/jsm/jsm_tty.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/tty/serial/jsm/jsm_tty.c b/drivers/tty/serial/jsm/jsm_tty.c
index e69227cc3827..729d2a083a74 100644
--- a/drivers/tty/serial/jsm/jsm_tty.c
+++ b/drivers/tty/serial/jsm/jsm_tty.c
@@ -36,7 +36,7 @@ static void jsm_carrier(struct jsm_channel *ch);
 static inline int jsm_get_mstat(struct jsm_channel *ch)
 {
unsigned char mstat;
-   unsigned result;
+   int result;
 
jsm_dbg(IOCTL, >ch_bd->pci_dev, "start\n");
 
-- 
2.11.0



[PATCH] thunderbolt: tb: fix use after free in tb_activate_pcie_devices

2017-11-04 Thread Gustavo A. R. Silva
Add a continue statement in order to avoid using a previously
freed pointer tunnel in list_add.

Addresses-Coverity-ID: 1415336
Fixes: 9d3cce0b6136 ("thunderbolt: Introduce thunderbolt bus and connection 
manager")
Signed-off-by: Gustavo A. R. Silva 
---
 drivers/thunderbolt/tb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c
index d674e06..1424581 100644
--- a/drivers/thunderbolt/tb.c
+++ b/drivers/thunderbolt/tb.c
@@ -225,6 +225,7 @@ static void tb_activate_pcie_devices(struct tb *tb)
tb_port_info(up_port,
 "PCIe tunnel activation failed, 
aborting\n");
tb_pci_free(tunnel);
+   continue;
}
 
list_add(>list, >tunnel_list);
-- 
2.7.4



[PATCH] thunderbolt: tb: fix use after free in tb_activate_pcie_devices

2017-11-04 Thread Gustavo A. R. Silva
Add a continue statement in order to avoid using a previously
freed pointer tunnel in list_add.

Addresses-Coverity-ID: 1415336
Fixes: 9d3cce0b6136 ("thunderbolt: Introduce thunderbolt bus and connection 
manager")
Signed-off-by: Gustavo A. R. Silva 
---
 drivers/thunderbolt/tb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c
index d674e06..1424581 100644
--- a/drivers/thunderbolt/tb.c
+++ b/drivers/thunderbolt/tb.c
@@ -225,6 +225,7 @@ static void tb_activate_pcie_devices(struct tb *tb)
tb_port_info(up_port,
 "PCIe tunnel activation failed, 
aborting\n");
tb_pci_free(tunnel);
+   continue;
}
 
list_add(>list, >tunnel_list);
-- 
2.7.4



Re: [PATCH] net/mlx5e/core/en_fs: fix pointer dereference after free in mlx5e_execute_l2_action

2017-11-04 Thread Saeed Mahameed
On Sat, Nov 4, 2017 at 8:54 PM, Gustavo A. R. Silva
 wrote:
> hn is being kfree'd in mlx5e_del_l2_from_hash and then dereferenced
> by accessing hn->ai.addr
>
> Fix this by copying the MAC address into a local variable for its safe use
> in all possible execution paths within function mlx5e_execute_l2_action.
>
> Addresses-Coverity-ID: 1417789
> Fixes: eeb66cdb6826 ("net/mlx5: Separate between E-Switch and MPFS")
> Signed-off-by: Gustavo A. R. Silva 

Acked-by: Saeed Mahameed 

Looks good.
Thank you Gustavo.

> ---
>  drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 13 -
>  1 file changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c 
> b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
> index 850cdc9..4837045 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
> @@ -365,21 +365,24 @@ static void mlx5e_execute_l2_action(struct mlx5e_priv 
> *priv,
> struct mlx5e_l2_hash_node *hn)
>  {
> u8 action = hn->action;
> +   u8 mac_addr[ETH_ALEN];
> int l2_err = 0;
>
> +   ether_addr_copy(mac_addr, hn->ai.addr);
> +
> switch (action) {
> case MLX5E_ACTION_ADD:
> mlx5e_add_l2_flow_rule(priv, >ai, MLX5E_FULLMATCH);
> -   if (!is_multicast_ether_addr(hn->ai.addr)) {
> -   l2_err = mlx5_mpfs_add_mac(priv->mdev, hn->ai.addr);
> +   if (!is_multicast_ether_addr(mac_addr)) {
> +   l2_err = mlx5_mpfs_add_mac(priv->mdev, mac_addr);
> hn->mpfs = !l2_err;
> }
> hn->action = MLX5E_ACTION_NONE;
> break;
>
> case MLX5E_ACTION_DEL:
> -   if (!is_multicast_ether_addr(hn->ai.addr) && hn->mpfs)
> -   l2_err = mlx5_mpfs_del_mac(priv->mdev, hn->ai.addr);
> +   if (!is_multicast_ether_addr(mac_addr) && hn->mpfs)
> +   l2_err = mlx5_mpfs_del_mac(priv->mdev, mac_addr);
> mlx5e_del_l2_flow_rule(priv, >ai);
> mlx5e_del_l2_from_hash(hn);
> break;
> @@ -387,7 +390,7 @@ static void mlx5e_execute_l2_action(struct mlx5e_priv 
> *priv,
>
> if (l2_err)
> netdev_warn(priv->netdev, "MPFS, failed to %s mac %pM, 
> err(%d)\n",
> -   action == MLX5E_ACTION_ADD ? "add" : "del", 
> hn->ai.addr, l2_err);
> +   action == MLX5E_ACTION_ADD ? "add" : "del", 
> mac_addr, l2_err);
>  }
>
>  static void mlx5e_sync_netdev_addr(struct mlx5e_priv *priv)
> --
> 2.7.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] net/mlx5e/core/en_fs: fix pointer dereference after free in mlx5e_execute_l2_action

2017-11-04 Thread Saeed Mahameed
On Sat, Nov 4, 2017 at 8:54 PM, Gustavo A. R. Silva
 wrote:
> hn is being kfree'd in mlx5e_del_l2_from_hash and then dereferenced
> by accessing hn->ai.addr
>
> Fix this by copying the MAC address into a local variable for its safe use
> in all possible execution paths within function mlx5e_execute_l2_action.
>
> Addresses-Coverity-ID: 1417789
> Fixes: eeb66cdb6826 ("net/mlx5: Separate between E-Switch and MPFS")
> Signed-off-by: Gustavo A. R. Silva 

Acked-by: Saeed Mahameed 

Looks good.
Thank you Gustavo.

> ---
>  drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 13 -
>  1 file changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c 
> b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
> index 850cdc9..4837045 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
> @@ -365,21 +365,24 @@ static void mlx5e_execute_l2_action(struct mlx5e_priv 
> *priv,
> struct mlx5e_l2_hash_node *hn)
>  {
> u8 action = hn->action;
> +   u8 mac_addr[ETH_ALEN];
> int l2_err = 0;
>
> +   ether_addr_copy(mac_addr, hn->ai.addr);
> +
> switch (action) {
> case MLX5E_ACTION_ADD:
> mlx5e_add_l2_flow_rule(priv, >ai, MLX5E_FULLMATCH);
> -   if (!is_multicast_ether_addr(hn->ai.addr)) {
> -   l2_err = mlx5_mpfs_add_mac(priv->mdev, hn->ai.addr);
> +   if (!is_multicast_ether_addr(mac_addr)) {
> +   l2_err = mlx5_mpfs_add_mac(priv->mdev, mac_addr);
> hn->mpfs = !l2_err;
> }
> hn->action = MLX5E_ACTION_NONE;
> break;
>
> case MLX5E_ACTION_DEL:
> -   if (!is_multicast_ether_addr(hn->ai.addr) && hn->mpfs)
> -   l2_err = mlx5_mpfs_del_mac(priv->mdev, hn->ai.addr);
> +   if (!is_multicast_ether_addr(mac_addr) && hn->mpfs)
> +   l2_err = mlx5_mpfs_del_mac(priv->mdev, mac_addr);
> mlx5e_del_l2_flow_rule(priv, >ai);
> mlx5e_del_l2_from_hash(hn);
> break;
> @@ -387,7 +390,7 @@ static void mlx5e_execute_l2_action(struct mlx5e_priv 
> *priv,
>
> if (l2_err)
> netdev_warn(priv->netdev, "MPFS, failed to %s mac %pM, 
> err(%d)\n",
> -   action == MLX5E_ACTION_ADD ? "add" : "del", 
> hn->ai.addr, l2_err);
> +   action == MLX5E_ACTION_ADD ? "add" : "del", 
> mac_addr, l2_err);
>  }
>
>  static void mlx5e_sync_netdev_addr(struct mlx5e_priv *priv)
> --
> 2.7.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4] tty: serial: meson: allow baud-rates lower than 9600

2017-11-04 Thread Thomas Rohloff

Devices like DCF77 receivers need the baud-rate to be as low as 50.

I have tested this on a Meson GXL device with uart_A.

Cc: Greg Kroah-Hartman 
Cc: Jiri Slaby 
Cc: Carlo Caione 
Cc: Kevin Hilman 
Cc: linux-amlo...@lists.infradead.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Thomas Rohloff 
---
drivers/tty/serial/meson_uart.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/tty/serial/meson_uart.c 
b/drivers/tty/serial/meson_uart.c

index 07c0f98be3ac..e281ce5d101d 100644
--- a/drivers/tty/serial/meson_uart.c
+++ b/drivers/tty/serial/meson_uart.c
@@ -362,7 +362,7 @@ static void meson_uart_set_termios(struct uart_port 
*port,


writel(val, port->membase + AML_UART_CONTROL);

-   baud = uart_get_baud_rate(port, termios, old, 9600, 400);
+   baud = uart_get_baud_rate(port, termios, old, 50, 400);
meson_uart_change_speed(port, baud);

port->read_status_mask = AML_UART_TX_FIFO_WERR;
--
2.13.6




[PATCH v4] tty: serial: meson: allow baud-rates lower than 9600

2017-11-04 Thread Thomas Rohloff

Devices like DCF77 receivers need the baud-rate to be as low as 50.

I have tested this on a Meson GXL device with uart_A.

Cc: Greg Kroah-Hartman 
Cc: Jiri Slaby 
Cc: Carlo Caione 
Cc: Kevin Hilman 
Cc: linux-amlo...@lists.infradead.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Thomas Rohloff 
---
drivers/tty/serial/meson_uart.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/tty/serial/meson_uart.c 
b/drivers/tty/serial/meson_uart.c

index 07c0f98be3ac..e281ce5d101d 100644
--- a/drivers/tty/serial/meson_uart.c
+++ b/drivers/tty/serial/meson_uart.c
@@ -362,7 +362,7 @@ static void meson_uart_set_termios(struct uart_port 
*port,


writel(val, port->membase + AML_UART_CONTROL);

-   baud = uart_get_baud_rate(port, termios, old, 9600, 400);
+   baud = uart_get_baud_rate(port, termios, old, 50, 400);
meson_uart_change_speed(port, baud);

port->read_status_mask = AML_UART_TX_FIFO_WERR;
--
2.13.6




Re: [PATCH 3.10 000/139] 3.10.108-stable review

2017-11-04 Thread Levin, Alexander (Sasha Levin)
On Thu, Nov 02, 2017 at 07:12:00AM +0100, Willy Tarreau wrote:
>Hi Guenter,
>
>On Wed, Nov 01, 2017 at 06:21:13PM -0700, Guenter Roeck wrote:
>> drivers/s390/scsi/zfcp_scsi.c: In function 'zfcp_task_mgmt_function':
>> drivers/s390/scsi/zfcp_scsi.c:298:4: error: too many arguments to function 
>> 'zfcp_dbf_scsi_devreset'
>> drivers/s390/scsi/zfcp_dbf.h:408:6: note: declared here
>> drivers/s390/scsi/zfcp_scsi.c:309:3: error: too many arguments to function 
>> 'zfcp_dbf_scsi_devreset'
>> drivers/s390/scsi/zfcp_dbf.h:408:6: note: declared here
>
>Ah, I was too happy all of them applied well :-)
>
>I'm seeing that Greg dropped ",NULL" in 3.18 and that other call
>places only have the first 3 args. Now fixed.

Same for 4.1. Thanks!

-- 

Thanks,
Sasha

Re: [PATCH 3.10 000/139] 3.10.108-stable review

2017-11-04 Thread Levin, Alexander (Sasha Levin)
On Thu, Nov 02, 2017 at 07:12:00AM +0100, Willy Tarreau wrote:
>Hi Guenter,
>
>On Wed, Nov 01, 2017 at 06:21:13PM -0700, Guenter Roeck wrote:
>> drivers/s390/scsi/zfcp_scsi.c: In function 'zfcp_task_mgmt_function':
>> drivers/s390/scsi/zfcp_scsi.c:298:4: error: too many arguments to function 
>> 'zfcp_dbf_scsi_devreset'
>> drivers/s390/scsi/zfcp_dbf.h:408:6: note: declared here
>> drivers/s390/scsi/zfcp_scsi.c:309:3: error: too many arguments to function 
>> 'zfcp_dbf_scsi_devreset'
>> drivers/s390/scsi/zfcp_dbf.h:408:6: note: declared here
>
>Ah, I was too happy all of them applied well :-)
>
>I'm seeing that Greg dropped ",NULL" in 3.18 and that other call
>places only have the first 3 args. Now fixed.

Same for 4.1. Thanks!

-- 

Thanks,
Sasha

Re: [PATCH v3] tty: serial: meson: allow baud-rates lower than 9600

2017-11-04 Thread V10lator

Am Samstag, 4. November 2017 14:13:52 CET schrieb Greg Kroah-Hartman:

What changed from v2?


Nothing. I just re-based the patch on a fresh git clone as you couldn't 
merge v2.



And I need a real name as the author of the patch :(


Damn, sorry about that. Will do a v4.


Re: [PATCH v3] tty: serial: meson: allow baud-rates lower than 9600

2017-11-04 Thread V10lator

Am Samstag, 4. November 2017 14:13:52 CET schrieb Greg Kroah-Hartman:

What changed from v2?


Nothing. I just re-based the patch on a fresh git clone as you couldn't 
merge v2.



And I need a real name as the author of the patch :(


Damn, sorry about that. Will do a v4.


Re: [PATCH v3] scsi: require CAP_SYS_ADMIN to write to procfs interface

2017-11-04 Thread Aleksa Sarai

On 11/05/2017 01:56 PM, Aleksa Sarai wrote:

Previously, the only capability effectively required to operate on the
/proc/scsi interface was CAP_DAC_OVERRIDE (or for some other files,
having an fsuid of GLOBAL_ROOT_UID was enough). This means that
semi-privileged processes could interfere with core components of a
system (such as causing a DoS by removing the underlying SCSI device of
the host's / mount).


An alternative to this patch would be to make the open(2) call fail, if 
you try to open it write-only or read-write. Not sure which would be 
preferred (should it be possible to pass /proc/scsi/scsi to a 
semi-privileged process to write to?).


--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
https://www.cyphar.com/


Re: [PATCH v3] scsi: require CAP_SYS_ADMIN to write to procfs interface

2017-11-04 Thread Aleksa Sarai

On 11/05/2017 01:56 PM, Aleksa Sarai wrote:

Previously, the only capability effectively required to operate on the
/proc/scsi interface was CAP_DAC_OVERRIDE (or for some other files,
having an fsuid of GLOBAL_ROOT_UID was enough). This means that
semi-privileged processes could interfere with core components of a
system (such as causing a DoS by removing the underlying SCSI device of
the host's / mount).


An alternative to this patch would be to make the open(2) call fail, if 
you try to open it write-only or read-write. Not sure which would be 
preferred (should it be possible to pass /proc/scsi/scsi to a 
semi-privileged process to write to?).


--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
https://www.cyphar.com/


[PATCH] net/mlx5e/core/en_fs: fix pointer dereference after free in mlx5e_execute_l2_action

2017-11-04 Thread Gustavo A. R. Silva
hn is being kfree'd in mlx5e_del_l2_from_hash and then dereferenced
by accessing hn->ai.addr

Fix this by copying the MAC address into a local variable for its safe use
in all possible execution paths within function mlx5e_execute_l2_action.

Addresses-Coverity-ID: 1417789
Fixes: eeb66cdb6826 ("net/mlx5: Separate between E-Switch and MPFS")
Signed-off-by: Gustavo A. R. Silva 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
index 850cdc9..4837045 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
@@ -365,21 +365,24 @@ static void mlx5e_execute_l2_action(struct mlx5e_priv 
*priv,
struct mlx5e_l2_hash_node *hn)
 {
u8 action = hn->action;
+   u8 mac_addr[ETH_ALEN];
int l2_err = 0;
 
+   ether_addr_copy(mac_addr, hn->ai.addr);
+
switch (action) {
case MLX5E_ACTION_ADD:
mlx5e_add_l2_flow_rule(priv, >ai, MLX5E_FULLMATCH);
-   if (!is_multicast_ether_addr(hn->ai.addr)) {
-   l2_err = mlx5_mpfs_add_mac(priv->mdev, hn->ai.addr);
+   if (!is_multicast_ether_addr(mac_addr)) {
+   l2_err = mlx5_mpfs_add_mac(priv->mdev, mac_addr);
hn->mpfs = !l2_err;
}
hn->action = MLX5E_ACTION_NONE;
break;
 
case MLX5E_ACTION_DEL:
-   if (!is_multicast_ether_addr(hn->ai.addr) && hn->mpfs)
-   l2_err = mlx5_mpfs_del_mac(priv->mdev, hn->ai.addr);
+   if (!is_multicast_ether_addr(mac_addr) && hn->mpfs)
+   l2_err = mlx5_mpfs_del_mac(priv->mdev, mac_addr);
mlx5e_del_l2_flow_rule(priv, >ai);
mlx5e_del_l2_from_hash(hn);
break;
@@ -387,7 +390,7 @@ static void mlx5e_execute_l2_action(struct mlx5e_priv *priv,
 
if (l2_err)
netdev_warn(priv->netdev, "MPFS, failed to %s mac %pM, 
err(%d)\n",
-   action == MLX5E_ACTION_ADD ? "add" : "del", 
hn->ai.addr, l2_err);
+   action == MLX5E_ACTION_ADD ? "add" : "del", 
mac_addr, l2_err);
 }
 
 static void mlx5e_sync_netdev_addr(struct mlx5e_priv *priv)
-- 
2.7.4



[PATCH] net/mlx5e/core/en_fs: fix pointer dereference after free in mlx5e_execute_l2_action

2017-11-04 Thread Gustavo A. R. Silva
hn is being kfree'd in mlx5e_del_l2_from_hash and then dereferenced
by accessing hn->ai.addr

Fix this by copying the MAC address into a local variable for its safe use
in all possible execution paths within function mlx5e_execute_l2_action.

Addresses-Coverity-ID: 1417789
Fixes: eeb66cdb6826 ("net/mlx5: Separate between E-Switch and MPFS")
Signed-off-by: Gustavo A. R. Silva 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
index 850cdc9..4837045 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
@@ -365,21 +365,24 @@ static void mlx5e_execute_l2_action(struct mlx5e_priv 
*priv,
struct mlx5e_l2_hash_node *hn)
 {
u8 action = hn->action;
+   u8 mac_addr[ETH_ALEN];
int l2_err = 0;
 
+   ether_addr_copy(mac_addr, hn->ai.addr);
+
switch (action) {
case MLX5E_ACTION_ADD:
mlx5e_add_l2_flow_rule(priv, >ai, MLX5E_FULLMATCH);
-   if (!is_multicast_ether_addr(hn->ai.addr)) {
-   l2_err = mlx5_mpfs_add_mac(priv->mdev, hn->ai.addr);
+   if (!is_multicast_ether_addr(mac_addr)) {
+   l2_err = mlx5_mpfs_add_mac(priv->mdev, mac_addr);
hn->mpfs = !l2_err;
}
hn->action = MLX5E_ACTION_NONE;
break;
 
case MLX5E_ACTION_DEL:
-   if (!is_multicast_ether_addr(hn->ai.addr) && hn->mpfs)
-   l2_err = mlx5_mpfs_del_mac(priv->mdev, hn->ai.addr);
+   if (!is_multicast_ether_addr(mac_addr) && hn->mpfs)
+   l2_err = mlx5_mpfs_del_mac(priv->mdev, mac_addr);
mlx5e_del_l2_flow_rule(priv, >ai);
mlx5e_del_l2_from_hash(hn);
break;
@@ -387,7 +390,7 @@ static void mlx5e_execute_l2_action(struct mlx5e_priv *priv,
 
if (l2_err)
netdev_warn(priv->netdev, "MPFS, failed to %s mac %pM, 
err(%d)\n",
-   action == MLX5E_ACTION_ADD ? "add" : "del", 
hn->ai.addr, l2_err);
+   action == MLX5E_ACTION_ADD ? "add" : "del", 
mac_addr, l2_err);
 }
 
 static void mlx5e_sync_netdev_addr(struct mlx5e_priv *priv)
-- 
2.7.4



Re: [f2fs-dev] [PATCH 2/2] f2fs: stop all the operations by cp_error flag

2017-11-04 Thread Chao Yu
Hi Jaegeuk,

On 2017/10/24 17:51, Chao Yu wrote:
> On 2017/10/24 6:14, Jaegeuk Kim wrote:
>> This patch replaces to use cp_error flag instead of RDONLY for quota off.

We should convert the error number with block_page_mkwrite_return in
.page_mkwrite; otherwise generic/019 will cause a deadlock issue, with the
kernel message below printed:


WARNING: possible recursive locking detected
4.14.0-rc1 #35 Tainted: GW  O

fio/5845 is trying to acquire lock:
 (>mmap_sem){}, at: [] __do_page_fault+0x482/0x510

but task is already holding lock:
 (>mmap_sem){}, at: [] __do_page_fault+0x11e/0x510

other info that might help us debug this:
 Possible unsafe locking scenario:

   CPU0
   
  lock(>mmap_sem);
  lock(>mmap_sem);

 *** DEADLOCK ***

 May be due to missing lock nesting notation

1 lock held by fio/5845:
 #0:  (>mmap_sem){}, at: [] __do_page_fault+0x11e/0x510

stack backtrace:
CPU: 3 PID: 5845 Comm: fio Tainted: GW  O4.14.0-rc1 #35
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
Call Trace:
 dump_stack+0x5f/0x92
 __lock_acquire+0x1019/0x12c0
 lock_acquire+0xae/0x220
 down_read+0x38/0x60
 __do_page_fault+0x482/0x510
 do_page_fault+0x26/0x290
 common_exception+0x64/0x6a


---
 fs/f2fs/file.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index e67f03546391..0ce1e82591d1 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -53,8 +53,10 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
struct dnode_of_data dn;
int err;

-   if (unlikely(f2fs_cp_error(sbi)))
-   return -EIO;
+   if (unlikely(f2fs_cp_error(sbi))) {
+   err = -EIO;
+   goto out;
+   }

sb_start_pagefault(inode->i_sb);

@@ -66,7 +68,7 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
err = f2fs_reserve_block(, page->index);
if (err) {
f2fs_unlock_op(sbi);
-   goto out;
+   goto out_end;
}
f2fs_put_dnode();
f2fs_unlock_op(sbi);
@@ -114,9 +116,10 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)

 out_sem:
up_read(_I(inode)->i_mmap_sem);
-out:
+out_end:
sb_end_pagefault(inode->i_sb);
f2fs_update_time(sbi, REQ_TIME);
+out:
return block_page_mkwrite_return(err);
 }

-- 


Thanks,

>>
>> Signed-off-by: Jaegeuk Kim 
> 
> Reviewed-by: Chao Yu 
> 
> Thanks,
> 
>> ---
>>  fs/f2fs/acl.c|  3 +++
>>  fs/f2fs/checkpoint.c |  1 -
>>  fs/f2fs/file.c   | 23 +++
>>  fs/f2fs/namei.c  | 30 ++
>>  fs/f2fs/super.c  |  3 +++
>>  5 files changed, 59 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
>> index f6471f9d707e..a9bf5151e7c2 100644
>> --- a/fs/f2fs/acl.c
>> +++ b/fs/f2fs/acl.c
>> @@ -254,6 +254,9 @@ static int __f2fs_set_acl(struct inode *inode, int type,
>>  
>>  int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>>  {
>> +if (unlikely(f2fs_cp_error(F2FS_I_SB(inode
>> +return -EIO;
>> +
>>  return __f2fs_set_acl(inode, type, acl, NULL);
>>  }
>>  
>> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
>> index 201608281681..6b52d4b66c7b 100644
>> --- a/fs/f2fs/checkpoint.c
>> +++ b/fs/f2fs/checkpoint.c
>> @@ -29,7 +29,6 @@ struct kmem_cache *inode_entry_slab;
>>  void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io)
>>  {
>>  set_ckpt_flags(sbi, CP_ERROR_FLAG);
>> -sbi->sb->s_flags |= MS_RDONLY;
>>  if (!end_io)
>>  f2fs_flush_merged_writes(sbi);
>>  }
>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>> index 56232a72d2a3..0e09b9f02dc5 100644
>> --- a/fs/f2fs/file.c
>> +++ b/fs/f2fs/file.c
>> @@ -53,6 +53,9 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>>  struct dnode_of_data dn;
>>  int err;
>>  
>> +if (unlikely(f2fs_cp_error(sbi)))
>> +return -EIO;
>> +
>>  sb_start_pagefault(inode->i_sb);
>>  
>>  f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
>> @@ -310,6 +313,8 @@ static int f2fs_do_sync_file(struct file *file, loff_t 
>> start, loff_t end,
>>  
>>  int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int 
>> datasync)
>>  {
>> +if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file)
>> +return -EIO;
>>  return f2fs_do_sync_file(file, start, end, datasync, false);
>>  }
>>  
>> @@ -446,6 +451,9 @@ static int f2fs_file_mmap(struct file *file, struct 
>> vm_area_struct *vma)
>>  struct inode *inode = file_inode(file);
>>  int err;
>>  
>> +if (unlikely(f2fs_cp_error(F2FS_I_SB(inode
>> +return -EIO;
>> +
>>  /* we don't need to use inline_data strictly */
>>  err = f2fs_convert_inline_inode(inode);
>>  if (err)
>> 

Re: [f2fs-dev] [PATCH 2/2] f2fs: stop all the operations by cp_error flag

2017-11-04 Thread Chao Yu
Hi Jaegeuk,

On 2017/10/24 17:51, Chao Yu wrote:
> On 2017/10/24 6:14, Jaegeuk Kim wrote:
>> This patch replaces to use cp_error flag instead of RDONLY for quota off.

We should convert the error number with block_page_mkwrite_return in
.page_mkwrite; otherwise generic/019 will cause a deadlock issue, with the
kernel message below printed:


WARNING: possible recursive locking detected
4.14.0-rc1 #35 Tainted: GW  O

fio/5845 is trying to acquire lock:
 (>mmap_sem){}, at: [] __do_page_fault+0x482/0x510

but task is already holding lock:
 (>mmap_sem){}, at: [] __do_page_fault+0x11e/0x510

other info that might help us debug this:
 Possible unsafe locking scenario:

   CPU0
   
  lock(>mmap_sem);
  lock(>mmap_sem);

 *** DEADLOCK ***

 May be due to missing lock nesting notation

1 lock held by fio/5845:
 #0:  (>mmap_sem){}, at: [] __do_page_fault+0x11e/0x510

stack backtrace:
CPU: 3 PID: 5845 Comm: fio Tainted: GW  O4.14.0-rc1 #35
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
Call Trace:
 dump_stack+0x5f/0x92
 __lock_acquire+0x1019/0x12c0
 lock_acquire+0xae/0x220
 down_read+0x38/0x60
 __do_page_fault+0x482/0x510
 do_page_fault+0x26/0x290
 common_exception+0x64/0x6a


---
 fs/f2fs/file.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index e67f03546391..0ce1e82591d1 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -53,8 +53,10 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
struct dnode_of_data dn;
int err;

-   if (unlikely(f2fs_cp_error(sbi)))
-   return -EIO;
+   if (unlikely(f2fs_cp_error(sbi))) {
+   err = -EIO;
+   goto out;
+   }

sb_start_pagefault(inode->i_sb);

@@ -66,7 +68,7 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
err = f2fs_reserve_block(, page->index);
if (err) {
f2fs_unlock_op(sbi);
-   goto out;
+   goto out_end;
}
f2fs_put_dnode();
f2fs_unlock_op(sbi);
@@ -114,9 +116,10 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)

 out_sem:
up_read(_I(inode)->i_mmap_sem);
-out:
+out_end:
sb_end_pagefault(inode->i_sb);
f2fs_update_time(sbi, REQ_TIME);
+out:
return block_page_mkwrite_return(err);
 }

-- 


Thanks,

>>
>> Signed-off-by: Jaegeuk Kim 
> 
> Reviewed-by: Chao Yu 
> 
> Thanks,
> 
>> ---
>>  fs/f2fs/acl.c|  3 +++
>>  fs/f2fs/checkpoint.c |  1 -
>>  fs/f2fs/file.c   | 23 +++
>>  fs/f2fs/namei.c  | 30 ++
>>  fs/f2fs/super.c  |  3 +++
>>  5 files changed, 59 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
>> index f6471f9d707e..a9bf5151e7c2 100644
>> --- a/fs/f2fs/acl.c
>> +++ b/fs/f2fs/acl.c
>> @@ -254,6 +254,9 @@ static int __f2fs_set_acl(struct inode *inode, int type,
>>  
>>  int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>>  {
>> +if (unlikely(f2fs_cp_error(F2FS_I_SB(inode
>> +return -EIO;
>> +
>>  return __f2fs_set_acl(inode, type, acl, NULL);
>>  }
>>  
>> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
>> index 201608281681..6b52d4b66c7b 100644
>> --- a/fs/f2fs/checkpoint.c
>> +++ b/fs/f2fs/checkpoint.c
>> @@ -29,7 +29,6 @@ struct kmem_cache *inode_entry_slab;
>>  void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io)
>>  {
>>  set_ckpt_flags(sbi, CP_ERROR_FLAG);
>> -sbi->sb->s_flags |= MS_RDONLY;
>>  if (!end_io)
>>  f2fs_flush_merged_writes(sbi);
>>  }
>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>> index 56232a72d2a3..0e09b9f02dc5 100644
>> --- a/fs/f2fs/file.c
>> +++ b/fs/f2fs/file.c
>> @@ -53,6 +53,9 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
>>  struct dnode_of_data dn;
>>  int err;
>>  
>> +if (unlikely(f2fs_cp_error(sbi)))
>> +return -EIO;
>> +
>>  sb_start_pagefault(inode->i_sb);
>>  
>>  f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
>> @@ -310,6 +313,8 @@ static int f2fs_do_sync_file(struct file *file, loff_t 
>> start, loff_t end,
>>  
>>  int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int 
>> datasync)
>>  {
>> +if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file)
>> +return -EIO;
>>  return f2fs_do_sync_file(file, start, end, datasync, false);
>>  }
>>  
>> @@ -446,6 +451,9 @@ static int f2fs_file_mmap(struct file *file, struct 
>> vm_area_struct *vma)
>>  struct inode *inode = file_inode(file);
>>  int err;
>>  
>> +if (unlikely(f2fs_cp_error(F2FS_I_SB(inode
>> +return -EIO;
>> +
>>  /* we don't need to use inline_data strictly */
>>  err = f2fs_convert_inline_inode(inode);
>>  if (err)
>> @@ -632,6 +640,9 @@ int 

[PATCH 2/2] staging: greybus: loopback: convert loopback to use generic async operations

2017-11-04 Thread Bryan O'Donoghue
Loopback has its own internal method for tracking and timing out
asynchronous operations; however, previous patches make it possible to use
functionality provided by operation.c to do this instead. Using the code in
operation.c means we can completely subtract the timer, the work-queue, the
kref and the cringe-worthy 'pending' flag. The completion callback
triggered by operation.c will provide an authoritative result code -
including -ETIMEDOUT for asynchronous operations.

Signed-off-by: Bryan O'Donoghue 
Cc: Johan Hovold 
Cc: Alex Elder 
Cc: Greg Kroah-Hartman 
Cc: Kees Cook 
Cc: greybus-...@lists.linaro.org
Cc: de...@driverdev.osuosl.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/staging/greybus/loopback.c | 165 +++--
 1 file changed, 31 insertions(+), 134 deletions(-)

diff --git a/drivers/staging/greybus/loopback.c 
b/drivers/staging/greybus/loopback.c
index 3d92638..48599ed 100644
--- a/drivers/staging/greybus/loopback.c
+++ b/drivers/staging/greybus/loopback.c
@@ -59,11 +59,6 @@ struct gb_loopback_async_operation {
struct gb_loopback *gb;
struct gb_operation *operation;
ktime_t ts;
-   struct timer_list timer;
-   struct list_head entry;
-   struct work_struct work;
-   struct kref kref;
-   bool pending;
int (*completion)(struct gb_loopback_async_operation *op_async);
 };
 
@@ -427,56 +422,6 @@ static int gb_loopback_operation_sync(struct gb_loopback 
*gb, int type,
return ret;
 }
 
-static void __gb_loopback_async_operation_destroy(struct kref *kref)
-{
-   struct gb_loopback_async_operation *op_async;
-
-   op_async = container_of(kref, struct gb_loopback_async_operation, kref);
-
-   list_del(_async->entry);
-   if (op_async->operation)
-   gb_operation_put(op_async->operation);
-   atomic_dec(_async->gb->outstanding_operations);
-   wake_up(_async->gb->wq_completion);
-   kfree(op_async);
-}
-
-static void gb_loopback_async_operation_get(struct gb_loopback_async_operation
-   *op_async)
-{
-   kref_get(_async->kref);
-}
-
-static void gb_loopback_async_operation_put(struct gb_loopback_async_operation
-   *op_async)
-{
-   unsigned long flags;
-
-   spin_lock_irqsave(_dev.lock, flags);
-   kref_put(_async->kref, __gb_loopback_async_operation_destroy);
-   spin_unlock_irqrestore(_dev.lock, flags);
-}
-
-static struct gb_loopback_async_operation *
-   gb_loopback_operation_find(u16 id)
-{
-   struct gb_loopback_async_operation *op_async;
-   bool found = false;
-   unsigned long flags;
-
-   spin_lock_irqsave(_dev.lock, flags);
-   list_for_each_entry(op_async, _dev.list_op_async, entry) {
-   if (op_async->operation->id == id) {
-   gb_loopback_async_operation_get(op_async);
-   found = true;
-   break;
-   }
-   }
-   spin_unlock_irqrestore(_dev.lock, flags);
-
-   return found ? op_async : NULL;
-}
-
 static void gb_loopback_async_wait_all(struct gb_loopback *gb)
 {
wait_event(gb->wq_completion,
@@ -488,83 +433,41 @@ static void gb_loopback_async_operation_callback(struct 
gb_operation *operation)
struct gb_loopback_async_operation *op_async;
struct gb_loopback *gb;
ktime_t te;
-   bool err = false;
+   int result;
 
te = ktime_get();
-   op_async = gb_loopback_operation_find(operation->id);
-   if (!op_async)
-   return;
-
+   result = gb_operation_result(operation);
+   op_async = gb_operation_get_data(operation);
gb = op_async->gb;
+
mutex_lock(>mutex);
 
-   if (!op_async->pending || gb_operation_result(operation)) {
-   err = true;
-   } else {
-   if (op_async->completion)
-   if (op_async->completion(op_async))
-   err = true;
-   }
+   if (!result && op_async->completion)
+   result = op_async->completion(op_async);
 
-   if (!err)
+   if (!result) {
gb->elapsed_nsecs = gb_loopback_calc_latency(op_async->ts, te);
-
-   if (op_async->pending) {
-   if (err)
-   gb->error++;
-   gb->iteration_count++;
-   op_async->pending = false;
-   del_timer_sync(_async->timer);
-   gb_loopback_async_operation_put(op_async);
-   gb_loopback_calculate_stats(gb, err);
+   } else {
+   gb->error++;
+   if (result == -ETIMEDOUT)
+   gb->requests_timedout++;
}
-   mutex_unlock(>mutex);
-
-   dev_dbg(>connection->bundle->dev, "complete operation %d\n",
-   

[PATCH 2/2] staging: greybus: loopback: convert loopback to use generic async operations

2017-11-04 Thread Bryan O'Donoghue
Loopback has its own internal method for tracking and timing out
asynchronous operations; however, previous patches make it possible to use
functionality provided by operation.c to do this instead. Using the code in
operation.c means we can completely subtract the timer, the work-queue, the
kref and the cringe-worthy 'pending' flag. The completion callback
triggered by operation.c will provide an authoritative result code -
including -ETIMEDOUT for asynchronous operations.

Signed-off-by: Bryan O'Donoghue 
Cc: Johan Hovold 
Cc: Alex Elder 
Cc: Greg Kroah-Hartman 
Cc: Kees Cook 
Cc: greybus-...@lists.linaro.org
Cc: de...@driverdev.osuosl.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/staging/greybus/loopback.c | 165 +++--
 1 file changed, 31 insertions(+), 134 deletions(-)

diff --git a/drivers/staging/greybus/loopback.c 
b/drivers/staging/greybus/loopback.c
index 3d92638..48599ed 100644
--- a/drivers/staging/greybus/loopback.c
+++ b/drivers/staging/greybus/loopback.c
@@ -59,11 +59,6 @@ struct gb_loopback_async_operation {
struct gb_loopback *gb;
struct gb_operation *operation;
ktime_t ts;
-   struct timer_list timer;
-   struct list_head entry;
-   struct work_struct work;
-   struct kref kref;
-   bool pending;
int (*completion)(struct gb_loopback_async_operation *op_async);
 };
 
@@ -427,56 +422,6 @@ static int gb_loopback_operation_sync(struct gb_loopback 
*gb, int type,
return ret;
 }
 
-static void __gb_loopback_async_operation_destroy(struct kref *kref)
-{
-   struct gb_loopback_async_operation *op_async;
-
-   op_async = container_of(kref, struct gb_loopback_async_operation, kref);
-
-   list_del(_async->entry);
-   if (op_async->operation)
-   gb_operation_put(op_async->operation);
-   atomic_dec(_async->gb->outstanding_operations);
-   wake_up(_async->gb->wq_completion);
-   kfree(op_async);
-}
-
-static void gb_loopback_async_operation_get(struct gb_loopback_async_operation
-   *op_async)
-{
-   kref_get(_async->kref);
-}
-
-static void gb_loopback_async_operation_put(struct gb_loopback_async_operation
-   *op_async)
-{
-   unsigned long flags;
-
-   spin_lock_irqsave(_dev.lock, flags);
-   kref_put(_async->kref, __gb_loopback_async_operation_destroy);
-   spin_unlock_irqrestore(_dev.lock, flags);
-}
-
-static struct gb_loopback_async_operation *
-   gb_loopback_operation_find(u16 id)
-{
-   struct gb_loopback_async_operation *op_async;
-   bool found = false;
-   unsigned long flags;
-
-   spin_lock_irqsave(_dev.lock, flags);
-   list_for_each_entry(op_async, _dev.list_op_async, entry) {
-   if (op_async->operation->id == id) {
-   gb_loopback_async_operation_get(op_async);
-   found = true;
-   break;
-   }
-   }
-   spin_unlock_irqrestore(_dev.lock, flags);
-
-   return found ? op_async : NULL;
-}
-
 static void gb_loopback_async_wait_all(struct gb_loopback *gb)
 {
wait_event(gb->wq_completion,
@@ -488,83 +433,41 @@ static void gb_loopback_async_operation_callback(struct 
gb_operation *operation)
struct gb_loopback_async_operation *op_async;
struct gb_loopback *gb;
ktime_t te;
-   bool err = false;
+   int result;
 
te = ktime_get();
-   op_async = gb_loopback_operation_find(operation->id);
-   if (!op_async)
-   return;
-
+   result = gb_operation_result(operation);
+   op_async = gb_operation_get_data(operation);
gb = op_async->gb;
+
mutex_lock(>mutex);
 
-   if (!op_async->pending || gb_operation_result(operation)) {
-   err = true;
-   } else {
-   if (op_async->completion)
-   if (op_async->completion(op_async))
-   err = true;
-   }
+   if (!result && op_async->completion)
+   result = op_async->completion(op_async);
 
-   if (!err)
+   if (!result) {
gb->elapsed_nsecs = gb_loopback_calc_latency(op_async->ts, te);
-
-   if (op_async->pending) {
-   if (err)
-   gb->error++;
-   gb->iteration_count++;
-   op_async->pending = false;
-   del_timer_sync(_async->timer);
-   gb_loopback_async_operation_put(op_async);
-   gb_loopback_calculate_stats(gb, err);
+   } else {
+   gb->error++;
+   if (result == -ETIMEDOUT)
+   gb->requests_timedout++;
}
-   mutex_unlock(>mutex);
-
-   dev_dbg(>connection->bundle->dev, "complete operation %d\n",
-   operation->id);
-
-   gb_loopback_async_operation_put(op_async);
-}
-
-static void 

[PATCH 1/2] staging: greybus: operation: add private data with get/set accessors

2017-11-04 Thread Bryan O'Donoghue
Asynchronous operation completion handlers' lives are made easier if there
is a generic pointer that can store private data associated with the
operation. This patch adds a pointer field to operation.h and get/set
methods to access that pointer.

Signed-off-by: Bryan O'Donoghue 
Cc: Johan Hovold 
Cc: Alex Elder 
Cc: Greg Kroah-Hartman 
Cc: greybus-...@lists.linaro.org
Cc: de...@driverdev.osuosl.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/staging/greybus/operation.h | 13 +
 1 file changed, 13 insertions(+)

diff --git a/drivers/staging/greybus/operation.h 
b/drivers/staging/greybus/operation.h
index 7529f01..bfec1e9 100644
--- a/drivers/staging/greybus/operation.h
+++ b/drivers/staging/greybus/operation.h
@@ -105,6 +105,8 @@ struct gb_operation {
 
int active;
struct list_headlinks;  /* connection->operations */
+
+   void*private;
 };
 
 static inline bool
@@ -206,6 +208,17 @@ static inline int gb_operation_unidirectional(struct 
gb_connection *connection,
request, request_size, GB_OPERATION_TIMEOUT_DEFAULT);
 }
 
+static inline void *gb_operation_get_data(struct gb_operation *operation)
+{
+   return operation->private;
+}
+
+static inline void gb_operation_set_data(struct gb_operation *operation,
+void *data)
+{
+   operation->private = data;
+}
+
 int gb_operation_init(void);
 void gb_operation_exit(void);
 
-- 
2.7.4



[PATCH 0/2] Convert greybus loopback to core async API

2017-11-04 Thread Bryan O'Donoghue
dbec27298b0d ('staging: greybus: operation: add generic timeout support')
gives the ability to remove lots of the asynchronous operation code in
loopback.

Kees is also doing a cleanup of timer code which for loopback will go away
when converting to the core API.

These two patches kill two birds with err, two stones (no aggression to
birds intended) namely:

- Converting over to the core asynchronous API
- Getting rid of the timer code in loopback which will unblock what Kees is
  doing.

Bryan O'Donoghue (2):
  staging: greybus: operation: add private data with get/set accessors
  staging: greybus: loopback: convert loopback to use generic async
operations

 drivers/staging/greybus/loopback.c  | 165 +++-
 drivers/staging/greybus/operation.h |  13 +++
 2 files changed, 44 insertions(+), 134 deletions(-)

-- 
2.7.4



[PATCH 1/2] staging: greybus: operation: add private data with get/set accessors

2017-11-04 Thread Bryan O'Donoghue
Asynchronous operation completion handlers' lives are made easier if there
is a generic pointer that can store private data associated with the
operation. This patch adds a pointer field to operation.h and get/set
methods to access that pointer.

Signed-off-by: Bryan O'Donoghue 
Cc: Johan Hovold 
Cc: Alex Elder 
Cc: Greg Kroah-Hartman 
Cc: greybus-...@lists.linaro.org
Cc: de...@driverdev.osuosl.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/staging/greybus/operation.h | 13 +
 1 file changed, 13 insertions(+)

diff --git a/drivers/staging/greybus/operation.h 
b/drivers/staging/greybus/operation.h
index 7529f01..bfec1e9 100644
--- a/drivers/staging/greybus/operation.h
+++ b/drivers/staging/greybus/operation.h
@@ -105,6 +105,8 @@ struct gb_operation {
 
int active;
struct list_headlinks;  /* connection->operations */
+
+   void*private;
 };
 
 static inline bool
@@ -206,6 +208,17 @@ static inline int gb_operation_unidirectional(struct 
gb_connection *connection,
request, request_size, GB_OPERATION_TIMEOUT_DEFAULT);
 }
 
+static inline void *gb_operation_get_data(struct gb_operation *operation)
+{
+   return operation->private;
+}
+
+static inline void gb_operation_set_data(struct gb_operation *operation,
+void *data)
+{
+   operation->private = data;
+}
+
 int gb_operation_init(void);
 void gb_operation_exit(void);
 
-- 
2.7.4



[PATCH 0/2] Convert greybus loopback to core async API

2017-11-04 Thread Bryan O'Donoghue
dbec27298b0d ('staging: greybus: operation: add generic timeout support')
gives the ability to remove lots of the asynchronous operation code in
loopback.

Kees is also doing a cleanup of timer code which for loopback will go away
when converting to the core API.

These two patches kill two birds with err, two stones (no aggression to
birds intended) namely:

- Converting over to the core asynchronous API
- Getting rid of the timer code in loopback which will unblock what Kees is
  doing.

Bryan O'Donoghue (2):
  staging: greybus: operation: add private data with get/set accessors
  staging: greybus: loopback: convert loopback to use generic async
operations

 drivers/staging/greybus/loopback.c  | 165 +++-
 drivers/staging/greybus/operation.h |  13 +++
 2 files changed, 44 insertions(+), 134 deletions(-)

-- 
2.7.4



[PATCH v4 1/1] xdp: Sample xdp program implementing ip forward

2017-11-04 Thread Christina Jacob
From: Christina Jacob 

Implements port to port forwarding with route table and arp table
lookup for ipv4 packets using bpf_redirect helper function and
lpm_trie  map.

Signed-off-by: Christina Jacob 
---
 samples/bpf/Makefile   |   4 +
 samples/bpf/xdp_router_ipv4_kern.c | 186 +++
 samples/bpf/xdp_router_ipv4_user.c | 659 +
 3 files changed, 849 insertions(+)

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index cf17c79..8504ebb 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -28,6 +28,7 @@ hostprogs-y += test_cgrp2_sock
 hostprogs-y += test_cgrp2_sock2
 hostprogs-y += xdp1
 hostprogs-y += xdp2
+hostprogs-y += xdp_router_ipv4
 hostprogs-y += test_current_task_under_cgroup
 hostprogs-y += trace_event
 hostprogs-y += sampleip
@@ -73,6 +74,7 @@ test_cgrp2_sock2-objs := bpf_load.o $(LIBBPF) 
test_cgrp2_sock2.o
 xdp1-objs := bpf_load.o $(LIBBPF) xdp1_user.o
 # reuse xdp1 source intentionally
 xdp2-objs := bpf_load.o $(LIBBPF) xdp1_user.o
+xdp_router_ipv4-objs := bpf_load.o $(LIBBPF) xdp_router_ipv4_user.o
 test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) cgroup_helpers.o \
   test_current_task_under_cgroup_user.o
 trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o
@@ -114,6 +116,7 @@ always += parse_varlen.o parse_simple.o parse_ldabs.o
 always += test_cgrp2_tc_kern.o
 always += xdp1_kern.o
 always += xdp2_kern.o
+always += xdp_router_ipv4_kern.o
 always += test_current_task_under_cgroup_kern.o
 always += trace_event_kern.o
 always += sampleip_kern.o
@@ -160,6 +163,7 @@ HOSTLOADLIBES_map_perf_test += -lelf -lrt
 HOSTLOADLIBES_test_overhead += -lelf -lrt
 HOSTLOADLIBES_xdp1 += -lelf
 HOSTLOADLIBES_xdp2 += -lelf
+HOSTLOADLIBES_xdp_router_ipv4 += -lelf
 HOSTLOADLIBES_test_current_task_under_cgroup += -lelf
 HOSTLOADLIBES_trace_event += -lelf
 HOSTLOADLIBES_sampleip += -lelf
diff --git a/samples/bpf/xdp_router_ipv4_kern.c 
b/samples/bpf/xdp_router_ipv4_kern.c
new file mode 100644
index 000..993f56b
--- /dev/null
+++ b/samples/bpf/xdp_router_ipv4_kern.c
@@ -0,0 +1,186 @@
+/* Copyright (C) 2017 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+#include 
+#include 
+
+struct trie_value {
+   __u8 prefix[4];
+   __be64 value;
+   int ifindex;
+   int metric;
+   __be32 gw;
+};
+
+/* Key for lpm_trie*/
+union key_4 {
+   u32 b32[2];
+   u8 b8[8];
+};
+
+struct arp_entry {
+   __be64 mac;
+   __be32 dst;
+};
+
+struct direct_map {
+   struct arp_entry arp;
+   int ifindex;
+   __be64 mac;
+};
+
+/* Map for trie implementation*/
+struct bpf_map_def SEC("maps") lpm_map = {
+   .type = BPF_MAP_TYPE_LPM_TRIE,
+   .key_size = 8,
+   .value_size = sizeof(struct trie_value),
+   .max_entries = 50,
+   .map_flags = BPF_F_NO_PREALLOC,
+};
+
+/* Map for counter*/
+struct bpf_map_def SEC("maps") rxcnt = {
+   .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+   .key_size = sizeof(u32),
+   .value_size = sizeof(u64),
+   .max_entries = 256,
+};
+
+/* Map for ARP table*/
+struct bpf_map_def SEC("maps") arp_table = {
+   .type = BPF_MAP_TYPE_HASH,
+   .key_size = sizeof(__be32),
+   .value_size = sizeof(__be64),
+   .max_entries = 50,
+};
+
+/* Map to keep the exact match entries in the route table*/
+struct bpf_map_def SEC("maps") exact_match = {
+   .type = BPF_MAP_TYPE_HASH,
+   .key_size = sizeof(__be32),
+   .value_size = sizeof(struct direct_map),
+   .max_entries = 50,
+};
+
+struct bpf_map_def SEC("maps") tx_port = {
+   .type = BPF_MAP_TYPE_DEVMAP,
+   .key_size = sizeof(int),
+   .value_size = sizeof(int),
+   .max_entries = 100,
+};
+
+/* Function to set source and destination mac of the packet */
+static inline void set_src_dst_mac(void *data, void *src, void *dst)
+{
+   unsigned short *source = src;
+   unsigned short *dest  = dst;
+   unsigned short *p = data;
+
+   __builtin_memcpy(p, dest, 6);
+   __builtin_memcpy(p + 3, source, 6);
+}
+
+/* Parse IPV4 packet to get SRC, DST IP and protocol */
+static inline int parse_ipv4(void *data, u64 nh_off, void *data_end,
+__be32 *src, __be32 *dest)
+{
+   struct iphdr *iph = data + nh_off;
+
+   if (iph + 1 > data_end)
+   return 0;
+   *src = iph->saddr;
+   *dest = iph->daddr;
+   return iph->protocol;
+}
+
+SEC("xdp_router_ipv4")
+int xdp_router_ipv4_prog(struct xdp_md *ctx)
+{
+   void *data_end = (void *)(long)ctx->data_end;
+   __be64 *dest_mac = NULL, *src_mac = NULL;
+   

[PATCH v4 0/1] XDP program for ip forward

2017-11-04 Thread Christina Jacob
From: Christina Jacob 

The patch below implements port to port forwarding through route table and arp
table lookup for ipv4 packets using bpf_redirect helper function and lpm_trie
map.  This has an improved performance over the normal kernel stack ip forward.

Implementation details.
---
The program uses one map each for arp table, route table and packet count.
The number of entries the program can process is limited by the size of the
map used.

In the xdp_router_ipv4_user.c,

initially, the routing table is read and is stored in an lpm trie map.
The arp table is read and stored in an array map. There are two netlink sockets
that listen for any change in the route table and arp table.
There are two types of changes to the route table.
1.New

The new entries are added to the lpm_trie with the proper key and prefix
length. If there is another entry in the route table with a different
metric (only the metric is considered), then the values are compared and the
one with the lowest metric is added to the node.

2.Deletion

On deletion from the route table, The particular node is removed and the
entire route table is again read to check if there is another entry with
a different metric.

This implementation depends on  bpf: Implement map_delete_elem for
BPF_MAP_TYPE_LPM_TRIE which is not yet upstreamed.

There are two types of changes to the arp table.

1.New

The new arp entries are added to the array map directly with the
ip address as the key and the destination mac address as the value.

2.Delete

The entry corresponding to the particular ip is deleted from the
arp table map.

Another map is maintained for entries in the route table having a 32 bit mask.
Such entries can have a corresponding arp entry, which is stored together with
the route entry in an array map and can be accessed in O(1) time, eliminating
the trie lookup and arp lookup.

In the xdp_router_ipv4_kern.c,

The array map for the 32 bit mask entries is checked to see if there is a key
that exactly matches the destination ip. If it has a non-zero destination mac
entry then the xdp data is updated accordingly. Otherwise a proper route and
arp table lookup is done using the lpm_trie and the arp table array map.

Usage: as ./xdp_router_ipv4 -S  (-S for
generic xdp implementation ifindex- the index of the interface to which
the xdp program has to be attached.) in 4.14-rc3 kernel.

Changes from v1 to v2
-

* As suggested by Jesper Dangaard Brouer
1. Changed the program name to  list xdp_router_ipv4
2. Changed the commandline arguments from ifindex list to interface name
Usage : ./xdp_router_ipv4 [-S] 
-S for generic xdp implementation
-interface name list is the list of interfaces to which
the xdp program should attach to

* As suggested by Daniel Borkmann
1. Using __builtin_memcpy to update source and destination mac in the bpf
  kernel program.

2. Started using __be32 in the kernel program to be inline with the data
   type used in user program

3. Rectified few style issues.

* Corrected the copyright issue pointed out by David Ahern

* Fixed the bug: The already attached interfaces are not detached from the
  xdp program if the program fails to attach to an interface later in the list.


Changes from v2 to v3
-
* As pointed out by Jesper Dangaard Brouer
   1. Changed the program name in the cover letter.
   2. Changed variable declarations to follow Reverse-xmas tree
  rule.
   3. Reduced the nesting in code for readability.
   4. Fixed bug: incorrect mac address being set for source and
  destination mac.
   5. Fixed comment style.

* As suggested by Stephen Hemminger 
Changed all the bzeros' to memset.

* As suggested by David Laight
removed the signed remainders calculation.

* As suggested by Stephen Hemminger and David Daney 
1. Added checks for the ioctl return value.
2. Changed data types to be64 to be sure about the size of the
   data type.
3. Verified byte order. Using the mac address from ioctl in
   network byte order. Not casting to long data type
   anymore.
4. Fixed returning address of local variable.

Changes from v3 to v4
-
* As suggested by Jesper,
1. Removed redundant typecastings.
2. Modified program to use bpf_redirect_map for better
   performance.
3. Changed program name in the code as well.


Christina Jacob (1):
  xdp: Sample xdp program implementing ip 

[PATCH v4 0/1] XDP program for ip forward

2017-11-04 Thread Christina Jacob
From: Christina Jacob 

The patch below implements port to port forwarding through route table and arp
table lookup for ipv4 packets using bpf_redirect helper function and lpm_trie
map.  This has an improved performance over the normal kernel stack ip forward.

Implementation details.
---
The program uses one map each for arp table, route table and packet count.
The number of entries the program can process is limited by the size of the
map used.

In the xdp_router_ipv4_user.c,

initially, the routing table is read and is stored in an lpm trie map.
The arp table is read and stored in an array map. There are two netlink sockets
that listen for any change in the route table and arp table.
There are two types of changes to the route table.
1.New

The new entries are added to the lpm_trie with the proper key and prefix
length. If there is another entry in the route table with a different
metric (only the metric is considered), then the values are compared and the
one with the lowest metric is added to the node.

2.Deletion

On deletion from the route table, The particular node is removed and the
entire route table is again read to check if there is another entry with
a different metric.

This implementation depends on  bpf: Implement map_delete_elem for
BPF_MAP_TYPE_LPM_TRIE which is not yet upstreamed.

There are two types of changes to the arp table.

1.New

The new arp entries are added to the array map directly with the
ip address as the key and the destination mac address as the value.

2.Delete

The entry corresponding to the particular ip is deleted from the
arp table map.

Another map is maintained for entries in the route table having a 32 bit mask.
Such entries can have a corresponding arp entry, which is stored together with
the route entry in an array map and can be accessed in O(1) time, eliminating
the trie lookup and arp lookup.

In the xdp_router_ipv4_kern.c,

The array map for the 32 bit mask entries is checked to see if there is a key
that exactly matches the destination ip. If it has a non-zero destination mac
entry then the xdp data is updated accordingly. Otherwise a proper route and
arp table lookup is done using the lpm_trie and the arp table array map.

Usage: as ./xdp_router_ipv4 -S  (-S for
generic xdp implementation ifindex- the index of the interface to which
the xdp program has to be attached.) in 4.14-rc3 kernel.

Changes from v1 to v2
-

* As suggested by Jesper Dangaard Brouer
1. Changed the program name to  list xdp_router_ipv4
2. Changed the commandline arguments from ifindex list to interface name
Usage : ./xdp_router_ipv4 [-S] 
-S for generic xdp implementation
-interface name list is the list of interfaces to which
the xdp program should attach to

* As suggested by Daniel Borkmann
1. Using __builtin_memcpy to update source and destination mac in the bpf
  kernel program.

2. Started using __be32 in the kernel program to be inline with the data
   type used in user program

3. Rectified few style issues.

* Corrected the copyright issue pointed out by David Ahern

* Fixed the bug: The already attached interfaces are not detached from the
  xdp program if the program fails to attach to an interface later in the list.


Changes from v2 to v3
-
* As pointed out by Jesper Dangaard Brouer
   1. Changed the program name in the cover letter.
   2. Changed variable declarations to follow Reverse-xmas tree
  rule.
   3. Reduced the nesting in code for readability.
   4. Fixed bug: incorrect mac address being set for source and
  destination mac.
   5. Fixed comment style.

* As suggested by Stephen Hemminger 
Changed all the bzeros' to memset.

* As suggested by David Laight
removed the signed remainders calculation.

* As suggested by Stephen Hemminger and David Daney 
1. Added checks for the ioctl return value.
2. Changed data types to be64 to be sure about the size of the
   data type.
3. Verified byte order. Using the mac address from ioctl in
   network byte order. Not casting to long data type
   anymore.
4. Fixed returning address of local variable.

Changes from v3 to v4
-
* As suggested by Jesper,
1. Removed redundant typecastings.
2. Modified program to use bpf_redirect_map for better
   performance.
3. Changed program name in the code as well.


Christina Jacob (1):
  xdp: Sample xdp program implementing ip forward

 samples/bpf/Makefile 

[PATCH v4 1/1] xdp: Sample xdp program implementing ip forward

2017-11-04 Thread Christina Jacob
From: Christina Jacob 

Implements port to port forwarding with route table and arp table
lookup for ipv4 packets using bpf_redirect helper function and
lpm_trie  map.

Signed-off-by: Christina Jacob 
---
 samples/bpf/Makefile   |   4 +
 samples/bpf/xdp_router_ipv4_kern.c | 186 +++
 samples/bpf/xdp_router_ipv4_user.c | 659 +
 3 files changed, 849 insertions(+)

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index cf17c79..8504ebb 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -28,6 +28,7 @@ hostprogs-y += test_cgrp2_sock
 hostprogs-y += test_cgrp2_sock2
 hostprogs-y += xdp1
 hostprogs-y += xdp2
+hostprogs-y += xdp_router_ipv4
 hostprogs-y += test_current_task_under_cgroup
 hostprogs-y += trace_event
 hostprogs-y += sampleip
@@ -73,6 +74,7 @@ test_cgrp2_sock2-objs := bpf_load.o $(LIBBPF) 
test_cgrp2_sock2.o
 xdp1-objs := bpf_load.o $(LIBBPF) xdp1_user.o
 # reuse xdp1 source intentionally
 xdp2-objs := bpf_load.o $(LIBBPF) xdp1_user.o
+xdp_router_ipv4-objs := bpf_load.o $(LIBBPF) xdp_router_ipv4_user.o
 test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) cgroup_helpers.o \
   test_current_task_under_cgroup_user.o
 trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o
@@ -114,6 +116,7 @@ always += parse_varlen.o parse_simple.o parse_ldabs.o
 always += test_cgrp2_tc_kern.o
 always += xdp1_kern.o
 always += xdp2_kern.o
+always += xdp_router_ipv4_kern.o
 always += test_current_task_under_cgroup_kern.o
 always += trace_event_kern.o
 always += sampleip_kern.o
@@ -160,6 +163,7 @@ HOSTLOADLIBES_map_perf_test += -lelf -lrt
 HOSTLOADLIBES_test_overhead += -lelf -lrt
 HOSTLOADLIBES_xdp1 += -lelf
 HOSTLOADLIBES_xdp2 += -lelf
+HOSTLOADLIBES_xdp_router_ipv4 += -lelf
 HOSTLOADLIBES_test_current_task_under_cgroup += -lelf
 HOSTLOADLIBES_trace_event += -lelf
 HOSTLOADLIBES_sampleip += -lelf
diff --git a/samples/bpf/xdp_router_ipv4_kern.c 
b/samples/bpf/xdp_router_ipv4_kern.c
new file mode 100644
index 000..993f56b
--- /dev/null
+++ b/samples/bpf/xdp_router_ipv4_kern.c
@@ -0,0 +1,186 @@
+/* Copyright (C) 2017 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+#include 
+#include 
+
+struct trie_value {
+   __u8 prefix[4];
+   __be64 value;
+   int ifindex;
+   int metric;
+   __be32 gw;
+};
+
+/* Key for lpm_trie*/
+union key_4 {
+   u32 b32[2];
+   u8 b8[8];
+};
+
+struct arp_entry {
+   __be64 mac;
+   __be32 dst;
+};
+
+struct direct_map {
+   struct arp_entry arp;
+   int ifindex;
+   __be64 mac;
+};
+
+/* Map for trie implementation*/
+struct bpf_map_def SEC("maps") lpm_map = {
+   .type = BPF_MAP_TYPE_LPM_TRIE,
+   .key_size = 8,
+   .value_size = sizeof(struct trie_value),
+   .max_entries = 50,
+   .map_flags = BPF_F_NO_PREALLOC,
+};
+
+/* Map for counter*/
+struct bpf_map_def SEC("maps") rxcnt = {
+   .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+   .key_size = sizeof(u32),
+   .value_size = sizeof(u64),
+   .max_entries = 256,
+};
+
+/* Map for ARP table*/
+struct bpf_map_def SEC("maps") arp_table = {
+   .type = BPF_MAP_TYPE_HASH,
+   .key_size = sizeof(__be32),
+   .value_size = sizeof(__be64),
+   .max_entries = 50,
+};
+
+/* Map to keep the exact match entries in the route table*/
+struct bpf_map_def SEC("maps") exact_match = {
+   .type = BPF_MAP_TYPE_HASH,
+   .key_size = sizeof(__be32),
+   .value_size = sizeof(struct direct_map),
+   .max_entries = 50,
+};
+
+struct bpf_map_def SEC("maps") tx_port = {
+   .type = BPF_MAP_TYPE_DEVMAP,
+   .key_size = sizeof(int),
+   .value_size = sizeof(int),
+   .max_entries = 100,
+};
+
+/* Function to set source and destination mac of the packet */
+static inline void set_src_dst_mac(void *data, void *src, void *dst)
+{
+   unsigned short *source = src;
+   unsigned short *dest  = dst;
+   unsigned short *p = data;
+
+   __builtin_memcpy(p, dest, 6);
+   __builtin_memcpy(p + 3, source, 6);
+}
+
+/* Parse IPV4 packet to get SRC, DST IP and protocol */
+static inline int parse_ipv4(void *data, u64 nh_off, void *data_end,
+__be32 *src, __be32 *dest)
+{
+   struct iphdr *iph = data + nh_off;
+
+   if (iph + 1 > data_end)
+   return 0;
+   *src = iph->saddr;
+   *dest = iph->daddr;
+   return iph->protocol;
+}
+
+SEC("xdp_router_ipv4")
+int xdp_router_ipv4_prog(struct xdp_md *ctx)
+{
+   void *data_end = (void *)(long)ctx->data_end;
+   __be64 *dest_mac = NULL, *src_mac = NULL;
+   void *data = (void *)(long)ctx->data;
+   struct 

Re: [PATCH] kbuild: Set KBUILD_CFLAGS before incl. arch Makefile

2017-11-04 Thread Masahiro Yamada
2017-11-03 6:26 GMT+09:00 Nick Desaulniers :
> From: Chris Fries 
>
> Set the clang KBUILD_CFLAGS up before including arch/ Makefiles,
> so that ld-options (etc.) can work correctly.


ld-option is only used for arch/{arm64,powerpc}/Makefile

arch/arm64/Makefile:  ifeq ($(call ld-option, --fix-cortex-a53-843419),)
arch/powerpc/Makefile:LDFLAGS_vmlinux += $(call
ld-option,--orphan-handling=warn)



I think this patch makes sense when it comes along with
https://patchwork.kernel.org/patch/10030581/

but it is now being blocked by the 0-day bot
due to an x86 problem.





> This fixes errors with clang such as ld-options trying to CC
> against your host architecture, but LD trying to link against
> your target architecture.
>
> We didn't notice this problem on Android, because we took the original
> LLVMLinux patch into our 4.4 kernels, which did not have this issue. We
> ran into this taking the proper upstream patch on newer kernel versions.
> The original LLVMLinux patch can be seen at:
>
> http://git.linuxfoundation.org/?p=llvmlinux/kernel.git;a=blobdiff;f=Makefile;h=389006c4ef494cda3a1ee52bf355618673ab4f31;hp=e41a3356abee83f08288362950bfceebd25ec3c2;hb=ef9126da11b18ff34eb1f01561f53c378860336c;hpb=f800c25b7a762d445ba1439a2428c8362157eba6
>
> It seems that when the patch was re-upstreamed, a V2 was requested that
> moved the definition of Clang's target triple to be later in the top
> level Makefile than the inclusion of the arch specific Makefile,
> breaking macros like ld-option when cross compiling. V2 was requested
> at:

But ld-option is defined as follows in the llvm-linux tree (and mainline too):

ld-option = $(call try-run,\
$(CC) -x c /dev/null -c -o "$$TMPO" ; $(LD) $(1) "$$TMPO" -o
"$$TMP",$(1),$(2))


ld-option does not depend on any pre-defined flags.


The location of CLANG_GCC_TC define
only matters after your patch is applied, right?

Did my request for v2 break anything?


One more thing: this patch does not apply to kbuild tree.



> https://lkml.org/lkml/2017/4/21/116
>
> Signed-off-by: Chris Fries 
> Signed-off-by: Nick Desaulniers 
> ---
>  Makefile | 64 
> 
>  1 file changed, 32 insertions(+), 32 deletions(-)
>
> diff --git a/Makefile b/Makefile
> index 5f91a28a3cea..72ea86157114 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -512,6 +512,38 @@ ifneq ($(filter install,$(MAKECMDGOALS)),)
>  endif
>  endif
>
> +ifeq ($(cc-name),clang)
> +ifneq ($(CROSS_COMPILE),)
> +CLANG_TARGET   := --target=$(notdir $(CROSS_COMPILE:%-=%))
> +GCC_TOOLCHAIN  := $(realpath $(dir $(shell which $(LD)))/..)
> +endif
> +ifneq ($(GCC_TOOLCHAIN),)
> +CLANG_GCC_TC   := --gcc-toolchain=$(GCC_TOOLCHAIN)
> +endif
> +KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
> +KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
> +KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
> +KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable)
> +KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier)
> +KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
> +KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
> +# Quiet clang warning: comparison of unsigned expression < 0 is always false
> +KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
> +# CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as 
> the
> +# source of a reference will be _MergedGlobals and not on of the whitelisted 
> names.
> +# See modpost pattern 2
> +KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,)
> +KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior)
> +KBUILD_CFLAGS += $(call cc-option, -no-integrated-as)
> +KBUILD_AFLAGS += $(call cc-option, -no-integrated-as)
> +else
> +
> +# These warnings generated too much noise in a regular build.
> +# Use make W=1 to enable them (see scripts/Makefile.extrawarn)
> +KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
> +KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
> +endif
> +
>  ifeq ($(mixed-targets),1)
>  # ===
>  # We're called with mixed targets (*config and build targets).
> @@ -695,38 +727,6 @@ ifdef CONFIG_CC_STACKPROTECTOR
>  endif
>  KBUILD_CFLAGS += $(stackp-flag)
>
> -ifeq ($(cc-name),clang)
> -ifneq ($(CROSS_COMPILE),)
> -CLANG_TARGET   := --target=$(notdir $(CROSS_COMPILE:%-=%))
> -GCC_TOOLCHAIN  := $(realpath $(dir $(shell which $(LD)))/..)
> -endif
> -ifneq ($(GCC_TOOLCHAIN),)
> -CLANG_GCC_TC   := --gcc-toolchain=$(GCC_TOOLCHAIN)
> -endif
> -KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
> -KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
> -KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
> -KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable)
> -KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier)
> 

Re: [PATCH] kbuild: Set KBUILD_CFLAGS before incl. arch Makefile

2017-11-04 Thread Masahiro Yamada
2017-11-03 6:26 GMT+09:00 Nick Desaulniers :
> From: Chris Fries 
>
> Set the clang KBUILD_CFLAGS up before including arch/ Makefiles,
> so that ld-options (etc.) can work correctly.


ld-option is only used for arch/{arm64,powerpc}/Makefile

arch/arm64/Makefile:  ifeq ($(call ld-option, --fix-cortex-a53-843419),)
arch/powerpc/Makefile:LDFLAGS_vmlinux += $(call
ld-option,--orphan-handling=warn)



I think this patch makes sense when it comes along with
https://patchwork.kernel.org/patch/10030581/

but it is now being blocked by the 0-day bot
due to an x86 problem.





> This fixes errors with clang such as ld-options trying to CC
> against your host architecture, but LD trying to link against
> your target architecture.
>
> We didn't notice this problem on Android, because we took the original
> LLVMLinux patch into our 4.4 kernels, which did not have this issue. We
> ran into this taking the proper upstream patch on newer kernel versions.
> The original LLVMLinux patch can be seen at:
>
> http://git.linuxfoundation.org/?p=llvmlinux/kernel.git;a=blobdiff;f=Makefile;h=389006c4ef494cda3a1ee52bf355618673ab4f31;hp=e41a3356abee83f08288362950bfceebd25ec3c2;hb=ef9126da11b18ff34eb1f01561f53c378860336c;hpb=f800c25b7a762d445ba1439a2428c8362157eba6
>
> It seems that when the patch was re-upstreamed, a V2 was requested that
> moved the definition of Clang's target triple to be later in the top
> level Makefile than the inclusion of the arch specific Makefile,
> breaking macros like ld-option when cross compiling. V2 was requested
> at:

But ld-option is defined as follows in the llvm-linux tree (and mainline too):

ld-option = $(call try-run,\
$(CC) -x c /dev/null -c -o "$$TMPO" ; $(LD) $(1) "$$TMPO" -o
"$$TMP",$(1),$(2))


ld-option does not depend on any pre-defined flags.


The location of CLANG_GCC_TC define
only matters after your patch is applied, right?

Did my request for v2 break anything?


One more thing: this patch does not apply to kbuild tree.



> https://lkml.org/lkml/2017/4/21/116
>
> Signed-off-by: Chris Fries 
> Signed-off-by: Nick Desaulniers 
> ---
>  Makefile | 64 
> 
>  1 file changed, 32 insertions(+), 32 deletions(-)
>
> diff --git a/Makefile b/Makefile
> index 5f91a28a3cea..72ea86157114 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -512,6 +512,38 @@ ifneq ($(filter install,$(MAKECMDGOALS)),)
>  endif
>  endif
>
> +ifeq ($(cc-name),clang)
> +ifneq ($(CROSS_COMPILE),)
> +CLANG_TARGET   := --target=$(notdir $(CROSS_COMPILE:%-=%))
> +GCC_TOOLCHAIN  := $(realpath $(dir $(shell which $(LD)))/..)
> +endif
> +ifneq ($(GCC_TOOLCHAIN),)
> +CLANG_GCC_TC   := --gcc-toolchain=$(GCC_TOOLCHAIN)
> +endif
> +KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
> +KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
> +KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
> +KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable)
> +KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier)
> +KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
> +KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
> +# Quiet clang warning: comparison of unsigned expression < 0 is always false
> +KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
> +# CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as 
> the
> +# source of a reference will be _MergedGlobals and not on of the whitelisted 
> names.
> +# See modpost pattern 2
> +KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,)
> +KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior)
> +KBUILD_CFLAGS += $(call cc-option, -no-integrated-as)
> +KBUILD_AFLAGS += $(call cc-option, -no-integrated-as)
> +else
> +
> +# These warnings generated too much noise in a regular build.
> +# Use make W=1 to enable them (see scripts/Makefile.extrawarn)
> +KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
> +KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
> +endif
> +
>  ifeq ($(mixed-targets),1)
>  # ===
>  # We're called with mixed targets (*config and build targets).
> @@ -695,38 +727,6 @@ ifdef CONFIG_CC_STACKPROTECTOR
>  endif
>  KBUILD_CFLAGS += $(stackp-flag)
>
> -ifeq ($(cc-name),clang)
> -ifneq ($(CROSS_COMPILE),)
> -CLANG_TARGET   := --target=$(notdir $(CROSS_COMPILE:%-=%))
> -GCC_TOOLCHAIN  := $(realpath $(dir $(shell which $(LD)))/..)
> -endif
> -ifneq ($(GCC_TOOLCHAIN),)
> -CLANG_GCC_TC   := --gcc-toolchain=$(GCC_TOOLCHAIN)
> -endif
> -KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
> -KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
> -KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
> -KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable)
> -KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier)
> -KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
> -KBUILD_CFLAGS += $(call cc-disable-warning, 

Re: [RFC -mm] mm, userfaultfd, THP: Avoid waiting when PMD under THP migration

2017-11-04 Thread huang ying
On Fri, Nov 3, 2017 at 11:00 PM, Zi Yan  wrote:
> On 3 Nov 2017, at 3:52, Huang, Ying wrote:
>
>> From: Huang Ying 
>>
>> If THP migration is enabled, the following situation is possible,
>>
>> - A THP is mapped at source address
>> - Migration is started to move the THP to another node
>> - Page fault occurs
>> - The PMD (migration entry) is copied to the destination address in mremap
>>
>
> You mean the page fault path follows the source address and sees pmd_none() 
> now
> because mremap() clears it and remaps the page with dest address.
> Otherwise, it seems not possible to get into handle_userfault(), since it is 
> called in
> pmd_none() branch inside do_huge_pmd_anonymous_page().
>
>
>> That is, it is possible for handle_userfault() to encounter a PMD entry
>> which has been handled but !pmd_present().  In the current
>> implementation, we will wait for such PMD entries, which may cause
>> unnecessary waiting, and potential soft lockup.
>
> handle_userfault() should only see pmd_none() in the situation you describe,
> whereas !pmd_present() (migration entry case) should lead to
> pmd_migration_entry_wait().

Yes.  This is my understanding of the source code too, and I
described it in the original patch description as well.  I just want to
confirm whether a PMD can be both !pmd_none() and !pmd_present()
in userfaultfd_must_wait().  And, is it possible for
us to implement PMD mapping copying in UFFDIO_COPY in the future?

Best Regards,
Huang, Ying

> Am I missing anything here?
>
>
> --
> Best Regards
> Yan Zi


Re: [RFC -mm] mm, userfaultfd, THP: Avoid waiting when PMD under THP migration

2017-11-04 Thread huang ying
On Fri, Nov 3, 2017 at 11:00 PM, Zi Yan  wrote:
> On 3 Nov 2017, at 3:52, Huang, Ying wrote:
>
>> From: Huang Ying 
>>
>> If THP migration is enabled, the following situation is possible,
>>
>> - A THP is mapped at source address
>> - Migration is started to move the THP to another node
>> - Page fault occurs
>> - The PMD (migration entry) is copied to the destination address in mremap
>>
>
> You mean the page fault path follows the source address and sees pmd_none() 
> now
> because mremap() clears it and remaps the page with dest address.
> Otherwise, it seems not possible to get into handle_userfault(), since it is 
> called in
> pmd_none() branch inside do_huge_pmd_anonymous_page().
>
>
>> That is, it is possible for handle_userfault() to encounter a PMD entry
>> which has been handled but !pmd_present().  In the current
>> implementation, we will wait for such PMD entries, which may cause
>> unnecessary waiting, and potential soft lockup.
>
> handle_userfault() should only see pmd_none() in the situation you describe,
> whereas !pmd_present() (migration entry case) should lead to
> pmd_migration_entry_wait().

Yes.  This is my understanding of the source code too, and I
described it in the original patch description as well.  I just want to
confirm whether a PMD can be both !pmd_none() and !pmd_present()
in userfaultfd_must_wait().  And, is it possible for
us to implement PMD mapping copying in UFFDIO_COPY in the future?

Best Regards,
Huang, Ying

> Am I missing anything here?
>
>
> --
> Best Regards
> Yan Zi


[PATCH v3] scsi: require CAP_SYS_ADMIN to write to procfs interface

2017-11-04 Thread Aleksa Sarai
Previously, the only capability effectively required to operate on the
/proc/scsi interface was CAP_DAC_OVERRIDE (or for some other files,
having an fsuid of GLOBAL_ROOT_UID was enough). This means that
semi-privileged processes could interfere with core components of a
system (such as causing a DoS by removing the underlying SCSI device of
the host's / mount).

Cc: 
Cc: "Eric W. Biederman" 
Signed-off-by: Aleksa Sarai 
---
 drivers/scsi/scsi_proc.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/scsi_proc.c b/drivers/scsi/scsi_proc.c
index 480a597b3877..05d70e200c5f 100644
--- a/drivers/scsi/scsi_proc.c
+++ b/drivers/scsi/scsi_proc.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -51,7 +52,10 @@ static ssize_t proc_scsi_host_write(struct file *file, const 
char __user *buf,
struct Scsi_Host *shost = PDE_DATA(file_inode(file));
ssize_t ret = -ENOMEM;
char *page;
-
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
if (count > PROC_BLOCK_SIZE)
return -EOVERFLOW;
 
@@ -313,6 +317,9 @@ static ssize_t proc_scsi_write(struct file *file, const 
char __user *buf,
char *buffer, *p;
int err;
 
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
if (!buf || length > PAGE_SIZE)
return -EINVAL;
 
-- 
2.14.3



[PATCH v3] scsi: require CAP_SYS_ADMIN to write to procfs interface

2017-11-04 Thread Aleksa Sarai
Previously, the only capability effectively required to operate on the
/proc/scsi interface was CAP_DAC_OVERRIDE (or for some other files,
having an fsuid of GLOBAL_ROOT_UID was enough). This means that
semi-privileged processes could interfere with core components of a
system (such as causing a DoS by removing the underlying SCSI device of
the host's / mount).

Cc: 
Cc: "Eric W. Biederman" 
Signed-off-by: Aleksa Sarai 
---
 drivers/scsi/scsi_proc.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/scsi_proc.c b/drivers/scsi/scsi_proc.c
index 480a597b3877..05d70e200c5f 100644
--- a/drivers/scsi/scsi_proc.c
+++ b/drivers/scsi/scsi_proc.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -51,7 +52,10 @@ static ssize_t proc_scsi_host_write(struct file *file, const 
char __user *buf,
struct Scsi_Host *shost = PDE_DATA(file_inode(file));
ssize_t ret = -ENOMEM;
char *page;
-
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
if (count > PROC_BLOCK_SIZE)
return -EOVERFLOW;
 
@@ -313,6 +317,9 @@ static ssize_t proc_scsi_write(struct file *file, const 
char __user *buf,
char *buffer, *p;
int err;
 
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
if (!buf || length > PAGE_SIZE)
return -EINVAL;
 
-- 
2.14.3



Re: [PATCH V9 3/4] scsi: Align block queue to dma_get_cache_alignment()

2017-11-04 Thread kbuild test robot
Hi Huacai,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.14-rc7]
[cannot apply to next-20171103]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Huacai-Chen/dma-mapping-Rework-dma_get_cache_alignment/20171023-154436
config: m68k-sun3_defconfig (attached as .config)
compiler: m68k-linux-gcc (GCC) 4.9.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=m68k 

All errors (new ones prefixed by >>):

   drivers//scsi/scsi_lib.c: In function '__scsi_init_queue':
>> drivers//scsi/scsi_lib.c:2139:2: error: implicit declaration of function 
>> 'dma_get_cache_alignment' [-Werror=implicit-function-declaration]
 blk_queue_dma_alignment(q, max(4, dma_get_cache_alignment(dev)) - 1);
 ^
   cc1: some warnings being treated as errors

vim +/dma_get_cache_alignment +2139 drivers//scsi/scsi_lib.c

  2103  
  2104  void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
  2105  {
  2106  struct device *dev = shost->dma_dev;
  2107  
  2108  queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
  2109  
  2110  /*
  2111   * this limit is imposed by hardware restrictions
  2112   */
  2113  blk_queue_max_segments(q, min_t(unsigned short, 
shost->sg_tablesize,
  2114  SG_MAX_SEGMENTS));
  2115  
  2116  if (scsi_host_prot_dma(shost)) {
  2117  shost->sg_prot_tablesize =
  2118  min_not_zero(shost->sg_prot_tablesize,
  2119   (unsigned 
short)SCSI_MAX_PROT_SG_SEGMENTS);
  2120  BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize);
  2121  blk_queue_max_integrity_segments(q, 
shost->sg_prot_tablesize);
  2122  }
  2123  
  2124  blk_queue_max_hw_sectors(q, shost->max_sectors);
  2125  blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
  2126  blk_queue_segment_boundary(q, shost->dma_boundary);
  2127  dma_set_seg_boundary(dev, shost->dma_boundary);
  2128  
  2129  blk_queue_max_segment_size(q, dma_get_max_seg_size(dev));
  2130  
  2131  if (!shost->use_clustering)
  2132  q->limits.cluster = 0;
  2133  
  2134  /*
  2135   * set a reasonable default alignment on word/cacheline 
boundaries:
  2136   * the host and device may alter it using
  2137   * blk_queue_update_dma_alignment() later.
  2138   */
> 2139  blk_queue_dma_alignment(q, max(4, dma_get_cache_alignment(dev)) 
> - 1);
  2140  }
  2141  EXPORT_SYMBOL_GPL(__scsi_init_queue);
  2142  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: [PATCH V9 3/4] scsi: Align block queue to dma_get_cache_alignment()

2017-11-04 Thread kbuild test robot
Hi Huacai,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.14-rc7]
[cannot apply to next-20171103]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Huacai-Chen/dma-mapping-Rework-dma_get_cache_alignment/20171023-154436
config: m68k-sun3_defconfig (attached as .config)
compiler: m68k-linux-gcc (GCC) 4.9.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=m68k 

All errors (new ones prefixed by >>):

   drivers//scsi/scsi_lib.c: In function '__scsi_init_queue':
>> drivers//scsi/scsi_lib.c:2139:2: error: implicit declaration of function 
>> 'dma_get_cache_alignment' [-Werror=implicit-function-declaration]
 blk_queue_dma_alignment(q, max(4, dma_get_cache_alignment(dev)) - 1);
 ^
   cc1: some warnings being treated as errors

vim +/dma_get_cache_alignment +2139 drivers//scsi/scsi_lib.c

  2103  
  2104  void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
  2105  {
  2106  struct device *dev = shost->dma_dev;
  2107  
  2108  queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
  2109  
  2110  /*
  2111   * this limit is imposed by hardware restrictions
  2112   */
  2113  blk_queue_max_segments(q, min_t(unsigned short, 
shost->sg_tablesize,
  2114  SG_MAX_SEGMENTS));
  2115  
  2116  if (scsi_host_prot_dma(shost)) {
  2117  shost->sg_prot_tablesize =
  2118  min_not_zero(shost->sg_prot_tablesize,
  2119   (unsigned 
short)SCSI_MAX_PROT_SG_SEGMENTS);
  2120  BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize);
  2121  blk_queue_max_integrity_segments(q, 
shost->sg_prot_tablesize);
  2122  }
  2123  
  2124  blk_queue_max_hw_sectors(q, shost->max_sectors);
  2125  blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
  2126  blk_queue_segment_boundary(q, shost->dma_boundary);
  2127  dma_set_seg_boundary(dev, shost->dma_boundary);
  2128  
  2129  blk_queue_max_segment_size(q, dma_get_max_seg_size(dev));
  2130  
  2131  if (!shost->use_clustering)
  2132  q->limits.cluster = 0;
  2133  
  2134  /*
  2135   * set a reasonable default alignment on word/cacheline 
boundaries:
  2136   * the host and device may alter it using
  2137   * blk_queue_update_dma_alignment() later.
  2138   */
> 2139  blk_queue_dma_alignment(q, max(4, dma_get_cache_alignment(dev)) 
> - 1);
  2140  }
  2141  EXPORT_SYMBOL_GPL(__scsi_init_queue);
  2142  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


[PATCH] drivers/pcmcia: omap1: Fix error in automated timer conversion

2017-11-04 Thread Kees Cook
One part of automated timer conversion tools did not take into account
void * variables when searching out prior direct timer callback usage,
which resulted in an attempt to dereference the timer field without a
proper type.

Reported-by: kbuild test robot
Signed-off-by: Kees Cook 
---
 drivers/pcmcia/omap_cf.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/pcmcia/omap_cf.c b/drivers/pcmcia/omap_cf.c
index 8216ceb51b18..c2a17a79f0b2 100644
--- a/drivers/pcmcia/omap_cf.c
+++ b/drivers/pcmcia/omap_cf.c
@@ -102,7 +102,9 @@ static void omap_cf_timer(struct timer_list *t)
  */
 static irqreturn_t omap_cf_irq(int irq, void *_cf)
 {
-   omap_cf_timer(&_cf->timer);
+   struct omap_cf_socket *cf = (struct omap_cf_socket *)_cf;
+
+   omap_cf_timer(&cf->timer);
return IRQ_HANDLED;
 }
 
-- 
2.7.4


-- 
Kees Cook
Pixel Security


[PATCH] drivers/pcmcia: omap1: Fix error in automated timer conversion

2017-11-04 Thread Kees Cook
One part of automated timer conversion tools did not take into account
void * variables when searching out prior direct timer callback usage,
which resulted in an attempt to dereference the timer field without a
proper type.

Reported-by: kbuild test robot
Signed-off-by: Kees Cook 
---
 drivers/pcmcia/omap_cf.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/pcmcia/omap_cf.c b/drivers/pcmcia/omap_cf.c
index 8216ceb51b18..c2a17a79f0b2 100644
--- a/drivers/pcmcia/omap_cf.c
+++ b/drivers/pcmcia/omap_cf.c
@@ -102,7 +102,9 @@ static void omap_cf_timer(struct timer_list *t)
  */
 static irqreturn_t omap_cf_irq(int irq, void *_cf)
 {
-   omap_cf_timer(&_cf->timer);
+   struct omap_cf_socket *cf = (struct omap_cf_socket *)_cf;
+
+   omap_cf_timer(&cf->timer);
return IRQ_HANDLED;
 }
 
-- 
2.7.4


-- 
Kees Cook
Pixel Security


  1   2   3   4   5   >