[PATCH v3 iproute2-next 2/4] rdma: add helper rd_sendrecv_msg()

2019-04-03 Thread Steve Wise
This function sends the constructed netlink message and then
receives the response.

Change rd_recv_msg() to display any error messages.

Change 'rdma dev set' to use rd_sendrecv_msg().

Signed-off-by: Steve Wise 
Reviewed-by: Leon Romanovsky 
---
 rdma/dev.c   |  2 +-
 rdma/rdma.h  |  2 ++
 rdma/res.h   |  1 +
 rdma/utils.c | 18 ++
 4 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/rdma/dev.c b/rdma/dev.c
index 954e0015..33962520 100644
--- a/rdma/dev.c
+++ b/rdma/dev.c
@@ -268,7 +268,7 @@ static int dev_set_name(struct rd *rd)
mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, rd_argv(rd));
 
-   return rd_send_msg(rd);
+   return rd_sendrecv_msg(rd, seq);
 }
 
 static int dev_one_set(struct rd *rd)
diff --git a/rdma/rdma.h b/rdma/rdma.h
index 1022e9a2..6c7f7d15 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -68,6 +68,7 @@ struct rd {
json_writer_t *jw;
bool json_output;
bool pretty_output;
+   bool suppress_errors;
struct list_head filter_list;
 };
 
@@ -119,6 +120,7 @@ bool rd_is_string_filtered_attr(struct rd *rd, const char 
*key, const char *val,
  */
 int rd_send_msg(struct rd *rd);
 int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void *data, uint32_t seq);
+int rd_sendrecv_msg(struct rd *rd, unsigned int seq);
 void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq, uint16_t 
flags);
 int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data);
 int rd_attr_cb(const struct nlattr *attr, void *data);
diff --git a/rdma/res.h b/rdma/res.h
index b4a7e552..525171fc 100644
--- a/rdma/res.h
+++ b/rdma/res.h
@@ -31,6 +31,7 @@ int res_qp_idx_parse_cb(const struct nlmsghdr *nlh, void 
*data);
if (id) {   
   \
ret = rd_doit_index(rd, &idx);  
   \
if (ret) {  
   \
+   rd->suppress_errors = true; 
   \
ret = _res_send_idx_msg(rd, command,
   \
name##_idx_parse_cb,
   \
idx, id);   
   \
diff --git a/rdma/utils.c b/rdma/utils.c
index 1f6bf330..11ed8a73 100644
--- a/rdma/utils.c
+++ b/rdma/utils.c
@@ -693,10 +693,28 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void 
*data, unsigned int seq)
ret = mnl_cb_run(buf, ret, seq, portid, callback, data);
} while (ret > 0);
 
+   if (ret < 0 && !rd->suppress_errors)
+   perror("error");
+
mnl_socket_close(rd->nl);
return ret;
 }
 
+static int null_cb(const struct nlmsghdr *nlh, void *data)
+{
+   return MNL_CB_OK;
+}
+
+int rd_sendrecv_msg(struct rd *rd, unsigned int seq)
+{
+   int ret;
+
+   ret = rd_send_msg(rd);
+   if (!ret)
+   ret = rd_recv_msg(rd, null_cb, rd, seq);
+   return ret;
+}
+
 static struct dev_map *_dev_map_lookup(struct rd *rd, const char *dev_name)
 {
struct dev_map *dev_map;
-- 
2.20.1



[PATCH v3 iproute2-next 1/4] Add .mailmap file

2019-04-03 Thread Steve Wise
.mailmap allows tracking multiple email addresses to the proper user name.

Signed-off-by: Steve Wise 
Reviewed-by: Leon Romanovsky 
---
 .mailmap | 8 
 1 file changed, 8 insertions(+)
 create mode 100644 .mailmap

diff --git a/.mailmap b/.mailmap
new file mode 100644
index ..c012d3d0
--- /dev/null
+++ b/.mailmap
@@ -0,0 +1,8 @@
+#
+# This list is used by git-shortlog to fix a few botched name translations
+# in the git archive, either because the author's full name was messed up
+# and/or not always written the same way, making contributions from the
+# same person appearing not to be so or badly displayed.
+#
+Steve Wise  
+Steve Wise  
-- 
2.20.1



[PATCH v3 iproute2-next 0/4] Dynamic rdma link creation

2019-04-03 Thread Steve Wise
This series adds rdmatool support for creating/deleting rdma links.
This will be used, mainly, by soft rdma drivers to allow adding/deleting
rdma links over netdev interfaces.  It provides the user side for
the following kernel changes merged in linux-5.1.

Changes since v2:

- move checks for required parameters in the parameter handlers
- move final 'link add' processing to link_add_netdev()
- added reviewed-by tags

Changes since v1:

- move error receive checking from rd_sendrecv_msg() to rd_recv_msg().
- Add rd->suppress_errors to allow control over whether errors when
  reading a response should be ignored.  Namely: resource queries can
  get errors like "none found" when querying for resources, and this
  error should not be displayed.  So on a rd object basis, error
  suppression can be controlled.
- Rebased on rdma/for-next UABI (no need to sync rdma_netlink.h now)
- use chains of struct rd_cmd and rd_exec_cmd vs open coding the parsing
  for the 'link add' command.
- minor nit resolution
- added .mailmap file.  If this is not desired for iproute2, then please
  drop the patch.

Changes since RFC:

- add rd_sendrecv_msg() and make use of it in dev_set as well
  as the new link commands.
- fixed problems with the man pages
- changed the command line to use "netdev" as the keyword
  for the network device, do avoid confused with the ib_device
  name.
- got rid of the "type" parameter for link delete.  Also pass
  down the device index instead of the name, using the common
  rd services for validating the device name and fetching the
  index.

Thanks,

Steve.

Steve Wise (4):
  Add .mailmap file
  rdma: add helper rd_sendrecv_msg()
  rdma: add 'link add/delete' commands
  rdma: man page update for link add/delete

 .mailmap |  8 +
 man/man8/rdma-link.8 | 47 ++
 rdma/dev.c   |  2 +-
 rdma/link.c  | 78 
 rdma/rdma.h  |  4 +++
 rdma/res.h   |  1 +
 rdma/utils.c | 18 ++
 7 files changed, 157 insertions(+), 1 deletion(-)
 create mode 100644 .mailmap

-- 
2.20.1



[PATCH v3 iproute2-next 3/4] rdma: add 'link add/delete' commands

2019-04-03 Thread Steve Wise
Add new 'link' subcommand 'add' and 'delete' to allow binding a soft-rdma
device to a netdev interface.

EG:

rdma link add rxe_eth0 type rxe netdev eth0
rdma link delete rxe_eth0

Signed-off-by: Steve Wise 
Reviewed-by: Leon Romanovsky 
---
 rdma/link.c | 78 +
 rdma/rdma.h |  2 ++
 2 files changed, 80 insertions(+)

diff --git a/rdma/link.c b/rdma/link.c
index 89e81b84..10b2e513 100644
--- a/rdma/link.c
+++ b/rdma/link.c
@@ -9,6 +9,9 @@
 static int link_help(struct rd *rd)
 {
pr_out("Usage: %s link show [DEV/PORT_INDEX]\n", rd->filename);
+   pr_out("Usage: %s link add NAME type TYPE netdev NETDEV\n",
+  rd->filename);
+   pr_out("Usage: %s link delete NAME\n", rd->filename);
return 0;
 }
 
@@ -336,10 +339,85 @@ static int link_show(struct rd *rd)
return rd_exec_link(rd, link_one_show, true);
 }
 
+static int link_add_netdev(struct rd *rd)
+{
+   char *link_netdev;
+   uint32_t seq;
+
+   if (rd_no_arg(rd)) {
+   pr_err("Please provide a net device name.\n");
+   return -EINVAL;
+   }
+
+   link_netdev = rd_argv(rd);
+   rd_prepare_msg(rd, RDMA_NLDEV_CMD_NEWLINK, &seq,
+  (NLM_F_REQUEST | NLM_F_ACK));
+   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, rd->link_name);
+   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_LINK_TYPE, rd->link_type);
+   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_NDEV_NAME, link_netdev);
+   return rd_sendrecv_msg(rd, seq);
+}
+
+static int link_add_type(struct rd *rd)
+{
+   const struct rd_cmd cmds[] = {
+   { NULL, link_help},
+   { "netdev", link_add_netdev},
+   { 0 }
+   };
+
+   if (rd_no_arg(rd)) {
+   pr_err("Please provide a link type name.\n");
+   return -EINVAL;
+   }
+   rd->link_type = rd_argv(rd);
+   rd_arg_inc(rd);
+   return rd_exec_cmd(rd, cmds, "parameter");
+}
+
+static int link_add(struct rd *rd)
+{
+   const struct rd_cmd cmds[] = {
+   { NULL, link_help},
+   { "type",   link_add_type},
+   { 0 }
+   };
+
+   if (rd_no_arg(rd)) {
+   pr_err("Please provide a link name to add.\n");
+   return -EINVAL;
+   }
+   rd->link_name = rd_argv(rd);
+   rd_arg_inc(rd);
+
+   return rd_exec_cmd(rd, cmds, "parameter");
+}
+
+static int _link_del(struct rd *rd)
+{
+   uint32_t seq;
+
+   if (!rd_no_arg(rd)) {
+   pr_err("Unknown parameter %s\n", rd_argv(rd));
+   return -EINVAL;
+   }
+   rd_prepare_msg(rd, RDMA_NLDEV_CMD_DELLINK, &seq,
+  (NLM_F_REQUEST | NLM_F_ACK));
+   mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
+   return rd_sendrecv_msg(rd, seq);
+}
+
+static int link_del(struct rd *rd)
+{
+   return rd_exec_require_dev(rd, _link_del);
+}
+
 int cmd_link(struct rd *rd)
 {
const struct rd_cmd cmds[] = {
{ NULL, link_show },
+   { "add",link_add },
+   { "delete", link_del },
{ "show",   link_show },
{ "list",   link_show },
{ "help",   link_help },
diff --git a/rdma/rdma.h b/rdma/rdma.h
index 6c7f7d15..9ed9e045 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -70,6 +70,8 @@ struct rd {
bool pretty_output;
bool suppress_errors;
struct list_head filter_list;
+   char *link_name;
+   char *link_type;
 };
 
 struct rd_cmd {
-- 
2.20.1



[PATCH v3 iproute2-next 4/4] rdma: man page update for link add/delete

2019-04-03 Thread Steve Wise
Update the 'rdma link' man page with 'link add/delete' info.

Signed-off-by: Steve Wise 
---
 man/man8/rdma-link.8 | 47 
 1 file changed, 47 insertions(+)

diff --git a/man/man8/rdma-link.8 b/man/man8/rdma-link.8
index bddf3474..b3b40de7 100644
--- a/man/man8/rdma-link.8
+++ b/man/man8/rdma-link.8
@@ -22,6 +22,18 @@ rdma-link \- rdma link configuration
 .B rdma link show
 .RI "[ " DEV/PORT_INDEX " ]"
 
+.ti -8
+.B rdma link add
+.BR NAME
+.BR type
+.BR TYPE
+.BR netdev
+.BR NETDEV
+
+.ti -8
+.B rdma link delete
+.RI NAME
+
 .ti -8
 .B rdma link help
 
@@ -33,6 +45,31 @@ rdma-link \- rdma link configuration
 - specifies the RDMA link to show.
 If this argument is omitted all links are listed.
 
+.SS rdma link add NAME type TYPE netdev NETDEV - add an rdma link for the 
specified type to the network device
+.sp
+.BR NAME
+- specifies the new name of the rdma link to add
+
+.BR TYPE
+- specifies which rdma type to use.  Link types:
+.sp
+.in +8
+.B rxe
+- Soft RoCE driver
+.sp
+.B siw
+- Soft iWARP driver
+.in -8
+
+.BR NETDEV
+- specifies the network device to which the link is bound
+
+.SS rdma link delete NAME - delete an rdma link
+.PP
+.BR NAME
+- specifies the name of the rdma link to delete
+.PP
+
 .SH "EXAMPLES"
 .PP
 rdma link show
@@ -45,6 +82,16 @@ rdma link show mlx5_2/1
 Shows the state of specified rdma link.
 .RE
 .PP
+rdma link add rxe_eth0 type rxe netdev eth0
+.RS 4
+Adds a RXE link named rxe_eth0 to network device eth0
+.RE
+.PP
+rdma link del rxe_eth0
+.RS 4
+Removes RXE link rxe_eth0
+.RE
+.PP
 
 .SH SEE ALSO
 .BR rdma (8),
-- 
2.20.1



Re: [PATCH v2 iproute2-next 2/4] rdma: add helper rd_sendrecv_msg()

2019-04-03 Thread Steve Wise
On Sun, Mar 31, 2019 at 9:06 PM David Ahern  wrote:
>
> On 3/26/19 1:18 PM, Steve Wise wrote:
> > This function sends the constructed netlink message and then
> > receives the response.
> >
> > Change rd_recv_msg() to display any error messages.
> >
> > Change 'rdma dev set' to use rd_sendrecv_msg().
> >
> > Signed-off-by: Steve Wise 
> > ---
> >  rdma/dev.c   |  2 +-
> >  rdma/rdma.h  |  2 ++
> >  rdma/res.h   |  1 +
> >  rdma/utils.c | 18 ++
> >  4 files changed, 22 insertions(+), 1 deletion(-)
> >
> > diff --git a/rdma/dev.c b/rdma/dev.c
> > index 954e0015..33962520 100644
> > --- a/rdma/dev.c
> > +++ b/rdma/dev.c
> > @@ -268,7 +268,7 @@ static int dev_set_name(struct rd *rd)
> >   mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
> >   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, rd_argv(rd));
> >
> > - return rd_send_msg(rd);
> > + return rd_sendrecv_msg(rd, seq);
> >  }
> >
> >  static int dev_one_set(struct rd *rd)
> > diff --git a/rdma/rdma.h b/rdma/rdma.h
> > index 1022e9a2..6c7f7d15 100644
> > --- a/rdma/rdma.h
> > +++ b/rdma/rdma.h
> > @@ -68,6 +68,7 @@ struct rd {
> >   json_writer_t *jw;
> >   bool json_output;
> >   bool pretty_output;
> > + bool suppress_errors;
> >   struct list_head filter_list;
> >  };
> >
>
> All of the suppress_errors seems like an unrelated change.
>

Hey David,

I just realized I replied only to you directly, so I'll reply again to everyone.

It is a bug fix that showed itself when I moved the error check to
rd_recv_msg(), which Leon recommended in the last review cycle.  The
bug doesn't manifest unless this series is applied, so the suppress
errors  fix is really only needed as part of this series.

Thanks,

Steve.


Re: [PATCH v2 iproute2-next 3/4] rdma: add 'link add/delete' commands

2019-04-02 Thread Steve Wise
On Mon, Apr 1, 2019 at 1:37 AM Leon Romanovsky  wrote:
>
> On Tue, Mar 26, 2019 at 02:18:29PM -0500, Steve Wise wrote:
> > Add new 'link' subcommand 'add' and 'delete' to allow binding a soft-rdma
> > device to a netdev interface.
> >
> > EG:
> >
> > rdma link add rxe_eth0 type rxe netdev eth0
> > rdma link delete rxe_eth0
> >
> > Signed-off-by: Steve Wise 
> > ---
> >  rdma/link.c | 83 +
> >  rdma/rdma.h |  2 ++
> >  2 files changed, 85 insertions(+)
> >
> > diff --git a/rdma/link.c b/rdma/link.c
> > index 89e81b84..982c2b16 100644
> > --- a/rdma/link.c
> > +++ b/rdma/link.c
> > @@ -9,6 +9,9 @@
> >  static int link_help(struct rd *rd)
> >  {
> >   pr_out("Usage: %s link show [DEV/PORT_INDEX]\n", rd->filename);
> > + pr_out("Usage: %s link add NAME type TYPE netdev NETDEV\n",
> > +rd->filename);
> > + pr_out("Usage: %s link delete NAME\n", rd->filename);
> >   return 0;
> >  }
> >
> > @@ -336,10 +339,90 @@ static int link_show(struct rd *rd)
> >   return rd_exec_link(rd, link_one_show, true);
> >  }
> >
> > +static int link_add_netdev(struct rd *rd)
> > +{
> > + rd->link_netdev = rd_argv(rd);
> > + rd_arg_inc(rd);
> > + return 0;
> > +}
> > +
> > +static int link_add_type(struct rd *rd)
> > +{
> > + const struct rd_cmd cmds[] = {
> > + { NULL, link_help},
> > + { "netdev", link_add_netdev},
> > + { 0 }
> > + };
> > + rd->link_type = rd_argv(rd);
> > + rd_arg_inc(rd);
> > + return rd_exec_cmd(rd, cmds, "parameter");
> > +}
> > +
> > +static int link_add(struct rd *rd)
> > +{
> > + const struct rd_cmd cmds[] = {
> > + { NULL, link_help},
> > + { "type",   link_add_type},
> > + { 0 }
> > + };
> > +
> > + uint32_t seq;
> > + char *name;
> > + int ret;
> > +
> > + if (rd_no_arg(rd)) {
> > + pr_err("Please provide a link name to add.\n");
> > + return -EINVAL;
> > + }
> > + name = rd_argv(rd);
> > + rd_arg_inc(rd);
> > +
> > + ret = rd_exec_cmd(rd, cmds, "parameter");
> > + if (ret)
> > + return ret;
> > +
> > + if (!rd->link_type) {
> > + pr_err("Please provide a link type name.\n");
> > + return -EINVAL;
> > + }
> > + if (!rd->link_netdev) {
> > + pr_err("Please provide a net device name.\n");
> > + return -EINVAL;
> > + }
>
> After a little bit more thinking, the checks above are supposed to be
> part of rd_exec_cmd() execution of chains. E.g if (!rd->link_type) needs
> to be inside link_add_type().
>

Ok.  I'll respin and add your reviewed-by tags.

Thanks,
Steve


[PATCH v2 iproute2-next 2/4] rdma: add helper rd_sendrecv_msg()

2019-03-26 Thread Steve Wise
This function sends the constructed netlink message and then
receives the response.

Change rd_recv_msg() to display any error messages.

Change 'rdma dev set' to use rd_sendrecv_msg().

Signed-off-by: Steve Wise 
---
 rdma/dev.c   |  2 +-
 rdma/rdma.h  |  2 ++
 rdma/res.h   |  1 +
 rdma/utils.c | 18 ++
 4 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/rdma/dev.c b/rdma/dev.c
index 954e0015..33962520 100644
--- a/rdma/dev.c
+++ b/rdma/dev.c
@@ -268,7 +268,7 @@ static int dev_set_name(struct rd *rd)
mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, rd_argv(rd));
 
-   return rd_send_msg(rd);
+   return rd_sendrecv_msg(rd, seq);
 }
 
 static int dev_one_set(struct rd *rd)
diff --git a/rdma/rdma.h b/rdma/rdma.h
index 1022e9a2..6c7f7d15 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -68,6 +68,7 @@ struct rd {
json_writer_t *jw;
bool json_output;
bool pretty_output;
+   bool suppress_errors;
struct list_head filter_list;
 };
 
@@ -119,6 +120,7 @@ bool rd_is_string_filtered_attr(struct rd *rd, const char 
*key, const char *val,
  */
 int rd_send_msg(struct rd *rd);
 int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void *data, uint32_t seq);
+int rd_sendrecv_msg(struct rd *rd, unsigned int seq);
 void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq, uint16_t 
flags);
 int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data);
 int rd_attr_cb(const struct nlattr *attr, void *data);
diff --git a/rdma/res.h b/rdma/res.h
index b4a7e552..525171fc 100644
--- a/rdma/res.h
+++ b/rdma/res.h
@@ -31,6 +31,7 @@ int res_qp_idx_parse_cb(const struct nlmsghdr *nlh, void 
*data);
if (id) {   
   \
ret = rd_doit_index(rd, &idx);  
   \
if (ret) {  
   \
+   rd->suppress_errors = true; 
   \
ret = _res_send_idx_msg(rd, command,
   \
name##_idx_parse_cb,
   \
idx, id);   
   \
diff --git a/rdma/utils.c b/rdma/utils.c
index 1f6bf330..11ed8a73 100644
--- a/rdma/utils.c
+++ b/rdma/utils.c
@@ -693,10 +693,28 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void 
*data, unsigned int seq)
ret = mnl_cb_run(buf, ret, seq, portid, callback, data);
} while (ret > 0);
 
+   if (ret < 0 && !rd->suppress_errors)
+   perror("error");
+
mnl_socket_close(rd->nl);
return ret;
 }
 
+static int null_cb(const struct nlmsghdr *nlh, void *data)
+{
+   return MNL_CB_OK;
+}
+
+int rd_sendrecv_msg(struct rd *rd, unsigned int seq)
+{
+   int ret;
+
+   ret = rd_send_msg(rd);
+   if (!ret)
+   ret = rd_recv_msg(rd, null_cb, rd, seq);
+   return ret;
+}
+
 static struct dev_map *_dev_map_lookup(struct rd *rd, const char *dev_name)
 {
struct dev_map *dev_map;
-- 
2.20.1



[PATCH v2 iproute2-next 0/4] Dynamic rdma link creation

2019-03-26 Thread Steve Wise
This series adds rdmatool support for creating/deleting rdma links.
This will be used, mainly, by soft rdma drivers to allow adding/deleting
rdma links over netdev interfaces.  It provides the user side for
the following kernel changes merged in linux-5.1.

Changes since v1:

- move error receive checking from rd_sendrecv_msg() to rd_recv_msg().
- Add rd->suppress_errors to allow control over whether errors when
  reading a response should be ignored.  Namely: resource queries can
  get errors like "none found" when querying for resources, and this
  error should not be displayed.  So on a rd object basis, error
  suppression can be controlled.
- Rebased on rdma/for-next UABI (no need to sync rdma_netlink.h now)
- use chains of struct rd_cmd and rd_exec_cmd vs open coding the parsing
  for the 'link add' command.
- minor nit resolution
- added .mailmap file.  If this is not desired for iproute2, then please
  drop the patch.

Changes since RFC:

- add rd_sendrecv_msg() and make use of it in dev_set as well
  as the new link commands.
- fixed problems with the man pages
- changed the command line to use "netdev" as the keyword
  for the network device, do avoid confused with the ib_device
  name.
- got rid of the "type" parameter for link delete.  Also pass
  down the device index instead of the name, using the common
  rd services for validating the device name and fetching the
  index.

Thanks,

Steve.



Steve Wise (4):
  Add .mailmap file
  rdma: add helper rd_sendrecv_msg()
  rdma: add 'link add/delete' commands
  rdma: man page update for link add/delete

 .mailmap |  8 +
 man/man8/rdma-link.8 | 47 +
 rdma/dev.c   |  2 +-
 rdma/link.c  | 83 
 rdma/rdma.h  |  4 +++
 rdma/res.h   |  1 +
 rdma/utils.c | 18 ++
 7 files changed, 162 insertions(+), 1 deletion(-)
 create mode 100644 .mailmap

-- 
2.20.1



[PATCH v2 iproute2-next 1/4] Add .mailmap file

2019-03-26 Thread Steve Wise
.mailmap allows tracking multiple email addresses to the proper user name.

Signed-off-by: Steve Wise 
---
 .mailmap | 8 
 1 file changed, 8 insertions(+)
 create mode 100644 .mailmap

diff --git a/.mailmap b/.mailmap
new file mode 100644
index ..c012d3d0
--- /dev/null
+++ b/.mailmap
@@ -0,0 +1,8 @@
+#
+# This list is used by git-shortlog to fix a few botched name translations
+# in the git archive, either because the author's full name was messed up
+# and/or not always written the same way, making contributions from the
+# same person appearing not to be so or badly displayed.
+#
+Steve Wise  
+Steve Wise  
-- 
2.20.1



[PATCH v2 iproute2-next 4/4] rdma: man page update for link add/delete

2019-03-26 Thread Steve Wise
Update the 'rdma link' man page with 'link add/delete' info.

Signed-off-by: Steve Wise 
---
 man/man8/rdma-link.8 | 47 
 1 file changed, 47 insertions(+)

diff --git a/man/man8/rdma-link.8 b/man/man8/rdma-link.8
index bddf3474..b3b40de7 100644
--- a/man/man8/rdma-link.8
+++ b/man/man8/rdma-link.8
@@ -22,6 +22,18 @@ rdma-link \- rdma link configuration
 .B rdma link show
 .RI "[ " DEV/PORT_INDEX " ]"
 
+.ti -8
+.B rdma link add
+.BR NAME
+.BR type
+.BR TYPE
+.BR netdev
+.BR NETDEV
+
+.ti -8
+.B rdma link delete
+.RI NAME
+
 .ti -8
 .B rdma link help
 
@@ -33,6 +45,31 @@ rdma-link \- rdma link configuration
 - specifies the RDMA link to show.
 If this argument is omitted all links are listed.
 
+.SS rdma link add NAME type TYPE netdev NETDEV - add an rdma link for the 
specified type to the network device
+.sp
+.BR NAME
+- specifies the new name of the rdma link to add
+
+.BR TYPE
+- specifies which rdma type to use.  Link types:
+.sp
+.in +8
+.B rxe
+- Soft RoCE driver
+.sp
+.B siw
+- Soft iWARP driver
+.in -8
+
+.BR NETDEV
+- specifies the network device to which the link is bound
+
+.SS rdma link delete NAME - delete an rdma link
+.PP
+.BR NAME
+- specifies the name of the rdma link to delete
+.PP
+
 .SH "EXAMPLES"
 .PP
 rdma link show
@@ -45,6 +82,16 @@ rdma link show mlx5_2/1
 Shows the state of specified rdma link.
 .RE
 .PP
+rdma link add rxe_eth0 type rxe netdev eth0
+.RS 4
+Adds a RXE link named rxe_eth0 to network device eth0
+.RE
+.PP
+rdma link del rxe_eth0
+.RS 4
+Removes RXE link rxe_eth0
+.RE
+.PP
 
 .SH SEE ALSO
 .BR rdma (8),
-- 
2.20.1



[PATCH v2 iproute2-next 3/4] rdma: add 'link add/delete' commands

2019-03-26 Thread Steve Wise
Add new 'link' subcommand 'add' and 'delete' to allow binding a soft-rdma
device to a netdev interface.

EG:

rdma link add rxe_eth0 type rxe netdev eth0
rdma link delete rxe_eth0

Signed-off-by: Steve Wise 
---
 rdma/link.c | 83 +
 rdma/rdma.h |  2 ++
 2 files changed, 85 insertions(+)

diff --git a/rdma/link.c b/rdma/link.c
index 89e81b84..982c2b16 100644
--- a/rdma/link.c
+++ b/rdma/link.c
@@ -9,6 +9,9 @@
 static int link_help(struct rd *rd)
 {
pr_out("Usage: %s link show [DEV/PORT_INDEX]\n", rd->filename);
+   pr_out("Usage: %s link add NAME type TYPE netdev NETDEV\n",
+  rd->filename);
+   pr_out("Usage: %s link delete NAME\n", rd->filename);
return 0;
 }
 
@@ -336,10 +339,90 @@ static int link_show(struct rd *rd)
return rd_exec_link(rd, link_one_show, true);
 }
 
+static int link_add_netdev(struct rd *rd)
+{
+   rd->link_netdev = rd_argv(rd);
+   rd_arg_inc(rd);
+   return 0;
+}
+
+static int link_add_type(struct rd *rd)
+{
+   const struct rd_cmd cmds[] = {
+   { NULL, link_help},
+   { "netdev", link_add_netdev},
+   { 0 }
+   };
+   rd->link_type = rd_argv(rd);
+   rd_arg_inc(rd);
+   return rd_exec_cmd(rd, cmds, "parameter");
+}
+
+static int link_add(struct rd *rd)
+{
+   const struct rd_cmd cmds[] = {
+   { NULL, link_help},
+   { "type",   link_add_type},
+   { 0 }
+   };
+
+   uint32_t seq;
+   char *name;
+   int ret;
+
+   if (rd_no_arg(rd)) {
+   pr_err("Please provide a link name to add.\n");
+   return -EINVAL;
+   }
+   name = rd_argv(rd);
+   rd_arg_inc(rd);
+
+   ret = rd_exec_cmd(rd, cmds, "parameter");
+   if (ret)
+   return ret;
+
+   if (!rd->link_type) {
+   pr_err("Please provide a link type name.\n");
+   return -EINVAL;
+   }
+   if (!rd->link_netdev) {
+   pr_err("Please provide a net device name.\n");
+   return -EINVAL;
+   }
+
+   rd_prepare_msg(rd, RDMA_NLDEV_CMD_NEWLINK, &seq,
+  (NLM_F_REQUEST | NLM_F_ACK));
+   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, name);
+   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_LINK_TYPE, rd->link_type);
+   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_NDEV_NAME, rd->link_netdev);
+   return rd_sendrecv_msg(rd, seq);
+}
+
+static int _link_del(struct rd *rd)
+{
+   uint32_t seq;
+
+   if (!rd_no_arg(rd)) {
+   pr_err("Unknown parameter %s\n", rd_argv(rd));
+   return -EINVAL;
+   }
+   rd_prepare_msg(rd, RDMA_NLDEV_CMD_DELLINK, &seq,
+  (NLM_F_REQUEST | NLM_F_ACK));
+   mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
+   return rd_sendrecv_msg(rd, seq);
+}
+
+static int link_del(struct rd *rd)
+{
+   return rd_exec_require_dev(rd, _link_del);
+}
+
 int cmd_link(struct rd *rd)
 {
const struct rd_cmd cmds[] = {
{ NULL, link_show },
+   { "add",link_add },
+   { "delete", link_del },
{ "show",   link_show },
{ "list",   link_show },
{ "help",   link_help },
diff --git a/rdma/rdma.h b/rdma/rdma.h
index 6c7f7d15..070c8ddf 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -70,6 +70,8 @@ struct rd {
bool pretty_output;
bool suppress_errors;
struct list_head filter_list;
+   char *link_type;
+   char *link_netdev;
 };
 
 struct rd_cmd {
-- 
2.20.1



Re: [PATCH v1 iproute2-next 1/4] rdma: add helper rd_sendrecv_msg()

2019-03-06 Thread Steve Wise


On 3/4/2019 8:13 AM, Steve Wise wrote:
> Hey Leon, adding this to rd_recv_msg():
>
> @@ -693,10 +693,28 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void
> *data, unsigned int seq)
> ret = mnl_cb_run(buf, ret, seq, portid, callback, data);
> } while (ret > 0);
>
> +   if (ret < 0)
> +   perror(NULL);
> +
> mnl_socket_close(rd->nl);
> return ret;
>  }
>
> Results in unexpected errors being logged when doing a query such as:
>
> [root@stevo1 iproute2]# ./rdma/rdma res show qp lqpn 176
> error: Invalid argument
> link mlx5_0/1 lqpn 176 type UD state RTS sq-psn 0 comm [ib_core]
> error: Invalid argument
> error: No such file or directory
> error: Invalid argument
> error: No such file or directory
>
> It appears the "invalid argument" errors are due to rdmatool sending a
> RDMA_NLDEV_CMD_RES_QP_GET command using the doit kernel method to allow
> querying for just a QP with lqpn = 176.  However, rdmatool isn't passing a
> port index in the messages that generate the "invalid argument" error from
> the kernel.  IE you must provide a device index and port index when issuing
> a doit command vs a dumpit command.  I think. 
>
> This error was not found because rd_recv_msg() never displayed any errors
> previously.  Further, the RES_FUNC() massive macro has code that will retry
> a failed doit call with a dumpit call.  I think _##name() should distinguish
> between failures reported by the kernel doit function vs failures because no
> doit function exists.  Not sure how to support that.
>
>
> static inline int _##name(struct rd *rd)
> \
> {
> \
> uint32_t idx;
> \
> int ret;
> \
> if (id) {
> \
> ret = rd_doit_index(rd, &idx);
> \
> if (ret) {
> \
> ret = _res_send_idx_msg(rd, command,
> \
> name##_idx_parse_cb,
> \
> idx, id);
> \
> if (!ret)
> \
> return ret;
> \
> /* Fallback for old systems without .doit
> callbacks */ \
> }
> \
> }
> \
> return _res_send_msg(rd, command, name##_parse_cb);
> \
> }
> \
>
>
>
> The "no such file or dir" errors are being returned because, in my setup,
> there are 2 other links that do not have lqpn 176.   So there are 2 issues
> uncovered by adding generic printing of errors in rd_recv_msg()
>
> 1) the doit code in rdmatool is generating requests for a doit method in the
> kernel w/o providing a port index.
> 2) some paths in rdmatool should not print "benign" errors like filtering on
> a GET command causing a "does not exist" error returned by the kernel doit
> func.
>
> #1 is a bug, IMO.  Can you propose a fix?
> #2 could be solved by adding an error callback func passed to rd_recv_msg().
> Then the RES_FUNC() functions could parse errors like "no such file or dir"
> when doing a filtered query and silently drop them.  And functions like
> dev_set_name() would display all errors returned because there are no
> expected errors other than "success".
>
> Steve.
>

Hey Leon, you've been quiet. :)   Thoughts?

Thanks,

Steve.




RE: [PATCH v1 iproute2-next 1/4] rdma: add helper rd_sendrecv_msg()

2019-03-04 Thread Steve Wise



> > >
> > > On 2/23/2019 3:26 AM, Leon Romanovsky wrote:
> > > > On Thu, Feb 21, 2019 at 08:19:03AM -0800, Steve Wise wrote:
> > > >> This function sends the constructed netlink message and then
> > > >> receives the response, displaying any error text.
> > > >>
> > > >> Change 'rdma dev set' to use it.
> > > >>
> > > >> Signed-off-by: Steve Wise 
> > > >> ---
> > > >>  rdma/dev.c   |  2 +-
> > > >>  rdma/rdma.h  |  1 +
> > > >>  rdma/utils.c | 21 +
> > > >>  3 files changed, 23 insertions(+), 1 deletion(-)
> > > >>
> > > >> diff --git a/rdma/dev.c b/rdma/dev.c
> > > >> index 60ff4b31e320..d2949c378f08 100644
> > > >> --- a/rdma/dev.c
> > > >> +++ b/rdma/dev.c
> > > >> @@ -273,7 +273,7 @@ static int dev_set_name(struct rd *rd)
> > > >>mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd-
> > > >dev_idx);
> > > >>mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME,
> > > rd_argv(rd));
> > > >>
> > > >> -  return rd_send_msg(rd);
> > > >> +  return rd_sendrecv_msg(rd, seq);
> > > >>  }
> > > >>
> > > >>  static int dev_one_set(struct rd *rd)
> > > >> diff --git a/rdma/rdma.h b/rdma/rdma.h
> > > >> index 547bb5749a39..20be2f12c4f8 100644
> > > >> --- a/rdma/rdma.h
> > > >> +++ b/rdma/rdma.h
> > > >> @@ -115,6 +115,7 @@ bool rd_check_is_key_exist(struct rd *rd, const
> > > char *key);
> > > >>   */
> > > >>  int rd_send_msg(struct rd *rd);
> > > >>  int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void *data,
uint32_t
> > > seq);
> > > >> +int rd_sendrecv_msg(struct rd *rd, unsigned int seq);
> > > >>  void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq,
> > uint16_t
> > > flags);
> > > >>  int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data);
> > > >>  int rd_attr_cb(const struct nlattr *attr, void *data);
> > > >> diff --git a/rdma/utils.c b/rdma/utils.c
> > > >> index 069d44fece10..a6f2826c9605 100644
> > > >> --- a/rdma/utils.c
> > > >> +++ b/rdma/utils.c
> > > >> @@ -664,6 +664,27 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t
> callback,
> > > void *data, unsigned int seq)
> > > >>return ret;
> > > >>  }
> > > >>
> > > >> +static int null_cb(const struct nlmsghdr *nlh, void *data)
> > > >> +{
> > > >> +  return MNL_CB_OK;
> > > >> +}
> > > >> +
> > > >> +int rd_sendrecv_msg(struct rd *rd, unsigned int seq)
> > > >> +{
> > > >> +  int ret;
> > > >> +
> > > >> +  ret = rd_send_msg(rd);
> > > >> +  if (ret) {
> > > >> +  perror(NULL);
> > > > This is more or less already done in rd_send_msg() and that function
> > > > prints something in case of execution error. So the missing piece
> > > > is to update rd_recv_msg(), so all places will "magically" print
errors
> > > > and not only dev_set_name().
> > >
> > > Yea ok.
> > >
> >
> > dev_set_name() doesn't call rd_recv_msg().  So you're suggesting I fix
up
> > rd_recv_msg() to display errors and make dev_set_name() call
> rd_recv_msg()
> > with the null_cb function?  You sure that's the way to go?
> 
> I'm sure that we need to fix dev_set_name(), everything else I'm not sure.
> 
> Thanks

Hey Leon, adding this to rd_recv_msg():

@@ -693,10 +693,28 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void
*data, unsigned int seq)
ret = mnl_cb_run(buf, ret, seq, portid, callback, data);
} while (ret > 0);

+   if (ret < 0)
+   perror(NULL);
+
mnl_socket_close(rd->nl);
return ret;
 }

Results in unexpected errors being logged when doing a query such as:

[root@stevo1 iproute2]# ./rdma/rdma res show qp lqpn 176
error: Invalid argument
link mlx5_0/1 lqpn 176 type UD state RTS sq-psn 0 comm [ib_core]
error: Invalid argument
error: No such file or directory
error: Invalid argument
error: No such file or directory

It appears the "invalid argument&quo

Re: [PATCH v1 iproute2-next 1/4] rdma: add helper rd_sendrecv_msg()

2019-02-28 Thread Steve Wise


On 2/28/2019 1:56 PM, Leon Romanovsky wrote:
> On Thu, Feb 28, 2019 at 01:41:20PM -0600, Steve Wise wrote:
>> On 2/23/2019 3:26 AM, Leon Romanovsky wrote:
>>> On Thu, Feb 21, 2019 at 08:19:03AM -0800, Steve Wise wrote:
>>>> This function sends the constructed netlink message and then
>>>> receives the response, displaying any error text.
>>>>
>>>> Change 'rdma dev set' to use it.
>>>>
>>>> Signed-off-by: Steve Wise 
>>>> ---
>>>>  rdma/dev.c   |  2 +-
>>>>  rdma/rdma.h  |  1 +
>>>>  rdma/utils.c | 21 +
>>>>  3 files changed, 23 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/rdma/dev.c b/rdma/dev.c
>>>> index 60ff4b31e320..d2949c378f08 100644
>>>> --- a/rdma/dev.c
>>>> +++ b/rdma/dev.c
>>>> @@ -273,7 +273,7 @@ static int dev_set_name(struct rd *rd)
>>>>mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
>>>>mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, rd_argv(rd));
>>>>
>>>> -  return rd_send_msg(rd);
>>>> +  return rd_sendrecv_msg(rd, seq);
>>>>  }
>>>>
>>>>  static int dev_one_set(struct rd *rd)
>>>> diff --git a/rdma/rdma.h b/rdma/rdma.h
>>>> index 547bb5749a39..20be2f12c4f8 100644
>>>> --- a/rdma/rdma.h
>>>> +++ b/rdma/rdma.h
>>>> @@ -115,6 +115,7 @@ bool rd_check_is_key_exist(struct rd *rd, const char 
>>>> *key);
>>>>   */
>>>>  int rd_send_msg(struct rd *rd);
>>>>  int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void *data, uint32_t 
>>>> seq);
>>>> +int rd_sendrecv_msg(struct rd *rd, unsigned int seq);
>>>>  void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq, uint16_t 
>>>> flags);
>>>>  int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data);
>>>>  int rd_attr_cb(const struct nlattr *attr, void *data);
>>>> diff --git a/rdma/utils.c b/rdma/utils.c
>>>> index 069d44fece10..a6f2826c9605 100644
>>>> --- a/rdma/utils.c
>>>> +++ b/rdma/utils.c
>>>> @@ -664,6 +664,27 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, 
>>>> void *data, unsigned int seq)
>>>>return ret;
>>>>  }
>>>>
>>>> +static int null_cb(const struct nlmsghdr *nlh, void *data)
>>>> +{
>>>> +  return MNL_CB_OK;
>>>> +}
>>>> +
>>>> +int rd_sendrecv_msg(struct rd *rd, unsigned int seq)
>>>> +{
>>>> +  int ret;
>>>> +
>>>> +  ret = rd_send_msg(rd);
>>>> +  if (ret) {
>>>> +  perror(NULL);
>>> This is more or less already done in rd_send_msg() and that function
>>> prints something in case of execution error. So the missing piece
>>> is to update rd_recv_msg(), so all places will "magically" print errors
>>> and not only dev_set_name().
>> Hey Leon,
>>
>> Displaying errors in rd_recv_msg() as you suggested:
>>
>> @@ -693,10 +693,28 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback,
>> void *data, unsigned int seq)
>>     ret = mnl_cb_run(buf, ret, seq, portid, callback, data);
>>     } while (ret > 0);
>>
>> +   if (ret < 0)
>> +   perror(NULL);
>> +
>>     mnl_socket_close(rd->nl);
>>     return ret;
>>  }
>>
>> Causes problems when doing filtered dumps:
>>
>> [root@stevo1 iproute2]# ./rdma/rdma res show qp
>> link mlx5_0/1 lqpn 1 type GSI state RTS sq-psn 0 comm [ib_core]
>> link mlx5_0/1 lqpn 176 type UD state RTS sq-psn 0 comm [ib_core]
>> link mlx5_1/1 lqpn 1 type GSI state RTS sq-psn 0 comm [ib_core]
>> link mlx5_1/1 lqpn 2224 type UD state RTS sq-psn 0 comm [ib_core]
>> link rxe_foo0/1 lqpn 1 type GSI state RTS sq-psn 0 comm [ib_core]
>> [root@stevo1 iproute2]# ./rdma/rdma res show qp  lqpn 2224
>> Invalid argument
>> No such file or directory
> Why do we it? We are supposed to check "invalid argument" before sending
> message and are not supposed to see perror(). I'm not near code right
> now, so most probably wrong in my assumption.

I'm still investigating, but I _think_ it is because mnl_run_cb(),
called by rd_recv_msg() returns the return code from the callback
function.  So the resource callback functions must be returning an error
when a returned resource doesn't match the filter.  Maybe.  It is
related to doing a rdma res dump where you specify a filter...




Re: [PATCH v1 iproute2-next 1/4] rdma: add helper rd_sendrecv_msg()

2019-02-28 Thread Steve Wise


On 2/23/2019 3:26 AM, Leon Romanovsky wrote:
> On Thu, Feb 21, 2019 at 08:19:03AM -0800, Steve Wise wrote:
>> This function sends the constructed netlink message and then
>> receives the response, displaying any error text.
>>
>> Change 'rdma dev set' to use it.
>>
>> Signed-off-by: Steve Wise 
>> ---
>>  rdma/dev.c   |  2 +-
>>  rdma/rdma.h  |  1 +
>>  rdma/utils.c | 21 +
>>  3 files changed, 23 insertions(+), 1 deletion(-)
>>
>> diff --git a/rdma/dev.c b/rdma/dev.c
>> index 60ff4b31e320..d2949c378f08 100644
>> --- a/rdma/dev.c
>> +++ b/rdma/dev.c
>> @@ -273,7 +273,7 @@ static int dev_set_name(struct rd *rd)
>>  mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
>>  mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, rd_argv(rd));
>>
>> -return rd_send_msg(rd);
>> +return rd_sendrecv_msg(rd, seq);
>>  }
>>
>>  static int dev_one_set(struct rd *rd)
>> diff --git a/rdma/rdma.h b/rdma/rdma.h
>> index 547bb5749a39..20be2f12c4f8 100644
>> --- a/rdma/rdma.h
>> +++ b/rdma/rdma.h
>> @@ -115,6 +115,7 @@ bool rd_check_is_key_exist(struct rd *rd, const char 
>> *key);
>>   */
>>  int rd_send_msg(struct rd *rd);
>>  int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void *data, uint32_t seq);
>> +int rd_sendrecv_msg(struct rd *rd, unsigned int seq);
>>  void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq, uint16_t 
>> flags);
>>  int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data);
>>  int rd_attr_cb(const struct nlattr *attr, void *data);
>> diff --git a/rdma/utils.c b/rdma/utils.c
>> index 069d44fece10..a6f2826c9605 100644
>> --- a/rdma/utils.c
>> +++ b/rdma/utils.c
>> @@ -664,6 +664,27 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void 
>> *data, unsigned int seq)
>>  return ret;
>>  }
>>
>> +static int null_cb(const struct nlmsghdr *nlh, void *data)
>> +{
>> +return MNL_CB_OK;
>> +}
>> +
>> +int rd_sendrecv_msg(struct rd *rd, unsigned int seq)
>> +{
>> +int ret;
>> +
>> +ret = rd_send_msg(rd);
>> +if (ret) {
>> +perror(NULL);
> This is more or less already done in rd_send_msg() and that function
> prints something in case of execution error. So the missing piece
> is to update rd_recv_msg(), so all places will "magically" print errors
> and not only dev_set_name().

Hey Leon,

Displaying errors in rd_recv_msg() as you suggested:

@@ -693,10 +693,28 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback,
void *data, unsigned int seq)
    ret = mnl_cb_run(buf, ret, seq, portid, callback, data);
    } while (ret > 0);

+   if (ret < 0)
+   perror(NULL);
+
    mnl_socket_close(rd->nl);
    return ret;
 }

Causes problems when doing filtered dumps:

[root@stevo1 iproute2]# ./rdma/rdma res show qp
link mlx5_0/1 lqpn 1 type GSI state RTS sq-psn 0 comm [ib_core]
link mlx5_0/1 lqpn 176 type UD state RTS sq-psn 0 comm [ib_core]
link mlx5_1/1 lqpn 1 type GSI state RTS sq-psn 0 comm [ib_core]
link mlx5_1/1 lqpn 2224 type UD state RTS sq-psn 0 comm [ib_core]
link rxe_foo0/1 lqpn 1 type GSI state RTS sq-psn 0 comm [ib_core]
[root@stevo1 iproute2]# ./rdma/rdma res show qp  lqpn 2224
Invalid argument
No such file or directory
Invalid argument
link mlx5_1/1 lqpn 2224 type UD state RTS sq-psn 0 comm [ib_core]
Invalid argument
No such file or directory
[root@stevo1 iproute2]#


Steve.



RE: [PATCH v1 iproute2-next 3/4] rdma: add 'link add/delete' commands

2019-02-27 Thread Steve Wise



> -Original Message-
> From: Steve Wise 
> Sent: Tuesday, February 26, 2019 11:25 AM
> To: Leon Romanovsky 
> Cc: dsah...@gmail.com; step...@networkplumber.org;
> netdev@vger.kernel.org; linux-r...@vger.kernel.org
> Subject: Re: [PATCH v1 iproute2-next 3/4] rdma: add 'link add/delete'
> commands
> 
> 
> On 2/23/2019 3:43 AM, Leon Romanovsky wrote:
> > On Thu, Feb 21, 2019 at 08:19:12AM -0800, Steve Wise wrote:
> >> Add new 'link' subcommand 'add' and 'delete' to allow binding a soft-
> rdma
> >> device to a netdev interface.
> >>
> >> EG:
> >>
> >> rdma link add rxe_eth0 type rxe netdev eth0
> >> rdma link delete rxe_eth0
> >>
> >> Signed-off-by: Steve Wise 
> >> ---
> >>  rdma/link.c  | 67
> 
> >>  rdma/rdma.h  |  1 +
> >>  rdma/utils.c |  2 +-
> >>  3 files changed, 69 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/rdma/link.c b/rdma/link.c
> >> index c064be627be2..afaf19663728 100644
> >> --- a/rdma/link.c
> >> +++ b/rdma/link.c
> >> @@ -14,6 +14,9 @@
> >>  static int link_help(struct rd *rd)
> >>  {
> >>pr_out("Usage: %s link show [DEV/PORT_INDEX]\n", rd->filename);
> >> +  pr_out("Usage: %s link add NAME type TYPE netdev NETDEV\n",
> >> + rd->filename);
> >> +  pr_out("Usage: %s link delete NAME\n", rd->filename);
> >>return 0;
> >>  }
> >>
> >> @@ -341,10 +344,74 @@ static int link_show(struct rd *rd)
> >>return rd_exec_link(rd, link_one_show, true);
> >>  }
> >>
> >> +static int link_add(struct rd *rd)
> >> +{
> >> +  char *name;
> >> +  char *type = NULL;
> >> +  char *dev = NULL;
> >> +  uint32_t seq;
> >> +
> >> +  if (rd_no_arg(rd)) {
> >> +  pr_err("No link name was supplied\n");
> > I think that it is better to have instruction message and not error
> > message: "Please provide ...".
> 
> 
> Ok.  Perhaps a new utility rd_exec_require_link() can be created?
> 
> 
> >> +  return -EINVAL;
> >> +  }
> >> +  name = rd_argv(rd);
> >> +  rd_arg_inc(rd);
> >> +  while (!rd_no_arg(rd)) {
> >> +  if (rd_argv_match(rd, "type")) {
> >> +  rd_arg_inc(rd);
> >> +  type = rd_argv(rd);
> >> +  } else if (rd_argv_match(rd, "netdev")) {
> >> +  rd_arg_inc(rd);
> >> +  dev = rd_argv(rd);
> >> +  } else {
> >> +  pr_err("Invalid parameter %s\n", rd_argv(rd));
> >> +  return -EINVAL;
> >> +  }
> >> +  rd_arg_inc(rd);
> > Please use chains of struct rd_cmd and rd_exec_cmd() instead of
> > open-coding parser.
> 
> 
> Ok.  Like your recently merged series did...
> 
> 
> >> +  }
> >> +  if (!type) {
> >> +  pr_err("No type was supplied\n");
> >> +  return -EINVAL;
> > General parser handle it.
> 
> 
> Ok.
> 
> 
> >> +  }
> >> +  if (!dev) {
> >> +  pr_err("No net device was supplied\n");
> >> +  return -EINVAL;
> >> +  }
> > rd_exec_require_dev() ???
> 
> 
> Looks like I can use that.  I'll try it.

Actually, in the above code, the variable 'dev' is the netdev string, and
that is the 3rd parameter to the 'link add' command, so it isn't appropriate
for rd_exec_require_dev().

Steve




RE: [PATCH v1 iproute2-next 1/4] rdma: add helper rd_sendrecv_msg()

2019-02-27 Thread Steve Wise



> -Original Message-
> From: Steve Wise 
> Sent: Tuesday, February 26, 2019 11:14 AM
> To: Leon Romanovsky 
> Cc: dsah...@gmail.com; step...@networkplumber.org;
> netdev@vger.kernel.org; linux-r...@vger.kernel.org
> Subject: Re: [PATCH v1 iproute2-next 1/4] rdma: add helper
> rd_sendrecv_msg()
> 
> 
> On 2/23/2019 3:26 AM, Leon Romanovsky wrote:
> > On Thu, Feb 21, 2019 at 08:19:03AM -0800, Steve Wise wrote:
> >> This function sends the constructed netlink message and then
> >> receives the response, displaying any error text.
> >>
> >> Change 'rdma dev set' to use it.
> >>
> >> Signed-off-by: Steve Wise 
> >> ---
> >>  rdma/dev.c   |  2 +-
> >>  rdma/rdma.h  |  1 +
> >>  rdma/utils.c | 21 +
> >>  3 files changed, 23 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/rdma/dev.c b/rdma/dev.c
> >> index 60ff4b31e320..d2949c378f08 100644
> >> --- a/rdma/dev.c
> >> +++ b/rdma/dev.c
> >> @@ -273,7 +273,7 @@ static int dev_set_name(struct rd *rd)
> >>mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd-
> >dev_idx);
> >>mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME,
> rd_argv(rd));
> >>
> >> -  return rd_send_msg(rd);
> >> +  return rd_sendrecv_msg(rd, seq);
> >>  }
> >>
> >>  static int dev_one_set(struct rd *rd)
> >> diff --git a/rdma/rdma.h b/rdma/rdma.h
> >> index 547bb5749a39..20be2f12c4f8 100644
> >> --- a/rdma/rdma.h
> >> +++ b/rdma/rdma.h
> >> @@ -115,6 +115,7 @@ bool rd_check_is_key_exist(struct rd *rd, const
> char *key);
> >>   */
> >>  int rd_send_msg(struct rd *rd);
> >>  int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void *data, uint32_t
> seq);
> >> +int rd_sendrecv_msg(struct rd *rd, unsigned int seq);
> >>  void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq,
uint16_t
> flags);
> >>  int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data);
> >>  int rd_attr_cb(const struct nlattr *attr, void *data);
> >> diff --git a/rdma/utils.c b/rdma/utils.c
> >> index 069d44fece10..a6f2826c9605 100644
> >> --- a/rdma/utils.c
> >> +++ b/rdma/utils.c
> >> @@ -664,6 +664,27 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback,
> void *data, unsigned int seq)
> >>return ret;
> >>  }
> >>
> >> +static int null_cb(const struct nlmsghdr *nlh, void *data)
> >> +{
> >> +  return MNL_CB_OK;
> >> +}
> >> +
> >> +int rd_sendrecv_msg(struct rd *rd, unsigned int seq)
> >> +{
> >> +  int ret;
> >> +
> >> +  ret = rd_send_msg(rd);
> >> +  if (ret) {
> >> +  perror(NULL);
> > This is more or less already done in rd_send_msg() and that function
> > prints something in case of execution error. So the missing piece
> > is to update rd_recv_msg(), so all places will "magically" print errors
> > and not only dev_set_name().
> 
> Yea ok.
> 

dev_set_name() doesn't call rd_recv_msg().  So you're suggesting I fix up
rd_recv_msg() to display errors and make dev_set_name() call rd_recv_msg()
with the null_cb function?  You sure that's the way to go?






Re: [PATCH v1 iproute2-next 1/4] rdma: add helper rd_sendrecv_msg()

2019-02-26 Thread Steve Wise


On 2/26/2019 1:16 PM, Leon Romanovsky wrote:
> On Tue, Feb 26, 2019 at 11:19:12AM -0600, Steve Wise wrote:
>> On 2/23/2019 3:31 AM, Leon Romanovsky wrote:
>>> On Sat, Feb 23, 2019 at 11:26:15AM +0200, Leon Romanovsky wrote:
>>>> On Thu, Feb 21, 2019 at 08:19:03AM -0800, Steve Wise wrote:
>>>>> This function sends the constructed netlink message and then
>>>>> receives the response, displaying any error text.
>>>>>
>>>>> Change 'rdma dev set' to use it.
>>>>>
>>>>> Signed-off-by: Steve Wise 
>>>>> ---
>>>>>  rdma/dev.c   |  2 +-
>>>>>  rdma/rdma.h  |  1 +
>>>>>  rdma/utils.c | 21 +
>>>>>  3 files changed, 23 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/rdma/dev.c b/rdma/dev.c
>>>>> index 60ff4b31e320..d2949c378f08 100644
>>>>> --- a/rdma/dev.c
>>>>> +++ b/rdma/dev.c
>>>>> @@ -273,7 +273,7 @@ static int dev_set_name(struct rd *rd)
>>>>>   mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
>>>>>   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, rd_argv(rd));
>>>>>
>>>>> - return rd_send_msg(rd);
>>>>> + return rd_sendrecv_msg(rd, seq);
>>>>>  }
>>>>>
>>>>>  static int dev_one_set(struct rd *rd)
>>>>> diff --git a/rdma/rdma.h b/rdma/rdma.h
>>>>> index 547bb5749a39..20be2f12c4f8 100644
>>>>> --- a/rdma/rdma.h
>>>>> +++ b/rdma/rdma.h
>>>>> @@ -115,6 +115,7 @@ bool rd_check_is_key_exist(struct rd *rd, const char 
>>>>> *key);
>>>>>   */
>>>>>  int rd_send_msg(struct rd *rd);
>>>>>  int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void *data, uint32_t 
>>>>> seq);
>>>>> +int rd_sendrecv_msg(struct rd *rd, unsigned int seq);
>>>>>  void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq, uint16_t 
>>>>> flags);
>>>>>  int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data);
>>>>>  int rd_attr_cb(const struct nlattr *attr, void *data);
>>>>> diff --git a/rdma/utils.c b/rdma/utils.c
>>>>> index 069d44fece10..a6f2826c9605 100644
>>>>> --- a/rdma/utils.c
>>>>> +++ b/rdma/utils.c
>>>>> @@ -664,6 +664,27 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, 
>>>>> void *data, unsigned int seq)
>>>>>   return ret;
>>>>>  }
>>>>>
>>>>> +static int null_cb(const struct nlmsghdr *nlh, void *data)
>>>>> +{
>>>>> + return MNL_CB_OK;
>>>>> +}
>>>>> +
>>>>> +int rd_sendrecv_msg(struct rd *rd, unsigned int seq)
>>>>> +{
>>>>> + int ret;
>>>>> +
>>>>> + ret = rd_send_msg(rd);
>>>>> + if (ret) {
>>>>> + perror(NULL);
>>>> This is more or less already done in rd_send_msg() and that function
>>>> prints something in case of execution error. So the missing piece
>>>> is to update rd_recv_msg(), so all places will "magically" print errors
>>>> and not only dev_set_name().
>>>>
>>>>> + goto out;
>>>>> + }
>>>>> + ret = rd_recv_msg(rd, null_cb, rd, seq);
>>> Will this "null_cb" work for all send/recv flows or only in flows where
>>> response can be error only?
>>
>> Only those flows where no nl attributes are expected to be returned.
>>
>>
>>> Will we need this recv_msg if we implement
>>> extack support?
>>
>> I'm not sure how extack works.  Do you know?
> I can't say that :)


We can change things if/when we support extack.

Stevo.




Re: [PATCH v1 iproute2-next 3/4] rdma: add 'link add/delete' commands

2019-02-26 Thread Steve Wise


On 2/23/2019 3:43 AM, Leon Romanovsky wrote:
> On Thu, Feb 21, 2019 at 08:19:12AM -0800, Steve Wise wrote:
>> Add new 'link' subcommand 'add' and 'delete' to allow binding a soft-rdma
>> device to a netdev interface.
>>
>> EG:
>>
>> rdma link add rxe_eth0 type rxe netdev eth0
>> rdma link delete rxe_eth0
>>
>> Signed-off-by: Steve Wise 
>> ---
>>  rdma/link.c  | 67 
>> 
>>  rdma/rdma.h  |  1 +
>>  rdma/utils.c |  2 +-
>>  3 files changed, 69 insertions(+), 1 deletion(-)
>>
>> diff --git a/rdma/link.c b/rdma/link.c
>> index c064be627be2..afaf19663728 100644
>> --- a/rdma/link.c
>> +++ b/rdma/link.c
>> @@ -14,6 +14,9 @@
>>  static int link_help(struct rd *rd)
>>  {
>>  pr_out("Usage: %s link show [DEV/PORT_INDEX]\n", rd->filename);
>> +pr_out("Usage: %s link add NAME type TYPE netdev NETDEV\n",
>> +   rd->filename);
>> +pr_out("Usage: %s link delete NAME\n", rd->filename);
>>  return 0;
>>  }
>>
>> @@ -341,10 +344,74 @@ static int link_show(struct rd *rd)
>>  return rd_exec_link(rd, link_one_show, true);
>>  }
>>
>> +static int link_add(struct rd *rd)
>> +{
>> +char *name;
>> +char *type = NULL;
>> +char *dev = NULL;
>> +uint32_t seq;
>> +
>> +if (rd_no_arg(rd)) {
>> +pr_err("No link name was supplied\n");
> I think that it is better to have instruction message and not error
> message: "Please provide ...".


Ok.  Perhaps a new utility rd_exec_require_link() can be created?


>> +return -EINVAL;
>> +}
>> +name = rd_argv(rd);
>> +rd_arg_inc(rd);
>> +while (!rd_no_arg(rd)) {
>> +if (rd_argv_match(rd, "type")) {
>> +rd_arg_inc(rd);
>> +type = rd_argv(rd);
>> +} else if (rd_argv_match(rd, "netdev")) {
>> +rd_arg_inc(rd);
>> +dev = rd_argv(rd);
>> +} else {
>> +pr_err("Invalid parameter %s\n", rd_argv(rd));
>> +return -EINVAL;
>> +}
>> +rd_arg_inc(rd);
> Please use chains of struct rd_cmd and rd_exec_cmd() instead of
> open-coding parser.


Ok.  Like your recently merged series did...


>> +}
>> +if (!type) {
>> +pr_err("No type was supplied\n");
>> +return -EINVAL;
> General parser handle it.


Ok.


>> +}
>> +if (!dev) {
>> +pr_err("No net device was supplied\n");
>> +return -EINVAL;
>> +}
> rd_exec_require_dev() ???


Looks like I can use that.  I'll try it.


Thanks!


Steve.




Re: [PATCH v1 iproute2-next 1/4] rdma: add helper rd_sendrecv_msg()

2019-02-26 Thread Steve Wise


On 2/23/2019 3:31 AM, Leon Romanovsky wrote:
> On Sat, Feb 23, 2019 at 11:26:15AM +0200, Leon Romanovsky wrote:
>> On Thu, Feb 21, 2019 at 08:19:03AM -0800, Steve Wise wrote:
>>> This function sends the constructed netlink message and then
>>> receives the response, displaying any error text.
>>>
>>> Change 'rdma dev set' to use it.
>>>
>>> Signed-off-by: Steve Wise 
>>> ---
>>>  rdma/dev.c   |  2 +-
>>>  rdma/rdma.h  |  1 +
>>>  rdma/utils.c | 21 +
>>>  3 files changed, 23 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/rdma/dev.c b/rdma/dev.c
>>> index 60ff4b31e320..d2949c378f08 100644
>>> --- a/rdma/dev.c
>>> +++ b/rdma/dev.c
>>> @@ -273,7 +273,7 @@ static int dev_set_name(struct rd *rd)
>>> mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
>>> mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, rd_argv(rd));
>>>
>>> -   return rd_send_msg(rd);
>>> +   return rd_sendrecv_msg(rd, seq);
>>>  }
>>>
>>>  static int dev_one_set(struct rd *rd)
>>> diff --git a/rdma/rdma.h b/rdma/rdma.h
>>> index 547bb5749a39..20be2f12c4f8 100644
>>> --- a/rdma/rdma.h
>>> +++ b/rdma/rdma.h
>>> @@ -115,6 +115,7 @@ bool rd_check_is_key_exist(struct rd *rd, const char 
>>> *key);
>>>   */
>>>  int rd_send_msg(struct rd *rd);
>>>  int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void *data, uint32_t 
>>> seq);
>>> +int rd_sendrecv_msg(struct rd *rd, unsigned int seq);
>>>  void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq, uint16_t 
>>> flags);
>>>  int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data);
>>>  int rd_attr_cb(const struct nlattr *attr, void *data);
>>> diff --git a/rdma/utils.c b/rdma/utils.c
>>> index 069d44fece10..a6f2826c9605 100644
>>> --- a/rdma/utils.c
>>> +++ b/rdma/utils.c
>>> @@ -664,6 +664,27 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void 
>>> *data, unsigned int seq)
>>> return ret;
>>>  }
>>>
>>> +static int null_cb(const struct nlmsghdr *nlh, void *data)
>>> +{
>>> +   return MNL_CB_OK;
>>> +}
>>> +
>>> +int rd_sendrecv_msg(struct rd *rd, unsigned int seq)
>>> +{
>>> +   int ret;
>>> +
>>> +   ret = rd_send_msg(rd);
>>> +   if (ret) {
>>> +   perror(NULL);
>> This is more or less already done in rd_send_msg() and that function
>> prints something in case of execution error. So the missing piece
>> is to update rd_recv_msg(), so all places will "magically" print errors
>> and not only dev_set_name().
>>
>>> +   goto out;
>>> +   }
>>> +   ret = rd_recv_msg(rd, null_cb, rd, seq);
> Will this "null_cb" work for all send/recv flows or only in flows where
> response can be error only? 


Only those flows where no nl attributes are expected to be returned.


> Will we need this recv_msg if we implement
> extack support?


I'm not sure how extack works.  Do you know?

Thanks!

Steve.




Re: [PATCH v1 iproute2-next 2/4] Sync up rdma_netlink.h

2019-02-26 Thread Steve Wise


On 2/23/2019 3:28 AM, Leon Romanovsky wrote:
> On Thu, Feb 21, 2019 at 08:19:07AM -0800, Steve Wise wrote:
>> Pull in the latest rdma_netlink.h to get the RDMA_NLDEV_CMD_NEWLINK /
>> RDMA_NLDEV_CMD_DELLINK API.
>>
>> Signed-off-by: Steve Wise 
>> ---
>>  rdma/include/uapi/rdma/rdma_netlink.h | 74 
>> +++
>>  1 file changed, 58 insertions(+), 16 deletions(-)
>>
> If my series go first, this patch won't be needed.
>
> Thanks,
> Reviewed-by: Leon Romanovsky 

Some changes are still needed because your iproute2 series that was
recently merged didn't include the kernel NEWLINK/DELLINK changes.   But
I've rebased this and all the IWPM changes are already there as part of
your series.

Thanks,

Steve.



Re: [PATCH v1 iproute2-next 1/4] rdma: add helper rd_sendrecv_msg()

2019-02-26 Thread Steve Wise


On 2/23/2019 3:26 AM, Leon Romanovsky wrote:
> On Thu, Feb 21, 2019 at 08:19:03AM -0800, Steve Wise wrote:
>> This function sends the constructed netlink message and then
>> receives the response, displaying any error text.
>>
>> Change 'rdma dev set' to use it.
>>
>> Signed-off-by: Steve Wise 
>> ---
>>  rdma/dev.c   |  2 +-
>>  rdma/rdma.h  |  1 +
>>  rdma/utils.c | 21 +
>>  3 files changed, 23 insertions(+), 1 deletion(-)
>>
>> diff --git a/rdma/dev.c b/rdma/dev.c
>> index 60ff4b31e320..d2949c378f08 100644
>> --- a/rdma/dev.c
>> +++ b/rdma/dev.c
>> @@ -273,7 +273,7 @@ static int dev_set_name(struct rd *rd)
>>  mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
>>  mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, rd_argv(rd));
>>
>> -return rd_send_msg(rd);
>> +return rd_sendrecv_msg(rd, seq);
>>  }
>>
>>  static int dev_one_set(struct rd *rd)
>> diff --git a/rdma/rdma.h b/rdma/rdma.h
>> index 547bb5749a39..20be2f12c4f8 100644
>> --- a/rdma/rdma.h
>> +++ b/rdma/rdma.h
>> @@ -115,6 +115,7 @@ bool rd_check_is_key_exist(struct rd *rd, const char 
>> *key);
>>   */
>>  int rd_send_msg(struct rd *rd);
>>  int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void *data, uint32_t seq);
>> +int rd_sendrecv_msg(struct rd *rd, unsigned int seq);
>>  void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq, uint16_t 
>> flags);
>>  int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data);
>>  int rd_attr_cb(const struct nlattr *attr, void *data);
>> diff --git a/rdma/utils.c b/rdma/utils.c
>> index 069d44fece10..a6f2826c9605 100644
>> --- a/rdma/utils.c
>> +++ b/rdma/utils.c
>> @@ -664,6 +664,27 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void 
>> *data, unsigned int seq)
>>  return ret;
>>  }
>>
>> +static int null_cb(const struct nlmsghdr *nlh, void *data)
>> +{
>> +return MNL_CB_OK;
>> +}
>> +
>> +int rd_sendrecv_msg(struct rd *rd, unsigned int seq)
>> +{
>> +int ret;
>> +
>> +ret = rd_send_msg(rd);
>> +if (ret) {
>> +perror(NULL);
> This is more or less already done in rd_send_msg() and that function
> prints something in case of execution error. So the missing piece
> is to update rd_recv_msg(), so all places will "magically" print errors
> and not only dev_set_name().

Yea ok.




RE: [PATCH iproute2-next v1 19/19] rdma: Provide and reuse filter functions

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:22 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 19/19] rdma: Provide and reuse filter
> functions
> 
> From: Leon Romanovsky 
> 
> Globally replace all filter function in safer variants of those
> is_filterred functions, which take into account the availability/lack

"is_filterred" -> "is_filtered"

> of netlink attributes.
> 
> Such conversion allowed to fix a number of places in the code, where
> the previous implementation didn't honor filter requests if netlink
> attribute wasn't present.
> 
> Signed-off-by: Leon Romanovsky 
> ---
>  rdma/rdma.h |  7 +++---
>  rdma/res-cmid.c | 57 +++--
>  rdma/res-cq.c   | 22 +++
>  rdma/res-mr.c   | 12 +++
>  rdma/res-pd.c   | 12 +++
>  rdma/res-qp.c   | 53 ++---
>  rdma/utils.c| 26 ++
>  7 files changed, 112 insertions(+), 77 deletions(-)

Reviewed-by: Steve Wise 




RE: [PATCH iproute2-next v1 18/19] rdma: Perform single .doit call to query specific objects

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:22 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 18/19] rdma: Perform single .doit call to
> query specific objects
> 
> From: Leon Romanovsky 
> 
> If user provides specific index, we can speedup query
> by using .doit callback and save full dump and filtering
> after that.
> 
> Signed-off-by: Leon Romanovsky 
> ---
>  rdma/rdma.h | 10 +-
>  rdma/res-cmid.c | 33 +--
>  rdma/res-cq.c   | 34 ++--
>  rdma/res-mr.c   | 33 ++-
>  rdma/res-pd.c   | 32 ++-
>  rdma/res-qp.c   | 32 ++-
>  rdma/res.c  | 34 +++-
>  rdma/res.h  | 84 -
>  rdma/utils.c| 20 ++++
>  9 files changed, 239 insertions(+), 73 deletions(-)
> 


Reviewed-by: Steve Wise 





RE: [PATCH iproute2-next v1 17/19] rdma: Unify netlink attribute checks prior to prints

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:22 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 17/19] rdma: Unify netlink attribute
checks
> prior to prints
> 
> From: Leon Romanovsky 
> 
> Place check if netlink attribute available in general place,
> instead of doing the same check in many paces.
> 
> Signed-off-by: Leon Romanovsky 
> ---
>  rdma/res-cmid.c |  9 -
>  rdma/res-cq.c   | 22 +++---
>  rdma/res-mr.c   | 21 +++--
>  rdma/res-pd.c   | 20 +---
>  rdma/res-qp.c   | 10 +-
>  rdma/res.c  | 16 +---
>  rdma/res.h  |  6 ++++--
>  7 files changed, 53 insertions(+), 51 deletions(-)


Reviewed-by: Steve Wise 




RE: [PATCH iproute2-next v1 16/19] rdma: Move QP code to separate function

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:22 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 16/19] rdma: Move QP code to separate
> function
> 
> From: Leon Romanovsky 
> 
> Signed-off-by: Leon Romanovsky 
> ---
>  rdma/res-qp.c | 253 +-
>  1 file changed, 127 insertions(+), 126 deletions(-)


Reviewed-by: Steve Wise 




RE: [PATCH iproute2-next v1 15/19] rdma: Separate PD code

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:22 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 15/19] rdma: Separate PD code
> 
> From: Leon Romanovsky 
> 
> Signed-off-by: Leon Romanovsky 
> ---
>  rdma/res-pd.c | 170 ++
>  1 file changed, 89 insertions(+), 81 deletions(-)

commit text is wrong.

Reviewed-by: Steve Wise 





RE: [PATCH iproute2-next v1 13/19] rdma: Refactor CQ prints

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:22 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 13/19] rdma: Refactor CQ prints
> 
> From: Leon Romanovsky 
> 
> Signed-off-by: Leon Romanovsky 
> ---
>  rdma/res-cq.c | 177 ++
>  1 file changed, 91 insertions(+), 86 deletions(-)


Reviewed-by: Steve Wise 





RE: [PATCH iproute2-next v1 14/19] rdma: Separate MR code

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:22 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 14/19] rdma: Separate MR code
> 
> From: Leon Romanovsky 
> 
> Signed-off-by: Leon Romanovsky 
> ---
>  rdma/res-mr.c | 178 +-
>  1 file changed, 90 insertions(+), 88 deletions(-)

The commit description is wrong.

Reviewed-by: Steve Wise 





RE: [PATCH iproute2-next v1 12/19] rdma: Simplify CM_ID print code

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:21 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 12/19] rdma: Simplify CM_ID print code
> 
> From: Leon Romanovsky 
> 
> Refactor our the CM_ID print code.
> 
> Signed-off-by: Leon Romanovsky 
> ---
>  rdma/res-cmid.c | 246 
>  1 file changed, 124 insertions(+), 122 deletions(-)

Not sure why you refactored, but ok.

Reviewed-by: Steve Wise 




RE: [PATCH iproute2-next v1 11/19] rdma: Simplify code to reuse existing functions

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:21 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 11/19] rdma: Simplify code to reuse
> existing functions
> 
> From: Leon Romanovsky 
> 
> Remove duplicated functions in favour general res_print_uint() call.
> 
> Signed-off-by: Leon Romanovsky 
> ---
>  rdma/res-cmid.c |  4 ++--
>  rdma/res-cq.c   | 14 +++---
>  rdma/res-mr.c   |  2 +-
>  rdma/res-pd.c   |  4 ++--
>  rdma/res-qp.c   | 14 +++---
>  rdma/res.c  | 34 ++
>  rdma/res.h  |  3 ---
>  7 files changed, 13 insertions(+), 62 deletions(-)
> 


Reviewed-by: Steve Wise 





RE: [PATCH iproute2-next v1 10/19] rdma: Properly mark RDMAtool license

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:21 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 10/19] rdma: Properly mark RDMAtool
> license
> 
> From: Leon Romanovsky 
> 
> RDMA subsystem is dual-licensed with "GPL-2.0 OR Linux-OpenIB" proper
> license and Mellanox submission are supposed to have this type of license.
> 
> Signed-off-by: Leon Romanovsky 
> ---
>  rdma/Makefile | 2 +-
>  rdma/dev.c| 7 +--
>  rdma/link.c   | 7 +--
>  rdma/rdma.c   | 7 +--
>  rdma/rdma.h   | 7 +--
>  rdma/res.c| 7 +--
>  rdma/utils.c  | 7 +--
>  7 files changed, 7 insertions(+), 37 deletions(-)


Reviewed-by: Steve Wise 




RE: [PATCH iproute2-next v1 09/19] rdma: Move resource PD logic to separate file

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:21 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 09/19] rdma: Move resource PD logic to
> separate file
> 
> From: Leon Romanovsky 
> 
> Logically separate resource PD logic to separate file,
> in order to make PD specific logic self-contained.
> 

This should be "QP" not "PD"

> Signed-off-by: Leon Romanovsky 
> ---
>  rdma/Makefile |   2 +-
>  rdma/res-qp.c | 234
> +
>  rdma/res.c| 236 --
>  rdma/res.h|  17 
>  4 files changed, 252 insertions(+), 237 deletions(-)
>  create mode 100644 rdma/res-qp.c


Reviewed-by: Steve Wise 




RE: [PATCH iproute2-next v1 08/19] rdma: Move out resource CM-ID logic to separate file

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:21 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 08/19] rdma: Move out resource CM-ID
> logic to separate file
> 
> From: Leon Romanovsky 
> 
> Logically separate resource CM-ID logic to separate file,
> in order to make CM-ID specific logic self-contained.
> 
> Signed-off-by: Leon Romanovsky 

Reviewed-by: Steve Wise 




RE: [PATCH iproute2-next v1 07/19] rdma: Move out resource CQ logic to separate file

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:21 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 07/19] rdma: Move out resource CQ logic
> to separate file
> 
> From: Leon Romanovsky 
> 
> Logically separate resource CQ logic to separate file,
> in order to make CQ specific logic self-contained.
> 
> Signed-off-by: Leon Romanovsky 
> ---

Reviewed-by: Steve Wise 




RE: [PATCH iproute2-next v1 06/19] rdma: Refactor out resource MR logic to separate file

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:21 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 06/19] rdma: Refactor out resource MR
> logic to separate file
> 
> From: Leon Romanovsky 
> 
> Logically separate resource MR logic to separate file,
> in order to make MR specific logic self-contained.
> 
> Signed-off-by: Leon Romanovsky 
> ---

Ah I guess you're moving them all into their own files. Ok.
Reviewed-by: Steve Wise 




RE: [PATCH iproute2-next v1 05/19] rdma: Move resource PD logic to separate file

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:21 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 05/19] rdma: Move resource PD logic to
> separate file
> 
> From: Leon Romanovsky 
> 
> Logically separate resource PD logic to separate file,
> in order to make PD specific logic self-contained.
> 
> Signed-off-by: Leon Romanovsky 
> ---

Why does it need to be self contained?

Reviewed-by: Steve Wise 




RE: [PATCH iproute2-next v1 04/19] rdma: Provide parent context index for all objects except CM_ID

2019-02-21 Thread Steve Wise
"%s 0x%" PRIx64 " ", name, val);
> -}
> -
> -static void res_print_uint(struct rd *rd, const char *name, uint64_t val)
> -{
> - if (rd->json_output)
> - jsonw_uint_field(rd->jw, name, val);
> - else
> - pr_out("%s %" PRIu64 " ", name, val);
> -}
> -
>  static int res_cm_id_parse_cb(const struct nlmsghdr *nlh, void *data)
>  {
>   struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
> @@ -768,6 +776,7 @@ static int res_cq_parse_cb(const struct nlmsghdr *nlh,
> void *data)
>   char *comm = NULL;
>   uint32_t pid = 0;
>   uint8_t poll_ctx = 0;
> + uint32_t ctxn = 0;
>   uint32_t cqn = 0;
>   uint64_t users;
>   uint32_t cqe;
> @@ -815,6 +824,12 @@ static int res_cq_parse_cb(const struct nlmsghdr
> *nlh, void *data)
>   if (rd_check_is_filtered(rd, "cqn", cqn))
>   continue;
> 
> + if (nla_line[RDMA_NLDEV_ATTR_RES_CTXN])
> + ctxn = mnl_attr_get_u32(
> +  nla_line[RDMA_NLDEV_ATTR_RES_CTXN]);
> + if (rd_check_is_filtered(rd, "ctxn", ctxn))
> + continue;
> +
>   if (nla_line[RDMA_NLDEV_ATTR_RES_KERN_NAME])
>   /* discard const from mnl_attr_get_str */
>   comm = (char *)mnl_attr_get_str(
> @@ -833,6 +848,8 @@ static int res_cq_parse_cb(const struct nlmsghdr *nlh,
> void *data)
> 
>   if (nla_line[RDMA_NLDEV_ATTR_RES_CQN])
>   res_print_uint(rd, "cqn", cqn);
> + if (nla_line[RDMA_NLDEV_ATTR_RES_CTXN])
> + res_print_uint(rd, "ctxn", ctxn);
> 
>   if (nla_line[RDMA_NLDEV_ATTR_RES_PID])
>   free(comm);
> @@ -866,6 +883,7 @@ static int res_mr_parse_cb(const struct nlmsghdr
> *nlh, void *data)
>   uint32_t rkey = 0, lkey = 0;
>   uint64_t iova = 0, mrlen;
>   char *comm = NULL;
> + uint32_t pdn = 0;
>   uint32_t mrn = 0;
>   uint32_t pid = 0;
>   int err;
> @@ -911,6 +929,12 @@ static int res_mr_parse_cb(const struct nlmsghdr
> *nlh, void *data)
>   if (rd_check_is_filtered(rd, "mrn", mrn))
>   continue;
> 
> + if (nla_line[RDMA_NLDEV_ATTR_RES_PDN])
> + pdn = mnl_attr_get_u32(
> + nla_line[RDMA_NLDEV_ATTR_RES_PDN]);
> + if (rd_check_is_filtered(rd, "pdn", pdn))
> + continue;
> +
>   if (nla_line[RDMA_NLDEV_ATTR_RES_KERN_NAME])
>   /* discard const from mnl_attr_get_str */
>   comm = (char *)mnl_attr_get_str(
> @@ -933,6 +957,9 @@ static int res_mr_parse_cb(const struct nlmsghdr
> *nlh, void *data)
>   if (nla_line[RDMA_NLDEV_ATTR_RES_MRN])
>   res_print_uint(rd, "mrn", mrn);
> 
> + if (nla_line[RDMA_NLDEV_ATTR_RES_PDN])
> + res_print_uint(rd, "pdn", pdn);
> +
>       if (nla_line[RDMA_NLDEV_ATTR_RES_PID])
>   free(comm);
> 
> @@ -964,6 +991,7 @@ static int res_pd_parse_cb(const struct nlmsghdr
> *nlh, void *data)
>   uint32_t local_dma_lkey = 0, unsafe_global_rkey = 0;
>   struct nlattr *nla_line[RDMA_NLDEV_ATTR_MAX] = {};
>   char *comm = NULL;
> + uint32_t ctxn = 0;
>   uint32_t pid = 0;
>   uint32_t pdn = 0;
>   uint64_t users;
> @@ -997,7 +1025,13 @@ static int res_pd_parse_cb(const struct nlmsghdr
> *nlh, void *data)
>   comm = get_task_name(pid);
>   }
> 
> - if (rd_check_is_filtered(rd, "pid", pid))
> +if (rd_check_is_filtered(rd, "pid", pid))

Did this add some whitespace problem?

Reviewed-by: Steve Wise 




RE: [PATCH iproute2-next v1 02/19] rdma: Remove duplicated print code

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:21 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 02/19] rdma: Remove duplicated print
> code
> 
> From: Leon Romanovsky 
> 
> There is no need to keep same print functions for
> uint32_t and uint64_t, unify them into one function.
> 
> Signed-off-by: Leon Romanovsky 

Reviewed-by: Steve Wise 




RE: [PATCH iproute2-next v1 03/19] rdma: Provide unique indexes for all visible objects

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:21 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 03/19] rdma: Provide unique indexes for
all
> visible objects
> 
> From: Leon Romanovsky 
> 
> Signed-off-by: Leon Romanovsky 
> ---

Reviewed-by: Steve Wise 




RE: [PATCH iproute2-next v1 01/19] rdma: update uapi headers

2019-02-21 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Wednesday, February 20, 2019 1:21 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list  r...@vger.kernel.org>; Stephen Hemminger
> 
> Subject: [PATCH iproute2-next v1 01/19] rdma: update uapi headers
> 
> From: Leon Romanovsky 
> 
> Update rdma_netlink,h file upto kernel commit

Nit:  should be "rdma_netlink.h file up to"

> f2a0e45f36b0 RDMA/nldev: Don't expose number of not-visible entries
> 
> Signed-off-by: Leon Romanovsky 

Reviewed-by: Steve Wise 




[PATCH v1 iproute2-next 3/4] rdma: add 'link add/delete' commands

2019-02-21 Thread Steve Wise
Add new 'link' subcommand 'add' and 'delete' to allow binding a soft-rdma
device to a netdev interface.

EG:

rdma link add rxe_eth0 type rxe netdev eth0
rdma link delete rxe_eth0

Signed-off-by: Steve Wise 
---
 rdma/link.c  | 67 
 rdma/rdma.h  |  1 +
 rdma/utils.c |  2 +-
 3 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/rdma/link.c b/rdma/link.c
index c064be627be2..afaf19663728 100644
--- a/rdma/link.c
+++ b/rdma/link.c
@@ -14,6 +14,9 @@
 static int link_help(struct rd *rd)
 {
pr_out("Usage: %s link show [DEV/PORT_INDEX]\n", rd->filename);
+   pr_out("Usage: %s link add NAME type TYPE netdev NETDEV\n",
+  rd->filename);
+   pr_out("Usage: %s link delete NAME\n", rd->filename);
return 0;
 }
 
@@ -341,10 +344,74 @@ static int link_show(struct rd *rd)
return rd_exec_link(rd, link_one_show, true);
 }
 
+static int link_add(struct rd *rd)
+{
+   char *name;
+   char *type = NULL;
+   char *dev = NULL;
+   uint32_t seq;
+
+   if (rd_no_arg(rd)) {
+   pr_err("No link name was supplied\n");
+   return -EINVAL;
+   }
+   name = rd_argv(rd);
+   rd_arg_inc(rd);
+   while (!rd_no_arg(rd)) {
+   if (rd_argv_match(rd, "type")) {
+   rd_arg_inc(rd);
+   type = rd_argv(rd);
+   } else if (rd_argv_match(rd, "netdev")) {
+   rd_arg_inc(rd);
+   dev = rd_argv(rd);
+   } else {
+   pr_err("Invalid parameter %s\n", rd_argv(rd));
+   return -EINVAL;
+   }
+   rd_arg_inc(rd);
+   }
+   if (!type) {
+   pr_err("No type was supplied\n");
+   return -EINVAL;
+   }
+   if (!dev) {
+   pr_err("No net device was supplied\n");
+   return -EINVAL;
+   }
+
+   rd_prepare_msg(rd, RDMA_NLDEV_CMD_NEWLINK, &seq,
+  (NLM_F_REQUEST | NLM_F_ACK));
+   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, name);
+   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_LINK_TYPE, type);
+   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_NDEV_NAME, dev);
+   return rd_sendrecv_msg(rd, seq);
+}
+
+static int _link_del(struct rd *rd)
+{
+   uint32_t seq;
+
+   if (!rd_no_arg(rd)) {
+   pr_err("Unknown parameter %s\n", rd_argv(rd));
+   return -EINVAL;
+   }
+   rd_prepare_msg(rd, RDMA_NLDEV_CMD_DELLINK, &seq,
+  (NLM_F_REQUEST | NLM_F_ACK));
+   mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
+   return rd_sendrecv_msg(rd, seq);
+}
+
+static int link_del(struct rd *rd)
+{
+   return rd_exec_require_dev(rd, _link_del);
+}
+
 int cmd_link(struct rd *rd)
 {
const struct rd_cmd cmds[] = {
{ NULL, link_show },
+   { "add",link_add },
+   { "delete", link_del },
{ "show",   link_show },
{ "list",   link_show },
{ "help",   link_help },
diff --git a/rdma/rdma.h b/rdma/rdma.h
index 20be2f12c4f8..af4597f45640 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -79,6 +79,7 @@ struct rd_cmd {
  */
 bool rd_no_arg(struct rd *rd);
 void rd_arg_inc(struct rd *rd);
+bool rd_argv_match(struct rd *rd, const char *pattern);
 
 char *rd_argv(struct rd *rd);
 
diff --git a/rdma/utils.c b/rdma/utils.c
index a6f2826c9605..85a954167511 100644
--- a/rdma/utils.c
+++ b/rdma/utils.c
@@ -32,7 +32,7 @@ int strcmpx(const char *str1, const char *str2)
return strncmp(str1, str2, strlen(str1));
 }
 
-static bool rd_argv_match(struct rd *rd, const char *pattern)
+bool rd_argv_match(struct rd *rd, const char *pattern)
 {
if (!rd_argc(rd))
return false;
-- 
1.8.3.1



[PATCH v1 iproute2-next 4/4] rdma: man page update for link add/delete

2019-02-21 Thread Steve Wise
Update the 'rdma link' man page with 'link add/delete' info.

Signed-off-by: Steve Wise 
---
 man/man8/rdma-link.8 | 47 +++
 1 file changed, 47 insertions(+)

diff --git a/man/man8/rdma-link.8 b/man/man8/rdma-link.8
index bddf34746e8b..b3b40de75852 100644
--- a/man/man8/rdma-link.8
+++ b/man/man8/rdma-link.8
@@ -23,6 +23,18 @@ rdma-link \- rdma link configuration
 .RI "[ " DEV/PORT_INDEX " ]"
 
 .ti -8
+.B rdma link add
+.BR NAME
+.BR type
+.BR TYPE
+.BR netdev
+.BR NETDEV
+
+.ti -8
+.B rdma link delete
+.RI NAME
+
+.ti -8
 .B rdma link help
 
 .SH "DESCRIPTION"
@@ -33,6 +45,31 @@ rdma-link \- rdma link configuration
 - specifies the RDMA link to show.
 If this argument is omitted all links are listed.
 
+.SS rdma link add NAME type TYPE netdev NETDEV - add an rdma link for the 
specified type to the network device
+.sp
+.BR NAME
+- specifies the new name of the rdma link to add
+
+.BR TYPE
+- specifies which rdma type to use.  Link types:
+.sp
+.in +8
+.B rxe
+- Soft RoCE driver
+.sp
+.B siw
+- Soft iWARP driver
+.in -8
+
+.BR NETDEV
+- specifies the network device to which the link is bound
+
+.SS rdma link delete NAME - delete an rdma link
+.PP
+.BR NAME
+- specifies the name of the rdma link to delete
+.PP
+
 .SH "EXAMPLES"
 .PP
 rdma link show
@@ -45,6 +82,16 @@ rdma link show mlx5_2/1
 Shows the state of specified rdma link.
 .RE
 .PP
+rdma link add rxe_eth0 type rxe netdev eth0
+.RS 4
+Adds a RXE link named rxe_eth0 to network device eth0
+.RE
+.PP
+rdma link del rxe_eth0
+.RS 4
+Removes RXE link rxe_eth0
+.RE
+.PP
 
 .SH SEE ALSO
 .BR rdma (8),
-- 
1.8.3.1



[PATCH v1 iproute2-next 1/4] rdma: add helper rd_sendrecv_msg()

2019-02-21 Thread Steve Wise
This function sends the constructed netlink message and then
receives the response, displaying any error text.

Change 'rdma dev set' to use it.

Signed-off-by: Steve Wise 
---
 rdma/dev.c   |  2 +-
 rdma/rdma.h  |  1 +
 rdma/utils.c | 21 +
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/rdma/dev.c b/rdma/dev.c
index 60ff4b31e320..d2949c378f08 100644
--- a/rdma/dev.c
+++ b/rdma/dev.c
@@ -273,7 +273,7 @@ static int dev_set_name(struct rd *rd)
mnl_attr_put_u32(rd->nlh, RDMA_NLDEV_ATTR_DEV_INDEX, rd->dev_idx);
mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, rd_argv(rd));
 
-   return rd_send_msg(rd);
+   return rd_sendrecv_msg(rd, seq);
 }
 
 static int dev_one_set(struct rd *rd)
diff --git a/rdma/rdma.h b/rdma/rdma.h
index 547bb5749a39..20be2f12c4f8 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -115,6 +115,7 @@ bool rd_check_is_key_exist(struct rd *rd, const char *key);
  */
 int rd_send_msg(struct rd *rd);
 int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void *data, uint32_t seq);
+int rd_sendrecv_msg(struct rd *rd, unsigned int seq);
 void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq, uint16_t 
flags);
 int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data);
 int rd_attr_cb(const struct nlattr *attr, void *data);
diff --git a/rdma/utils.c b/rdma/utils.c
index 069d44fece10..a6f2826c9605 100644
--- a/rdma/utils.c
+++ b/rdma/utils.c
@@ -664,6 +664,27 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void 
*data, unsigned int seq)
return ret;
 }
 
+static int null_cb(const struct nlmsghdr *nlh, void *data)
+{
+   return MNL_CB_OK;
+}
+
+int rd_sendrecv_msg(struct rd *rd, unsigned int seq)
+{
+   int ret;
+
+   ret = rd_send_msg(rd);
+   if (ret) {
+   perror(NULL);
+   goto out;
+   }
+   ret = rd_recv_msg(rd, null_cb, rd, seq);
+   if (ret)
+   perror(NULL);
+out:
+   return ret;
+}
+
 static struct dev_map *_dev_map_lookup(struct rd *rd, const char *dev_name)
 {
struct dev_map *dev_map;
-- 
1.8.3.1



[PATCH v1 iproute2-next 2/4] Sync up rdma_netlink.h

2019-02-21 Thread Steve Wise
Pull in the latest rdma_netlink.h to get the RDMA_NLDEV_CMD_NEWLINK /
RDMA_NLDEV_CMD_DELLINK API.

Signed-off-by: Steve Wise 
---
 rdma/include/uapi/rdma/rdma_netlink.h | 74 +++
 1 file changed, 58 insertions(+), 16 deletions(-)

diff --git a/rdma/include/uapi/rdma/rdma_netlink.h 
b/rdma/include/uapi/rdma/rdma_netlink.h
index 04c80cebef49..23a90ad52485 100644
--- a/rdma/include/uapi/rdma/rdma_netlink.h
+++ b/rdma/include/uapi/rdma/rdma_netlink.h
@@ -5,8 +5,7 @@
 #include 
 
 enum {
-   RDMA_NL_RDMA_CM = 1,
-   RDMA_NL_IWCM,
+   RDMA_NL_IWCM = 2,
RDMA_NL_RSVD,
RDMA_NL_LS, /* RDMA Local Services */
RDMA_NL_NLDEV,  /* RDMA device interface */
@@ -14,8 +13,7 @@ enum {
 };
 
 enum {
-   RDMA_NL_GROUP_CM = 1,
-   RDMA_NL_GROUP_IWPM,
+   RDMA_NL_GROUP_IWPM = 2,
RDMA_NL_GROUP_LS,
RDMA_NL_NUM_GROUPS
 };
@@ -24,15 +22,17 @@ enum {
 #define RDMA_NL_GET_OP(type) (type & ((1 << 10) - 1))
 #define RDMA_NL_GET_TYPE(client, op) ((client << 10) + op)
 
-enum {
-   RDMA_NL_RDMA_CM_ID_STATS = 0,
-   RDMA_NL_RDMA_CM_NUM_OPS
-};
+/* The minimum version that the iwpm kernel supports */
+#define IWPM_UABI_VERSION_MIN  3
+
+/* The latest version that the iwpm kernel supports */
+#define IWPM_UABI_VERSION  4
 
+/* iwarp port mapper message flags */
 enum {
-   RDMA_NL_RDMA_CM_ATTR_SRC_ADDR = 1,
-   RDMA_NL_RDMA_CM_ATTR_DST_ADDR,
-   RDMA_NL_RDMA_CM_NUM_ATTR,
+
+   /* Do not map the port for this IWPM request */
+   IWPM_FLAGS_NO_PORT_MAP = (1 << 0),
 };
 
 /* iwarp port mapper op-codes */
@@ -45,6 +45,7 @@ enum {
RDMA_NL_IWPM_HANDLE_ERR,
RDMA_NL_IWPM_MAPINFO,
RDMA_NL_IWPM_MAPINFO_NUM,
+   RDMA_NL_IWPM_HELLO,
RDMA_NL_IWPM_NUM_OPS
 };
 
@@ -83,20 +84,38 @@ enum {
IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0,
IWPM_NLA_MANAGE_MAPPING_SEQ,
IWPM_NLA_MANAGE_ADDR,
-   IWPM_NLA_MANAGE_MAPPED_LOC_ADDR,
+   IWPM_NLA_MANAGE_FLAGS,
+   IWPM_NLA_MANAGE_MAPPING_MAX
+};
+
+enum {
+   IWPM_NLA_RMANAGE_MAPPING_UNSPEC = 0,
+   IWPM_NLA_RMANAGE_MAPPING_SEQ,
+   IWPM_NLA_RMANAGE_ADDR,
+   IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR,
+   /* The following maintains bisectability of rdma-core */
+   IWPM_NLA_MANAGE_MAPPED_LOC_ADDR = IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR,
IWPM_NLA_RMANAGE_MAPPING_ERR,
IWPM_NLA_RMANAGE_MAPPING_MAX
 };
 
-#define IWPM_NLA_MANAGE_MAPPING_MAX 3
-#define IWPM_NLA_QUERY_MAPPING_MAX  4
 #define IWPM_NLA_MAPINFO_SEND_MAX   3
+#define IWPM_NLA_REMOVE_MAPPING_MAX 3
 
 enum {
IWPM_NLA_QUERY_MAPPING_UNSPEC = 0,
IWPM_NLA_QUERY_MAPPING_SEQ,
IWPM_NLA_QUERY_LOCAL_ADDR,
IWPM_NLA_QUERY_REMOTE_ADDR,
+   IWPM_NLA_QUERY_FLAGS,
+   IWPM_NLA_QUERY_MAPPING_MAX,
+};
+
+enum {
+   IWPM_NLA_RQUERY_MAPPING_UNSPEC = 0,
+   IWPM_NLA_RQUERY_MAPPING_SEQ,
+   IWPM_NLA_RQUERY_LOCAL_ADDR,
+   IWPM_NLA_RQUERY_REMOTE_ADDR,
IWPM_NLA_RQUERY_MAPPED_LOC_ADDR,
IWPM_NLA_RQUERY_MAPPED_REM_ADDR,
IWPM_NLA_RQUERY_MAPPING_ERR,
@@ -114,6 +133,7 @@ enum {
IWPM_NLA_MAPINFO_UNSPEC = 0,
IWPM_NLA_MAPINFO_LOCAL_ADDR,
IWPM_NLA_MAPINFO_MAPPED_ADDR,
+   IWPM_NLA_MAPINFO_FLAGS,
IWPM_NLA_MAPINFO_MAX
 };
 
@@ -132,6 +152,12 @@ enum {
IWPM_NLA_ERR_MAX
 };
 
+enum {
+   IWPM_NLA_HELLO_UNSPEC = 0,
+   IWPM_NLA_HELLO_ABI_VERSION,
+   IWPM_NLA_HELLO_MAX
+};
+
 /*
  * Local service operations:
  *   RESOLVE - The client requests the local service to resolve a path.
@@ -229,9 +255,11 @@ enum rdma_nldev_command {
RDMA_NLDEV_CMD_GET, /* can dump */
RDMA_NLDEV_CMD_SET,
 
-   /* 3 - 4 are free to use */
+   RDMA_NLDEV_CMD_NEWLINK,
 
-   RDMA_NLDEV_CMD_PORT_GET = 5, /* can dump */
+   RDMA_NLDEV_CMD_DELLINK,
+
+   RDMA_NLDEV_CMD_PORT_GET, /* can dump */
 
/* 6 - 8 are free to use */
 
@@ -431,6 +459,20 @@ enum rdma_nldev_attr {
RDMA_NLDEV_ATTR_DRIVER_U64, /* u64 */
 
/*
+* Indexes to get/set secific entry,
+* for QP use RDMA_NLDEV_ATTR_RES_LQPN
+*/
+   RDMA_NLDEV_ATTR_RES_PDN,   /* u32 */
+   RDMA_NLDEV_ATTR_RES_CQN,   /* u32 */
+   RDMA_NLDEV_ATTR_RES_MRN,   /* u32 */
+   RDMA_NLDEV_ATTR_RES_CM_IDN,/* u32 */
+   RDMA_NLDEV_ATTR_RES_CTXN,  /* u32 */
+   /*
+* Identifies the rdma driver. eg: "rxe" or "siw"
+*/
+   RDMA_NLDEV_ATTR_LINK_TYPE,  /* string */
+
+   /*
 * Always the end
 */
RDMA_NLDEV_ATTR_MAX
-- 
1.8.3.1



[PATCH v1 iproute2-next 0/4] Dynamic rdma link creation

2019-02-21 Thread Steve Wise
This series adds rdmatool support for creating/deleting rdma links.
This will be used, mainly, by soft rdma drivers to allow adding/deleting
rdma links over netdev interfaces.  It provides the user side for
the following kernel changes merged in linux-rdma/for-next:

https://www.spinics.net/lists/linux-rdma/msg75609.html

I believe this series is ready to go.  Please review!

Changes since RFC:

- add rd_sendrecv_msg() and make use of it in dev_set as well
  as the new link commands.
- fixed problems with the man pages
- changed the command line to use "netdev" as the keyword
  for the network device, do avoid confused with the ib_device
  name.
- got rid of the "type" parameter for link delete.  Also pass
  down the device index instead of the name, using the common
  rd services for validating the device name and fetching the
  index.

Thanks,

Steve.

Steve Wise (4):
  rdma: add helper rd_sendrecv_msg()
  Sync up rdma_netlink.h
  rdma: add 'link add/delete' commands
  rdma: man page update for link add/delete

 man/man8/rdma-link.8  | 47 ++
 rdma/dev.c|  2 +-
 rdma/include/uapi/rdma/rdma_netlink.h | 74 +++
 rdma/link.c   | 67 +++
 rdma/rdma.h   |  2 +
 rdma/utils.c  | 23 ++-
 6 files changed, 197 insertions(+), 18 deletions(-)

-- 
1.8.3.1



RE: [rdma-rc PATCH 2/2] iw_cxgb4: cq/qp mask depends on bar2 pages in a host page

2019-02-14 Thread Steve Wise



> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Raju Rangoju
> Sent: Thursday, February 14, 2019 11:28 AM
> To: Jason Gunthorpe 
> Cc: da...@davemloft.net; linux-r...@vger.kernel.org;
> netdev@vger.kernel.org; sw...@opengridcomputing.com
> Subject: Re: [rdma-rc PATCH 2/2] iw_cxgb4: cq/qp mask depends on bar2
> pages in a host page
> 
> On Thursday, February 02/14/19, 2019 at 15:41:34 +, Jason Gunthorpe
> wrote:
> > On Thu, Feb 14, 2019 at 05:40:54PM +0530, Raju Rangoju wrote:
> > > Adjust the cq/qp mask based on no.of bar2 pages in a host page.
> > >
> > > For user-mode rdma, the granularity of the BAR2 memory mapped
> > > to a user rdma process during queue allocation must be based
> > > on the host page size. The lld attributes udb_density and
> > > ucq_density are used to figure out how many sge contexts are
> > > in a bar2 page. So the rdev->qpmask and rdev->cqmask in
> > > iw_cxgb4 need to now be adjusted based on how many sge bar2
> > > pages are in a host page.
> >
> > Why is this rc? Do certain arches fail to work or something?
> >
> 
> Yes, this series fixes a regression that was introduced by commit
> 2391b0030e (v5.0-rc1~129^2~272)
> 
> > Jason

Rdma over cxgb4 on arches with a non-4K page size are busted w/o this fix.
That was the motivation for -rc.





Re: [PATCH RFC iproute2-next 1/2] rdma: add 'link add/delete' commands

2018-11-28 Thread Steve Wise



On 11/28/2018 4:25 PM, Jason Gunthorpe wrote:
> On Wed, Nov 28, 2018 at 04:21:48PM -0600, Steve Wise wrote:
>
>>>> It does make sense to not require type.  The name must be unique so that
>>>> should be enough.  I'll have to respin the kernel side though...
>>> The delete_link really should be an operation on the ib_device, not
>>> the link_ops thing. 
>>>
>>> That directly prevents mis-matching function callbacks..
>>>
>>> Jason
>> Looking at the rtnetlink newlink/dellink, I see they cache the link_ops
>> ptr in the net_device struct.  So when the link is deleted, then
>> appropriate driver-specific dellink function can be called after finding
>> the device to be deleted.  Should I do something along these lines?  IE
>> add a struct rdma_link_ops pointer to struct ib_device.
> I don't see a problem with that either..
>
> Jason

Ok, I'll respin the kernel and user patches tomorrow.  Thanks!




Re: [PATCH RFC iproute2-next 1/2] rdma: add 'link add/delete' commands

2018-11-28 Thread Steve Wise



On 11/28/2018 4:17 PM, Jason Gunthorpe wrote:
> On Wed, Nov 28, 2018 at 02:18:55PM -0600, Steve Wise wrote:
>>
>> On 11/28/2018 2:13 PM, Leon Romanovsky wrote:
>>> On Wed, Nov 28, 2018 at 02:07:29PM -0600, Steve Wise wrote:
>>>> On 11/28/2018 2:04 PM, Leon Romanovsky wrote:
>>>>> On Wed, Nov 28, 2018 at 01:08:05PM -0600, Steve Wise wrote:
>>>>>> On 11/28/2018 12:26 PM, Leon Romanovsky wrote:
>>>>>>> On Thu, Sep 13, 2018 at 10:19:21AM -0700, Steve Wise wrote:
>>>>>>>> Add new 'link' subcommand 'add' and 'delete' to allow binding a 
>>>>>>>> soft-rdma
>>>>>>>> device to a netdev interface.
>>>>>>>>
>>>>>>>> EG:
>>>>>>>>
>>>>>>>> rdma link add rxe_eth0 type rxe dev eth0
>>>>>>>> rdma link delete rxe_eth0
>>>>>>>>
>>>>>>>> Signed-off-by: Steve Wise 
>>>>>>>>  rdma/link.c  | 106 
>>>>>>>> +++
>>>>>>>>  rdma/rdma.h  |   1 +
>>>>>>>>  rdma/utils.c |   2 +-
>>>>>>>>  3 files changed, 108 insertions(+), 1 deletion(-)
>>>>>>>>
>>>>>>>> diff --git a/rdma/link.c b/rdma/link.c
>>>>>>>> index 7a6d4b7e356d..d4f76b0ce11f 100644
>>>>>>>> +++ b/rdma/link.c
>>>>>>>> @@ -14,6 +14,8 @@
>>>>>>>>  static int link_help(struct rd *rd)
>>>>>>>>  {
>>>>>>>>pr_out("Usage: %s link show [DEV/PORT_INDEX]\n", rd->filename);
>>>>>>>> +  pr_out("Usage: %s link add NAME type TYPE dev DEV\n", 
>>>>>>>> rd->filename);
>>>>>>> I suggest to rename "dev" to be "netdev", because we are using "dev" for
>>>>>>> ib devices.
>>>>>> Yea ok.
>>>>>>
>>>>>>>> +  pr_out("Usage: %s link delete NAME type TYPE\n", rd->filename);
>>>>>>> Why do you need "type" for "delete" command?
>>>>>> Because the type is used in the kernel to find the appropriate link
>>>>>> ops.  I could change the kernel side to search all types for the device
>>>>>> name to delete? 
>>>>> I would say, yes.
>>>>> It makes "delete" operation more natural.
>>>>>
>>>>> Thanks
>>>> Perhaps.
>>>>
>>>> Note: 'ip link delete' takes a type as well...
>>> According to man section, yes.
>>> According to various guides, no.
>>> https://docs.fedoraproject.org/en-US/Fedora/20/html/Networking_Guide/sec-Configure_802_1Q_VLAN_Tagging_ip_Commands.html
>>>
>>> Thanks
>> It does make sense to not require type.  The name must be unique so that
>> should be enough.  I'll have to respin the kernel side though...
> The delete_link really should be an operation on the ib_device, not
> the link_ops thing. 
>
> That directly prevents mis-matching function callbacks..
>
> Jason
Looking at the rtnetlink newlink/dellink, I see they cache the link_ops
ptr in the net_device struct.  So when the link is deleted, then
appropriate driver-specific dellink function can be called after finding
the device to be deleted.  Should I do something along these lines?  IE
add a struct rdma_link_ops pointer to struct ib_device.

Steve.


Re: [PATCH RFC iproute2-next 1/2] rdma: add 'link add/delete' commands

2018-11-28 Thread Steve Wise



On 11/28/2018 2:13 PM, Leon Romanovsky wrote:
> On Wed, Nov 28, 2018 at 02:07:29PM -0600, Steve Wise wrote:
>>
>> On 11/28/2018 2:04 PM, Leon Romanovsky wrote:
>>> On Wed, Nov 28, 2018 at 01:08:05PM -0600, Steve Wise wrote:
>>>> On 11/28/2018 12:26 PM, Leon Romanovsky wrote:
>>>>> On Thu, Sep 13, 2018 at 10:19:21AM -0700, Steve Wise wrote:
>>>>>> Add new 'link' subcommand 'add' and 'delete' to allow binding a soft-rdma
>>>>>> device to a netdev interface.
>>>>>>
>>>>>> EG:
>>>>>>
>>>>>> rdma link add rxe_eth0 type rxe dev eth0
>>>>>> rdma link delete rxe_eth0
>>>>>>
>>>>>> Signed-off-by: Steve Wise 
>>>>>> ---
>>>>>>  rdma/link.c  | 106 
>>>>>> +++
>>>>>>  rdma/rdma.h  |   1 +
>>>>>>  rdma/utils.c |   2 +-
>>>>>>  3 files changed, 108 insertions(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/rdma/link.c b/rdma/link.c
>>>>>> index 7a6d4b7e356d..d4f76b0ce11f 100644
>>>>>> --- a/rdma/link.c
>>>>>> +++ b/rdma/link.c
>>>>>> @@ -14,6 +14,8 @@
>>>>>>  static int link_help(struct rd *rd)
>>>>>>  {
>>>>>>  pr_out("Usage: %s link show [DEV/PORT_INDEX]\n", rd->filename);
>>>>>> +pr_out("Usage: %s link add NAME type TYPE dev DEV\n", 
>>>>>> rd->filename);
>>>>> I suggest to rename "dev" to be "netdev", because we are using "dev" for
>>>>> ib devices.
>>>> Yea ok.
>>>>
>>>>>> +pr_out("Usage: %s link delete NAME type TYPE\n", rd->filename);
>>>>> Why do you need "type" for "delete" command?
>>>> Because the type is used in the kernel to find the appropriate link
>>>> ops.  I could change the kernel side to search all types for the device
>>>> name to delete? 
>>> I would say, yes.
>>> It makes "delete" operation more natural.
>>>
>>> Thanks
>> Perhaps.
>>
>> Note: 'ip link delete' takes a type as well...
> According to man section, yes.
> According to various guides, no.
> https://docs.fedoraproject.org/en-US/Fedora/20/html/Networking_Guide/sec-Configure_802_1Q_VLAN_Tagging_ip_Commands.html
>
> Thanks

It does make sense to not require type.  The name must be unique so that
should be enough.  I'll have to respin the kernel side though...

Thanks,

Steve.


Re: [PATCH RFC iproute2-next 1/2] rdma: add 'link add/delete' commands

2018-11-28 Thread Steve Wise



On 11/28/2018 2:08 PM, Leon Romanovsky wrote:
> On Wed, Nov 28, 2018 at 10:02:04PM +0200, Leon Romanovsky wrote:
>> On Wed, Nov 28, 2018 at 01:34:14PM -0600, Steve Wise wrote:
>>> ...
>>>
>>>>>> +rd_prepare_msg(rd, RDMA_NLDEV_CMD_NEWLINK, &seq,
>>>>>> +   (NLM_F_REQUEST | NLM_F_ACK));
>>>>>> +mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, name);
>>>>>> +mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_LINK_TYPE, type);
>>>>>> +mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_NDEV_NAME, dev);
>>>>>> +ret = rd_send_msg(rd);
>>>>>> +if (ret)
>>>>>> +return ret;
>>>>>> +
>>>>>> +ret = rd_recv_msg(rd, link_add_parse_cb, rd, seq);
>>>>>> +if (ret)
>>>>>> +perror(NULL);
>>>>> Why do you need rd_recv_msg()? I think that it is not needed, at least
>>>>> for rename, I didn't need it.
>>>>> https://git.kernel.org/pub/scm/network/iproute2/iproute2-next.git/tree/rdma/dev.c#n244
>>>> To get the response of if it was successfully added.  It provides the
>>>> errno value.
>>> If I don't do the rd_recv_msg, then adding the same name twice fails
>>> without any error notification.  Ditto for deleting a non-existent
>>> link.  So the rd_recv_msg() allows getting the failure reason (and
>>> detecting the failure). 
>>>
>> Shouldn't extack provide such information as part of NLM_F_ACK flag?
>>
>> just shooting into the air, will take more close look tomorrow.
> OK, it was easier than I thought.
>
> You are right, need both send and receive to get the reason.
>
> Can you prepare general function and update rename part too?
> Something like send_receive(...) with dummy callback for receive path.
>
> Thanks

Sure, I'll whip something up for the next version of the patch series...



Re: [PATCH RFC iproute2-next 1/2] rdma: add 'link add/delete' commands

2018-11-28 Thread Steve Wise



On 11/28/2018 2:04 PM, Leon Romanovsky wrote:
> On Wed, Nov 28, 2018 at 01:08:05PM -0600, Steve Wise wrote:
>>
>> On 11/28/2018 12:26 PM, Leon Romanovsky wrote:
>>> On Thu, Sep 13, 2018 at 10:19:21AM -0700, Steve Wise wrote:
>>>> Add new 'link' subcommand 'add' and 'delete' to allow binding a soft-rdma
>>>> device to a netdev interface.
>>>>
>>>> EG:
>>>>
>>>> rdma link add rxe_eth0 type rxe dev eth0
>>>> rdma link delete rxe_eth0
>>>>
>>>> Signed-off-by: Steve Wise 
>>>> ---
>>>>  rdma/link.c  | 106 
>>>> +++
>>>>  rdma/rdma.h  |   1 +
>>>>  rdma/utils.c |   2 +-
>>>>  3 files changed, 108 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/rdma/link.c b/rdma/link.c
>>>> index 7a6d4b7e356d..d4f76b0ce11f 100644
>>>> --- a/rdma/link.c
>>>> +++ b/rdma/link.c
>>>> @@ -14,6 +14,8 @@
>>>>  static int link_help(struct rd *rd)
>>>>  {
>>>>pr_out("Usage: %s link show [DEV/PORT_INDEX]\n", rd->filename);
>>>> +  pr_out("Usage: %s link add NAME type TYPE dev DEV\n", rd->filename);
>>> I suggest to rename "dev" to be "netdev", because we are using "dev" for
>>> ib devices.
>> Yea ok.
>>
>>>> +  pr_out("Usage: %s link delete NAME type TYPE\n", rd->filename);
>>> Why do you need "type" for "delete" command?
>> Because the type is used in the kernel to find the appropriate link
>> ops.  I could change the kernel side to search all types for the device
>> name to delete? 
> I would say, yes.
> It makes "delete" operation more natural.
>
> Thanks

Perhaps.

Note: 'ip link delete' takes a type as well...




Re: [PATCH RFC iproute2-next 1/2] rdma: add 'link add/delete' commands

2018-11-28 Thread Steve Wise
...

>>> +   rd_prepare_msg(rd, RDMA_NLDEV_CMD_NEWLINK, &seq,
>>> +  (NLM_F_REQUEST | NLM_F_ACK));
>>> +   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, name);
>>> +   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_LINK_TYPE, type);
>>> +   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_NDEV_NAME, dev);
>>> +   ret = rd_send_msg(rd);
>>> +   if (ret)
>>> +   return ret;
>>> +
>>> +   ret = rd_recv_msg(rd, link_add_parse_cb, rd, seq);
>>> +   if (ret)
>>> +   perror(NULL);
>> Why do you need rd_recv_msg()? I think that it is not needed, at least
>> for rename, I didn't need it.
>> https://git.kernel.org/pub/scm/network/iproute2/iproute2-next.git/tree/rdma/dev.c#n244
> To get the response of if it was successfully added.  It provides the
> errno value.
If I don't do the rd_recv_msg, then adding the same name twice fails
without any error notification.  Ditto for deleting a non-existent
link.  So the rd_recv_msg() allows getting the failure reason (and
detecting the failure). 




Re: [PATCH RFC iproute2-next 1/2] rdma: add 'link add/delete' commands

2018-11-28 Thread Steve Wise



On 11/28/2018 12:26 PM, Leon Romanovsky wrote:
> On Thu, Sep 13, 2018 at 10:19:21AM -0700, Steve Wise wrote:
>> Add new 'link' subcommand 'add' and 'delete' to allow binding a soft-rdma
>> device to a netdev interface.
>>
>> EG:
>>
>> rdma link add rxe_eth0 type rxe dev eth0
>> rdma link delete rxe_eth0
>>
>> Signed-off-by: Steve Wise 
>> ---
>>  rdma/link.c  | 106 
>> +++
>>  rdma/rdma.h  |   1 +
>>  rdma/utils.c |   2 +-
>>  3 files changed, 108 insertions(+), 1 deletion(-)
>>
>> diff --git a/rdma/link.c b/rdma/link.c
>> index 7a6d4b7e356d..d4f76b0ce11f 100644
>> --- a/rdma/link.c
>> +++ b/rdma/link.c
>> @@ -14,6 +14,8 @@
>>  static int link_help(struct rd *rd)
>>  {
>>  pr_out("Usage: %s link show [DEV/PORT_INDEX]\n", rd->filename);
>> +pr_out("Usage: %s link add NAME type TYPE dev DEV\n", rd->filename);
> I suggest to rename "dev" to be "netdev", because we are using "dev" for
> ib devices.

Yea ok.

>> +pr_out("Usage: %s link delete NAME type TYPE\n", rd->filename);
> Why do you need "type" for "delete" command?

Because the type is used in the kernel to find the appropriate link
ops.  I could change the kernel side to search all types for the device
name to delete? 

>>  return 0;
>>  }
>>
>> @@ -315,10 +317,114 @@ static int link_show(struct rd *rd)
>>  return rd_exec_link(rd, link_one_show, true);
>>  }
>>
>> +static int link_add_parse_cb(const struct nlmsghdr *nlh, void *data)
>> +{
>> +return MNL_CB_OK;
>> +}
>> +
>> +static int link_add(struct rd *rd)
>> +{
>> +char *name;
>> +char *type = NULL;
>> +char *dev = NULL;
>> +uint32_t seq;
>> +int ret;
>> +
>> +if (rd_no_arg(rd)) {
>> +pr_err("No link name was supplied\n");
>> +return -EINVAL;
>> +}
>> +name = rd_argv(rd);
>> +rd_arg_inc(rd);
>> +while (!rd_no_arg(rd)) {
>> +if (rd_argv_match(rd, "type")) {
>> +rd_arg_inc(rd);
>> +type = rd_argv(rd);
>> +} else if (rd_argv_match(rd, "dev")) {
>> +rd_arg_inc(rd);
>> +dev = rd_argv(rd);
>> +} else {
>> +pr_err("Invalid parameter %s\n", rd_argv(rd));
>> +return -EINVAL;
>> +}
>> +rd_arg_inc(rd);
>> +}
>> +if (!type) {
>> +pr_err("No type was supplied\n");
>> +return -EINVAL;
>> +}
>> +if (!dev) {
>> +pr_err("No net device was supplied\n");
>> +return -EINVAL;
>> +}
>> +
>> +rd_prepare_msg(rd, RDMA_NLDEV_CMD_NEWLINK, &seq,
>> +   (NLM_F_REQUEST | NLM_F_ACK));
>> +mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, name);
>> +mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_LINK_TYPE, type);
>> +mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_NDEV_NAME, dev);
>> +ret = rd_send_msg(rd);
>> +if (ret)
>> +return ret;
>> +
>> +ret = rd_recv_msg(rd, link_add_parse_cb, rd, seq);
>> +if (ret)
>> +perror(NULL);
> Why do you need rd_recv_msg()? I think that it is not needed, at least
> for rename, I didn't need it.
> https://git.kernel.org/pub/scm/network/iproute2/iproute2-next.git/tree/rdma/dev.c#n244

To get the response of if it was successfully added.  It provides the
errno value.

>> +return ret;
>> +}
>> +
>> +static int link_del_parse_cb(const struct nlmsghdr *nlh, void *data)
>> +{
>> +return MNL_CB_OK;
>> +}
>> +
>> +static int link_del(struct rd *rd)
>> +{
>> +char *name;
>> +char *type = NULL;
>> +uint32_t seq;
>> +int ret;
>> +
>> +if (rd_no_arg(rd)) {
>> +pr_err("No link type was supplied\n");
>> +return -EINVAL;
>> +}
>> +name = rd_argv(rd);
>> +rd_arg_inc(rd);
>> +while (!rd_no_arg(rd)) {
>> +if (rd_argv_match(rd, "type")) {
>> +rd_arg_inc(rd);
>> +type = rd_argv(rd);
>> +} else {
>> +pr_

[PATCH RFC iproute2-next 2/2] rdma: man page update for link add/delete

2018-11-28 Thread Steve Wise
Update the 'rdma link' man page with 'link add/delete' info.

Signed-off-by: Steve Wise 
---
 man/man8/rdma-link.8 | 51 +++
 1 file changed, 51 insertions(+)

diff --git a/man/man8/rdma-link.8 b/man/man8/rdma-link.8
index 97dd8bb994d2..b994c326dc34 100644
--- a/man/man8/rdma-link.8
+++ b/man/man8/rdma-link.8
@@ -23,6 +23,20 @@ rdma-link \- rdma link configuration
 .RI "[ " DEV/PORT_INDEX " ]"
 
 .ti -8
+.B rdma link add
+.BR NAME
+.BR type
+.BR TYPE
+.BR dev
+.BR NETDEV
+
+.ti -8
+.B rdma link delete
+.RI NAME
+.BR type
+.BR TYPE
+
+.ti -8
 .B rdma link help
 
 .SH "DESCRIPTION"
@@ -33,6 +47,33 @@ rdma-link \- rdma link configuration
 - specifies the RDMA link to show.
 If this argument is omitted all links are listed.
 
+.SS rdma link add NAME type TYPE dev NETDEV - add an rdma link for the 
specified type to the network device
+.sp
+.BR NAME
+- specifies the new name of the rdma link to add
+
+.BR TYPE
+- specifies which rdma type to use.  Link types:
+.sp
+.in +8
+.B rxe
+- Soft RoCE driver
+.sp
+.B siw
+- Soft iWARP driver
+.in -8
+
+.BR NETDEV
+- specifies the network device to which the link is bound
+
+.SS rdma link delete NAME type TYPE - delete an rdma link for the specified 
type
+.PP
+.BR NAME
+- specifies the name of the rdma link to delete
+.PP
+.BR TYPE
+- specifies which rdma type to use
+
 .SH "EXAMPLES"
 .PP
 rdma link show
@@ -45,6 +86,16 @@ rdma link show mlx5_2/1
 Shows the state of specified rdma link.
 .RE
 .PP
+rdma link add siw_eth0 type rxe dev eth0
+.RS 4
+Adds a RXE link named siw_eth0 to network device eth0
+.RE
+.PP
+rdma link del siw_eth0 type rxe
+.RS 4
+Removes RXE link siw_eth0
+.RE
+.PP
 
 .SH SEE ALSO
 .BR rdma (8),
-- 
1.8.3.1



[PATCH RFC iproute2-next 1/2] rdma: add 'link add/delete' commands

2018-11-28 Thread Steve Wise
Add new 'link' subcommand 'add' and 'delete' to allow binding a soft-rdma
device to a netdev interface.

EG:

rdma link add rxe_eth0 type rxe dev eth0
rdma link delete rxe_eth0

Signed-off-by: Steve Wise 
---
 rdma/link.c  | 106 +++
 rdma/rdma.h  |   1 +
 rdma/utils.c |   2 +-
 3 files changed, 108 insertions(+), 1 deletion(-)

diff --git a/rdma/link.c b/rdma/link.c
index 7a6d4b7e356d..d4f76b0ce11f 100644
--- a/rdma/link.c
+++ b/rdma/link.c
@@ -14,6 +14,8 @@
 static int link_help(struct rd *rd)
 {
pr_out("Usage: %s link show [DEV/PORT_INDEX]\n", rd->filename);
+   pr_out("Usage: %s link add NAME type TYPE dev DEV\n", rd->filename);
+   pr_out("Usage: %s link delete NAME type TYPE\n", rd->filename);
return 0;
 }
 
@@ -315,10 +317,114 @@ static int link_show(struct rd *rd)
return rd_exec_link(rd, link_one_show, true);
 }
 
+static int link_add_parse_cb(const struct nlmsghdr *nlh, void *data)
+{
+   return MNL_CB_OK;
+}
+
+static int link_add(struct rd *rd)
+{
+   char *name;
+   char *type = NULL;
+   char *dev = NULL;
+   uint32_t seq;
+   int ret;
+
+   if (rd_no_arg(rd)) {
+   pr_err("No link name was supplied\n");
+   return -EINVAL;
+   }
+   name = rd_argv(rd);
+   rd_arg_inc(rd);
+   while (!rd_no_arg(rd)) {
+   if (rd_argv_match(rd, "type")) {
+   rd_arg_inc(rd);
+   type = rd_argv(rd);
+   } else if (rd_argv_match(rd, "dev")) {
+   rd_arg_inc(rd);
+   dev = rd_argv(rd);
+   } else {
+   pr_err("Invalid parameter %s\n", rd_argv(rd));
+   return -EINVAL;
+   }
+   rd_arg_inc(rd);
+   }
+   if (!type) {
+   pr_err("No type was supplied\n");
+   return -EINVAL;
+   }
+   if (!dev) {
+   pr_err("No net device was supplied\n");
+   return -EINVAL;
+   }
+
+   rd_prepare_msg(rd, RDMA_NLDEV_CMD_NEWLINK, &seq,
+  (NLM_F_REQUEST | NLM_F_ACK));
+   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, name);
+   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_LINK_TYPE, type);
+   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_NDEV_NAME, dev);
+   ret = rd_send_msg(rd);
+   if (ret)
+   return ret;
+
+   ret = rd_recv_msg(rd, link_add_parse_cb, rd, seq);
+   if (ret)
+   perror(NULL);
+   return ret;
+}
+
+static int link_del_parse_cb(const struct nlmsghdr *nlh, void *data)
+{
+   return MNL_CB_OK;
+}
+
+static int link_del(struct rd *rd)
+{
+   char *name;
+   char *type = NULL;
+   uint32_t seq;
+   int ret;
+
+   if (rd_no_arg(rd)) {
+   pr_err("No link type was supplied\n");
+   return -EINVAL;
+   }
+   name = rd_argv(rd);
+   rd_arg_inc(rd);
+   while (!rd_no_arg(rd)) {
+   if (rd_argv_match(rd, "type")) {
+   rd_arg_inc(rd);
+   type = rd_argv(rd);
+   } else {
+   pr_err("Invalid parameter %s\n", rd_argv(rd));
+   return -EINVAL;
+   }
+   rd_arg_inc(rd);
+   }
+   if (!type) {
+   pr_err("No type was supplied\n");
+   return -EINVAL;
+   }
+   rd_prepare_msg(rd, RDMA_NLDEV_CMD_DELLINK, &seq,
+  (NLM_F_REQUEST | NLM_F_ACK));
+   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_DEV_NAME, name);
+   mnl_attr_put_strz(rd->nlh, RDMA_NLDEV_ATTR_LINK_TYPE, type);
+   ret = rd_send_msg(rd);
+   if (ret)
+   return ret;
+
+   ret = rd_recv_msg(rd, link_del_parse_cb, rd, seq);
+   if (ret)
+   perror(NULL);
+   return ret;
+}
+
 int cmd_link(struct rd *rd)
 {
const struct rd_cmd cmds[] = {
{ NULL, link_show },
+   { "add",link_add },
+   { "delete", link_del },
{ "show",   link_show },
{ "list",   link_show },
{ "help",   link_help },
diff --git a/rdma/rdma.h b/rdma/rdma.h
index 547bb5749a39..b5271e423c10 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -79,6 +79,7 @@ struct rd_cmd {
  */
 bool rd_no_arg(struct rd *rd);
 void rd_arg_inc(struct rd *rd);
+bool rd_argv_match(struct rd *rd, const char *pattern);
 
 char *rd_argv(struct rd *rd);
 
diff --git a/rdma/utils.c b/rdma/utils.c
index 61f4aeb1bcf2..41f8f7391279 100644
--- a/rdma/utils.c
+++ b/rdma/utils.c
@@ -32,7 +32,7 @@ int st

[PATCH RFC iproute2-next 0/2] Dynamic rdma link creation

2018-11-28 Thread Steve Wise
This series adds rdmatool support for creating/deleting rdma links.
This will be used, mainly, by soft rdma drivers to allow adding/deleting
rdma links over netdev interfaces.

I've tagged this as "RFC" to start reviewing it, and because it is
dependent on kernel changes which are still under review [1].

Please comment!

Thanks,

Steve.

[1] https://www.spinics.net/lists/linux-rdma/msg71469.html

Steve Wise (2):
  rdma: add 'link add/delete' commands
  rdma: man page update for link add/delete

 man/man8/rdma-link.8 |  51 +
 rdma/link.c  | 106 +++
 rdma/rdma.h  |   1 +
 rdma/utils.c |   2 +-
 4 files changed, 159 insertions(+), 1 deletion(-)

-- 
1.8.3.1



RE: [PATCH iproute2-next] rdma: Refresh help section of resource information

2018-11-01 Thread Steve Wise



> -Original Message-
> From: Leon Romanovsky 
> Sent: Thursday, November 1, 2018 3:35 AM
> To: David Ahern 
> Cc: Leon Romanovsky ; netdev
> ; RDMA mailing list ;
> Stephen Hemminger ; Steve Wise
> 
> Subject: [PATCH iproute2-next] rdma: Refresh help section of resource
> information
> 
> From: Leon Romanovsky 
> 
> After commit 4060e4c0d257 ("rdma: Add PD resource tracking
> information"), the resource information shows PDs and MRs,
> but help pages didn't fully reflect it.
> 
> Signed-off-by: Leon Romanovsky 

Oops.  Thanks.  Looks fine.

Reviewed-by: Steve Wise 




Re: [PATCH rdma-next 1/5] RDMA/core: Provide getter and setter to access IB device name

2018-09-20 Thread Steve Wise



On 9/20/2018 6:21 AM, Leon Romanovsky wrote:
> From: Leon Romanovsky 
> 
> Prepare IB device name field to rename operation by ensuring that all
> accesses to it are protected with lock and users don't see part of name.
> 
> The protection is done with global device_lock because it is used in
> allocation and deallocation phases. At this stage, this lock is not
> busy and easily can be moved to be per-device, once it will be needed.
> 
> Signed-off-by: Leon Romanovsky 
> ---
>  drivers/infiniband/core/device.c | 24 +++-
>  include/rdma/ib_verbs.h  |  8 +++-
>  2 files changed, 30 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/infiniband/core/device.c 
> b/drivers/infiniband/core/device.c
> index 5a680a88aa87..3270cde6d806 100644
> --- a/drivers/infiniband/core/device.c
> +++ b/drivers/infiniband/core/device.c
> @@ -170,6 +170,14 @@ static struct ib_device *__ib_device_get_by_name(const 
> char *name)
>   return NULL;
>  }
> 
> +void ib_device_get_name(struct ib_device *ibdev, char *name)
> +{
> + down_read(&lists_rwsem);
> + strlcpy(name, ibdev->name, IB_DEVICE_NAME_MAX);
> + up_read(&lists_rwsem);
> +}
> +EXPORT_SYMBOL(ib_device_get_name);
> +
>  static int alloc_name(char *name)
>  {
>   unsigned long *inuse;
> @@ -202,6 +210,21 @@ static int alloc_name(char *name)
>   return 0;
>  }
> 
> +int ib_device_alloc_name(struct ib_device *ibdev, const char *pattern)
> +{
> + int ret = 0;
> +
> + mutex_lock(&device_mutex);
> + strlcpy(ibdev->name, pattern, IB_DEVICE_NAME_MAX);
> + if (strchr(ibdev->name, '%'))
> + ret = alloc_name(ibdev->name);
> +
> + mutex_unlock(&device_mutex);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL(ib_device_alloc_name);
> +
>  static void ib_device_release(struct device *device)
>  {
>   struct ib_device *dev = container_of(device, struct ib_device, dev);
> @@ -499,7 +522,6 @@ int ib_register_device(struct ib_device *device,
>   ret = alloc_name(device->name);
>   if (ret)
>   goto out;
> - }

I don't think this is correct...


> 
>   if (ib_device_check_mandatory(device)) {
>   ret = -EINVAL;
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index e764ed1f6025..0e7b9854 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -2260,6 +2260,11 @@ struct ib_device {
>   /* Do not access @dma_device directly from ULP nor from HW drivers. */
>   struct device*dma_device;
> 
> + /*
> +  * Do not access @name directly,
> +  * use ib_device_get_name()/ib_device_alloc_name()
> +  * and don't assume that it can't change after access.
> +  */
>   char  name[IB_DEVICE_NAME_MAX];
> 
>   struct list_head  event_handler_list;
> @@ -2638,7 +2643,8 @@ struct ib_device *ib_alloc_device(size_t size);
>  void ib_dealloc_device(struct ib_device *device);
> 
>  void ib_get_device_fw_str(struct ib_device *device, char *str);
> -
> +int ib_device_alloc_name(struct ib_device *ibdev, const char *pattern);
> +void ib_device_get_name(struct ib_device *ibdev, char *name);
>  int ib_register_device(struct ib_device *device,
>  int (*port_callback)(struct ib_device *,
>   u8, struct kobject *));
> --
> 2.14.4
> 


Re: [PATCH] cxgb4: fix abort_req_rss6 struct

2018-09-10 Thread Steve Wise


(sorry for the late reply, I was out all last week)


On 9/5/2018 5:55 PM, Jason Gunthorpe wrote:
> On Fri, Aug 31, 2018 at 11:52:00AM -0700, Steve Wise wrote:
>> Remove the incorrect WR_HDR field which can cause a misinterpretation
>> of this CPL by ULDs.
> 
> What does that mean?
> 

It means iw_cxgb4 doesn't correctly handle incoming ABORT CPL messages
due to this incorrect struct.


> Is this an -rc patch?
> 

Yes. Since it fixes a3cdaa69e4ae ("cxgb4: Adds CPL support for Shared
Receive Queues") which went into 4.19.

Steve.




[PATCH] cxgb4: fix abort_req_rss6 struct

2018-08-31 Thread Steve Wise
Remove the incorrect WR_HDR field which can cause a misinterpretation
of this CPL by ULDs.

Fixes: a3cdaa69e4ae ("cxgb4: Adds CPL support for Shared Receive Queues")
Signed-off-by: Steve Wise 
---

Dave, Doug, and Jason,

I request this merge through the rdma repo since the only user of this
structure is iw_cxgb4.

---
 drivers/net/ethernet/chelsio/cxgb4/t4_msg.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h 
b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
index b8f75a22fb6c..f152da1ce046 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
@@ -753,7 +753,6 @@ struct cpl_abort_req_rss {
 };
 
 struct cpl_abort_req_rss6 {
-   WR_HDR;
union opcode_tid ot;
__be32 srqidx_status;
 };
-- 
1.8.3.1



RE: [PATCH mlx5-next] RDMA/mlx5: Don't use cached IRQ affinity mask

2018-08-17 Thread Steve Wise
> 
> 
> > Hey Sagi,
> >
> > The patch works allowing connections for the various affinity mappings
> below:
> >
> > One comp_vector per core across all cores, starting with numa-local cores:
> 
> Thanks Steve, is this your "Tested by:" tag?

Sure:

Tested-by: Steve Wise 



RE: [PATCH mlx5-next] RDMA/mlx5: Don't use cached IRQ affinity mask

2018-08-17 Thread Steve Wise
> On 8/16/2018 1:26 PM, Sagi Grimberg wrote:
> >
> >> Let me know if you want me to try this or any particular fix.
> >
> > Steve, can you test this one?
> 
> Yes!  I'll try it out tomorrow.
> 
> Stevo
> 

Hey Sagi,

The patch works allowing connections for the various affinity mappings below:

One comp_vector per core across all cores, starting with numa-local cores:

[ 3798.494963] iw_cxgb4: comp_vector 0, irq 203 mask 0x100
[ 3798.500717] iw_cxgb4: comp_vector 1, irq 204 mask 0x200
[ 3798.506396] iw_cxgb4: comp_vector 2, irq 205 mask 0x400
[ 3798.512043] iw_cxgb4: comp_vector 3, irq 206 mask 0x800
[ 3798.517675] iw_cxgb4: comp_vector 4, irq 207 mask 0x1000
[ 3798.523382] iw_cxgb4: comp_vector 5, irq 208 mask 0x2000
[ 3798.529075] iw_cxgb4: comp_vector 6, irq 209 mask 0x4000
[ 3798.534754] iw_cxgb4: comp_vector 7, irq 210 mask 0x8000
[ 3798.540425] iw_cxgb4: comp_vector 8, irq 211 mask 0x1
[ 3798.545825] iw_cxgb4: comp_vector 9, irq 212 mask 0x2
[ 3798.551231] iw_cxgb4: comp_vector 10, irq 213 mask 0x4
[ 3798.556713] iw_cxgb4: comp_vector 11, irq 214 mask 0x8
[ 3798.562189] iw_cxgb4: comp_vector 12, irq 215 mask 0x10
[ 3798.567755] iw_cxgb4: comp_vector 13, irq 216 mask 0x20
[ 3798.573312] iw_cxgb4: comp_vector 14, irq 217 mask 0x40
[ 3798.578855] iw_cxgb4: comp_vector 15, irq 218 mask 0x80
[ 3798.584384] set->mq_map[0] queue 8 vector 8
[ 3798.588879] set->mq_map[1] queue 9 vector 9
[ 3798.593358] set->mq_map[2] queue 10 vector 10
[ 3798.598008] set->mq_map[3] queue 11 vector 11
[ 3798.602633] set->mq_map[4] queue 12 vector 12
[ 3798.607260] set->mq_map[5] queue 13 vector 13
[ 3798.611872] set->mq_map[6] queue 14 vector 14
[ 3798.616470] set->mq_map[7] queue 15 vector 15
[ 3798.621059] set->mq_map[8] queue 0 vector 0
[ 3798.625460] set->mq_map[9] queue 1 vector 1
[ 3798.629852] set->mq_map[10] queue 2 vector 2
[ 3798.634331] set->mq_map[11] queue 3 vector 3
[ 3798.638796] set->mq_map[12] queue 4 vector 4
[ 3798.643263] set->mq_map[13] queue 5 vector 5
[ 3798.647727] set->mq_map[14] queue 6 vector 6
[ 3798.652197] set->mq_map[15] queue 7 vector 7

One comp_vector per core, but only numa-local cores:

[ 3855.406027] iw_cxgb4: comp_vector 0, irq 203 mask 0x400
[ 3855.411577] iw_cxgb4: comp_vector 1, irq 204 mask 0x800
[ 3855.417057] iw_cxgb4: comp_vector 2, irq 205 mask 0x1000
[ 3855.422618] iw_cxgb4: comp_vector 3, irq 206 mask 0x2000
[ 3855.428176] iw_cxgb4: comp_vector 4, irq 207 mask 0x4000
[ 3855.433731] iw_cxgb4: comp_vector 5, irq 208 mask 0x8000
[ 3855.439293] iw_cxgb4: comp_vector 6, irq 209 mask 0x100
[ 3855.444770] iw_cxgb4: comp_vector 7, irq 210 mask 0x200
[ 3855.450231] iw_cxgb4: comp_vector 8, irq 211 mask 0x400
[ 3855.455691] iw_cxgb4: comp_vector 9, irq 212 mask 0x800
[ 3855.461144] iw_cxgb4: comp_vector 10, irq 213 mask 0x1000
[ 3855.466768] iw_cxgb4: comp_vector 11, irq 214 mask 0x2000
[ 3855.472379] iw_cxgb4: comp_vector 12, irq 215 mask 0x4000
[ 3855.477992] iw_cxgb4: comp_vector 13, irq 216 mask 0x8000
[ 3855.483599] iw_cxgb4: comp_vector 14, irq 217 mask 0x100
[ 3855.489116] iw_cxgb4: comp_vector 15, irq 218 mask 0x200
[ 3855.494644] set->mq_map[0] queue 8 vector 8
[ 3855.499046] set->mq_map[1] queue 9 vector 9
[ 3855.503445] set->mq_map[2] queue 10 vector 10
[ 3855.508025] set->mq_map[3] queue 11 vector 11
[ 3855.512600] set->mq_map[4] queue 12 vector 12
[ 3855.517176] set->mq_map[5] queue 13 vector 13
[ 3855.521750] set->mq_map[6] queue 14 vector 14
[ 3855.526325] set->mq_map[7] queue 15 vector 15
[ 3855.530902] set->mq_map[8] queue 6 vector 6
[ 3855.535306] set->mq_map[9] queue 7 vector 7
[ 3855.539703] set->mq_map[10] queue 0 vector 0
[ 3855.544197] set->mq_map[11] queue 1 vector 1
[ 3855.548670] set->mq_map[12] queue 2 vector 2
[ 3855.553144] set->mq_map[13] queue 3 vector 3
[ 3855.557630] set->mq_map[14] queue 4 vector 4
[ 3855.562105] set->mq_map[15] queue 5 vector 5

Each comp_vector has affinity to all numa-local cores:

[ 4010.002954] iw_cxgb4: comp_vector 0, irq 203 mask 0xff00
[ 4010.008606] iw_cxgb4: comp_vector 1, irq 204 mask 0xff00
[ 4010.014179] iw_cxgb4: comp_vector 2, irq 205 mask 0xff00
[ 4010.019741] iw_cxgb4: comp_vector 3, irq 206 mask 0xff00
[ 4010.025310] iw_cxgb4: comp_vector 4, irq 207 mask 0xff00
[ 4010.030881] iw_cxgb4: comp_vector 5, irq 208 mask 0xff00
[ 4010.036448] iw_cxgb4: comp_vector 6, irq 209 mask 0xff00
[ 4010.042012] iw_cxgb4: comp_vector 7, irq 210 mask 0xff00
[ 4010.047562] iw_cxgb4: comp_vector 8, irq 211 mask 0xff00
[ 4010.053103] iw_cxgb4: comp_vector 9, irq 212 mask 0xff00
[ 4010.058632] iw_cxgb4: comp_vector 10, irq 213 mask 0xff00
[ 4010.064248] iw_cxgb4: comp_vector 11, irq 214 mask 0xff00
[ 4010.069863] iw_cxgb4: comp_vector 12, irq 215 mask 0xff00
[ 4010.075462] iw_cxgb4: comp_vector 13, irq 216 mask 0xff00
[ 4010.081066] iw_cxgb4: comp_vector 14, irq 217 mask 0xff00
[ 4010.086676] iw_cxgb4: comp_vector 15, irq 218 mask 0xff00
[ 4010.092283] set->mq_map[0] queue 8 vector 8
[ 4010.096683] set->mq_map[1] queue 9 vector 9
[ 4010.101085] 

Re: [PATCH mlx5-next] RDMA/mlx5: Don't use cached IRQ affinity mask

2018-08-16 Thread Steve Wise



On 8/16/2018 1:26 PM, Sagi Grimberg wrote:
>
>> Let me know if you want me to try this or any particular fix.
>
> Steve, can you test this one?

Yes!  I'll try it out tomorrow. 

Stevo

> -- 
> [PATCH rfc] block: fix rdma queue mapping
>
> nvme-rdma attempts to map queues based on irq vector affinity.
> However, for some devices, completion vector irq affinity is
> configurable by the user which can break the existing assumption
> that irq vectors are optimally arranged over the host cpu cores.
>
> So we map queues in two stages:
> First map queues according to corresponding to the completion
> vector IRQ affinity taking the first cpu in the vector affinity map.
> if the current irq affinity is arranged such that a vector is not
> assigned to any distinct cpu, we map it to a cpu that is on the same
> node. If numa affinity can not be sufficed, we map it to any unmapped
> cpu we can find. Then, map the remaining cpus in the possible cpumap
> naively.
>
> Signed-off-by: Sagi Grimberg 
> ---
> Steve, can you test out this patch?
>  block/blk-mq-cpumap.c  | 39 +---
>  block/blk-mq-rdma.c    | 80
> +++---
>  include/linux/blk-mq.h |  1 +
>  3 files changed, 93 insertions(+), 27 deletions(-)
>
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index 3eb169f15842..34811db8cba9 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -30,30 +30,35 @@ static int get_first_sibling(unsigned int cpu)
>     return cpu;
>  }
>
> -int blk_mq_map_queues(struct blk_mq_tag_set *set)
> +void blk_mq_map_queue_cpu(struct blk_mq_tag_set *set, unsigned int cpu)
>  {
>     unsigned int *map = set->mq_map;
>     unsigned int nr_queues = set->nr_hw_queues;
> -   unsigned int cpu, first_sibling;
> +   unsigned int first_sibling;
>
> -   for_each_possible_cpu(cpu) {
> -   /*
> -    * First do sequential mapping between CPUs and queues.
> -    * In case we still have CPUs to map, and we have some
> number of
> -    * threads per cores then map sibling threads to the
> same queue for
> -    * performace optimizations.
> -    */
> -   if (cpu < nr_queues) {
> +   /*
> +    * First do sequential mapping between CPUs and queues.
> +    * In case we still have CPUs to map, and we have some number of
> +    * threads per cores then map sibling threads to the same
> queue for
> +    * performace optimizations.
> +    */
> +   if (cpu < nr_queues) {
> +   map[cpu] = cpu_to_queue_index(nr_queues, cpu);
> +   } else {
> +   first_sibling = get_first_sibling(cpu);
> +   if (first_sibling == cpu)
>     map[cpu] = cpu_to_queue_index(nr_queues, cpu);
> -   } else {
> -   first_sibling = get_first_sibling(cpu);
> -   if (first_sibling == cpu)
> -   map[cpu] =
> cpu_to_queue_index(nr_queues, cpu);
> -   else
> -   map[cpu] = map[first_sibling];
> -   }
> +   else
> +   map[cpu] = map[first_sibling];
>     }
> +}
> +
> +int blk_mq_map_queues(struct blk_mq_tag_set *set)
> +{
> +   unsigned int cpu;
>
> +   for_each_possible_cpu(cpu)
> +   blk_mq_map_queue_cpu(set, cpu);
>     return 0;
>  }
>  EXPORT_SYMBOL_GPL(blk_mq_map_queues);
> diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c
> index 996167f1de18..d04cbb1925f5 100644
> --- a/block/blk-mq-rdma.c
> +++ b/block/blk-mq-rdma.c
> @@ -14,6 +14,61 @@
>  #include 
>  #include 
>
> +static int blk_mq_rdma_map_queue(struct blk_mq_tag_set *set,
> +   struct ib_device *dev, int first_vec, unsigned int queue)
> +{
> +   const struct cpumask *mask;
> +   unsigned int cpu;
> +   bool mapped = false;
> +
> +   mask = ib_get_vector_affinity(dev, first_vec + queue);
> +   if (!mask)
> +   return -ENOTSUPP;
> +
> +   /* map with an unmapped cpu according to affinity mask */
> +   for_each_cpu(cpu, mask) {
> +   if (set->mq_map[cpu] == UINT_MAX) {
> +   set->mq_map[cpu] = queue;
> +   mapped = true;
> +   break;
> +   }
> +   }
> +
> +   if (!mapped) {
> +   int n;
> +
> +   /* map with an unmapped cpu in the same numa node */
> +   for_each_node(n) {
> +   const struct cpumask *node_cpumask =
> cpumask_of_node(n);
> +
> +   if (!cpumask_intersects(mask, node_cpumask))
> +   continue;
> +
> +   for_each_cpu(cpu, node_cpumask) {
> +   if (set->mq_map[cpu] == UINT_MAX) {
> +   set->mq_map[cpu] = queue;
> +  

Re: [PATCH mlx5-next] RDMA/mlx5: Don't use cached IRQ affinity mask

2018-08-06 Thread Steve Wise



On 8/1/2018 9:27 AM, Max Gurtovoy wrote:
>
>
> On 8/1/2018 8:12 AM, Sagi Grimberg wrote:
>> Hi Max,
>
> Hi,
>
>>
>>> Yes, since nvmf is the only user of this function.
>>> Still waiting for comments on the suggested patch :)
>>>
>>
>> Sorry for the late response (but I'm on vacation so I have
>> an excuse ;))
>
> NP :) currently the code works..
>
>>
>> I'm thinking that we should avoid trying to find an assignment
>> when stuff like irqbalance daemon is running and changing
>> the affinitization.
>
> but this is exactly what Steve complained and Leon try to fix (and
> break the connection establishment).
> If this is the case and we all agree then we're good without Leon's
> patch and without our suggestions.
>

I don't agree.  Currently setting certain affinity mappings breaks nvme
connectivity.  I don't think that is desirable.  And mlx5 is broken in
that it doesn't allow changing the affinity but silently ignores the
change, which misleads the admin or irqbalance...
 


>>
>> This extension was made to apply optimal affinity assignment
>> when the device irq affinity is lined up in a vector per
>> core.
>>
>> I'm thinking that when we identify this is not the case, we immediately
>> fallback to the default mapping.
>>
>> 1. when we get a mask, if its weight != 1, we fallback.
>> 2. if a queue was left unmapped, we fallback.
>>
>> Maybe something like the following:
>
> did you test it ? I think it will not work since you need to map all
> the queues and all the CPUs.
>
>> -- 
>> diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c
>> index 996167f1de18..1ada6211c55e 100644
>> --- a/block/blk-mq-rdma.c
>> +++ b/block/blk-mq-rdma.c
>> @@ -35,17 +35,26 @@ int blk_mq_rdma_map_queues(struct blk_mq_tag_set
>> *set,
>>  const struct cpumask *mask;
>>  unsigned int queue, cpu;
>>
>> +   /* reset all CPUs mapping */
>> +   for_each_possible_cpu(cpu)
>> +   set->mq_map[cpu] = UINT_MAX;
>> +
>>  for (queue = 0; queue < set->nr_hw_queues; queue++) {
>>  mask = ib_get_vector_affinity(dev, first_vec + queue);
>>  if (!mask)
>>  goto fallback;
>>
>> -   for_each_cpu(cpu, mask)
>> -   set->mq_map[cpu] = queue;
>> +   if (cpumask_weight(mask) != 1)
>> +   goto fallback;
>> +
>> +   cpu = cpumask_first(mask);
>> +   if (set->mq_map[cpu] != UINT_MAX)
>> +   goto fallback;
>> +
>> +   set->mq_map[cpu] = queue;
>>  }
>>
>>  return 0;
>> -
>>   fallback:
>>  return blk_mq_map_queues(set);
>>   }
>> -- 
>
> see attached another algorithem that can improve the mapping (although
> it's not a short one)...
>
> it will try to map according to affinity mask, and also in case the
> mask weight > 1 it will try to be better than the naive mapping I
> suggest in the previous email.
>

Let me know if you want me to try this or any particular fix.

Steve.


Re: [PATCH mlx5-next] RDMA/mlx5: Don't use cached IRQ affinity mask

2018-07-30 Thread Steve Wise



On 7/23/2018 11:53 AM, Max Gurtovoy wrote:
>
>
> On 7/23/2018 7:49 PM, Jason Gunthorpe wrote:
>> On Fri, Jul 20, 2018 at 04:25:32AM +0300, Max Gurtovoy wrote:
>>>
>> [ 2032.194376] nvme nvme0: failed to connect queue: 9 ret=-18
>
> queue 9 is not mapped (overlap).
> please try the bellow:
>

 This seems to work.  Here are three mapping cases:  each vector on its
 own cpu, each vector on 1 cpu within the local numa node, and each
 vector having all cpus in its numa node.  The 2nd mapping looks kinda
 funny, but I think it achieved what you wanted?  And all the cases
 resulted in successful connections.

>>>
>>> Thanks for testing this.
>>> I slightly improved the setting of the left CPUs and actually used
>>> Sagi's
>>> initial proposal.
>>>
>>> Sagi,
>>> please review the attached patch and let me know if I should add your
>>> signature on it.
>>> I'll run some perf test early next week on it (meanwhile I run
>>> login/logout
>>> with different num_queues successfully and irq settings).
>>>
>>> Steve,
>>> It will be great if you can apply the attached in your system and
>>> send your
>>> findings.
>>>
>>> Regards,
>>> Max,
>>
>> So the conlusion to this thread is that Leon's mlx5 patch needs to wait
>> until this block-mq patch is accepted?
>
> Yes, since nvmf is the only user of this function.
> Still waiting for comments on the suggested patch :)

Hey Sagi, what do you think of Max's patch?

Max, should you resend this in a form suitable for merging?

Thanks,

Steve.


Re: [PATCH mlx5-next] RDMA/mlx5: Don't use cached IRQ affinity mask

2018-07-24 Thread Steve Wise


On 7/24/2018 10:24 AM, Steve Wise wrote:
>
> On 7/19/2018 8:25 PM, Max Gurtovoy wrote:
>>>>> [ 2032.194376] nvme nvme0: failed to connect queue: 9 ret=-18
>>>> queue 9 is not mapped (overlap).
>>>> please try the bellow:
>>>>
>>> This seems to work.  Here are three mapping cases:  each vector on its
>>> own cpu, each vector on 1 cpu within the local numa node, and each
>>> vector having all cpus in its numa node.  The 2nd mapping looks kinda
>>> funny, but I think it achieved what you wanted?  And all the cases
>>> resulted in successful connections.
>>>
>> Thanks for testing this.
>> I slightly improved the setting of the left CPUs and actually used
>> Sagi's initial proposal.
>>
>> Sagi,
>> please review the attached patch and let me know if I should add your
>> signature on it.
>> I'll run some perf test early next week on it (meanwhile I run
>> login/logout with different num_queues successfully and irq settings).
>>
>> Steve,
>> It will be great if you can apply the attached in your system and send
>> your findings.
> Sorry, I got side tracked.  I'll try and test this today and report back.
>
> Steve.


###  each vector gets a unique cpu, starting with node-local:

[  754.976577] iw_cxgb4: comp_vector 0, irq 203 mask 0x100
[  754.982378] iw_cxgb4: comp_vector 1, irq 204 mask 0x200
[  754.988167] iw_cxgb4: comp_vector 2, irq 205 mask 0x400
[  754.993935] iw_cxgb4: comp_vector 3, irq 206 mask 0x800
[  754.999686] iw_cxgb4: comp_vector 4, irq 207 mask 0x1000
[  755.005509] iw_cxgb4: comp_vector 5, irq 208 mask 0x2000
[  755.011318] iw_cxgb4: comp_vector 6, irq 209 mask 0x4000
[  755.017124] iw_cxgb4: comp_vector 7, irq 210 mask 0x8000
[  755.022915] iw_cxgb4: comp_vector 8, irq 211 mask 0x1
[  755.028437] iw_cxgb4: comp_vector 9, irq 212 mask 0x2
[  755.033948] iw_cxgb4: comp_vector 10, irq 213 mask 0x4
[  755.039543] iw_cxgb4: comp_vector 11, irq 214 mask 0x8
[  755.045135] iw_cxgb4: comp_vector 12, irq 215 mask 0x10
[  755.050801] iw_cxgb4: comp_vector 13, irq 216 mask 0x20
[  755.056464] iw_cxgb4: comp_vector 14, irq 217 mask 0x40
[  755.062117] iw_cxgb4: comp_vector 15, irq 218 mask 0x80
[  755.067767] blk_mq_rdma_map_queues: set->mq_map[0] queue 8 vector 8
[  755.067767] blk_mq_rdma_map_queues: set->mq_map[1] queue 9 vector 9
[  755.067768] blk_mq_rdma_map_queues: set->mq_map[2] queue 10 vector 10
[  755.067769] blk_mq_rdma_map_queues: set->mq_map[3] queue 11 vector 11
[  755.067769] blk_mq_rdma_map_queues: set->mq_map[4] queue 12 vector 12
[  755.067770] blk_mq_rdma_map_queues: set->mq_map[5] queue 13 vector 13
[  755.067771] blk_mq_rdma_map_queues: set->mq_map[6] queue 14 vector 14
[  755.067772] blk_mq_rdma_map_queues: set->mq_map[7] queue 15 vector 15
[  755.067772] blk_mq_rdma_map_queues: set->mq_map[8] queue 0 vector 0
[  755.067773] blk_mq_rdma_map_queues: set->mq_map[9] queue 1 vector 1
[  755.067774] blk_mq_rdma_map_queues: set->mq_map[10] queue 2 vector 2
[  755.067774] blk_mq_rdma_map_queues: set->mq_map[11] queue 3 vector 3
[  755.067775] blk_mq_rdma_map_queues: set->mq_map[12] queue 4 vector 4
[  755.067775] blk_mq_rdma_map_queues: set->mq_map[13] queue 5 vector 5
[  755.067776] blk_mq_rdma_map_queues: set->mq_map[14] queue 6 vector 6
[  755.06] blk_mq_rdma_map_queues: set->mq_map[15] queue 7 vector 7

###  each vector gets one cpu within the local node:

[  777.590913] iw_cxgb4: comp_vector 0, irq 203 mask 0x400
[  777.596588] iw_cxgb4: comp_vector 1, irq 204 mask 0x800
[  777.602249] iw_cxgb4: comp_vector 2, irq 205 mask 0x1000
[  777.607984] iw_cxgb4: comp_vector 3, irq 206 mask 0x2000
[  777.613708] iw_cxgb4: comp_vector 4, irq 207 mask 0x4000
[  777.619431] iw_cxgb4: comp_vector 5, irq 208 mask 0x8000
[  777.625142] iw_cxgb4: comp_vector 6, irq 209 mask 0x100
[  777.630762] iw_cxgb4: comp_vector 7, irq 210 mask 0x200
[  777.636373] iw_cxgb4: comp_vector 8, irq 211 mask 0x400
[  777.641982] iw_cxgb4: comp_vector 9, irq 212 mask 0x800
[  777.647583] iw_cxgb4: comp_vector 10, irq 213 mask 0x1000
[  777.653353] iw_cxgb4: comp_vector 11, irq 214 mask 0x2000
[  777.659119] iw_cxgb4: comp_vector 12, irq 215 mask 0x4000
[  777.664877] iw_cxgb4: comp_vector 13, irq 216 mask 0x8000
[  777.670628] iw_cxgb4: comp_vector 14, irq 217 mask 0x100
[  777.676289] iw_cxgb4: comp_vector 15, irq 218 mask 0x200
[  777.681946] blk_mq_rdma_map_queues: set->mq_map[0] queue 8 vector 8
[  777.681947] blk_mq_rdma_map_queues: set->mq_map[1] queue 9 vector 9
[  777.681947] blk_mq_rdma_map_queues: set->mq_map[2] queue 10 vector 10
[  777.681948] blk_mq_rdma_map_queues: set->mq_map[3] queue 11 vector 11
[  777.681948] blk_mq_rdma_map_queues: set->mq_map[4] queue 12 vector 12
[  777.681949] blk_mq_rdma_map_queues: set->mq_map[5]

Re: [PATCH mlx5-next] RDMA/mlx5: Don't use cached IRQ affinity mask

2018-07-24 Thread Steve Wise



On 7/19/2018 8:25 PM, Max Gurtovoy wrote:
>
 [ 2032.194376] nvme nvme0: failed to connect queue: 9 ret=-18
>>>
>>> queue 9 is not mapped (overlap).
>>> please try the bellow:
>>>
>>
>> This seems to work.  Here are three mapping cases:  each vector on its
>> own cpu, each vector on 1 cpu within the local numa node, and each
>> vector having all cpus in its numa node.  The 2nd mapping looks kinda
>> funny, but I think it achieved what you wanted?  And all the cases
>> resulted in successful connections.
>>
>
> Thanks for testing this.
> I slightly improved the setting of the left CPUs and actually used
> Sagi's initial proposal.
>
> Sagi,
> please review the attached patch and let me know if I should add your
> signature on it.
> I'll run some perf test early next week on it (meanwhile I run
> login/logout with different num_queues successfully and irq settings).
>
> Steve,
> It will be great if you can apply the attached in your system and send
> your findings.

Sorry, I got side tracked.  I'll try and test this today and report back.

Steve.


Re: [PATCH mlx5-next] RDMA/mlx5: Don't use cached IRQ affinity mask

2018-07-19 Thread Steve Wise



On 7/19/2018 9:50 AM, Max Gurtovoy wrote:
>
>
> On 7/18/2018 10:29 PM, Steve Wise wrote:
>>
>>>
>>> On 7/18/2018 2:38 PM, Sagi Grimberg wrote:
>>>>
>>>>>> IMO we must fulfil the user wish to connect to N queues and not
>>>>>> reduce
>>>>>> it because of affinity overlaps. So in order to push Leon's patch we
>>>>>> must also fix the blk_mq_rdma_map_queues to do a best effort
>>> mapping
>>>>>> according the affinity and map the rest in naive way (in that way we
>>>>>> will *always* map all the queues).
>>>>>
>>>>> That is what I would expect also.   For example, in my node, where
>>>>> there are
>>>>> 16 cpus, and 2 numa nodes, I observe much better nvmf IOPS
>>> performance by
>>>>> setting up my 16 driver completion event queues such that each is
>>>>> bound to a
>>>>> node-local cpu.  So I end up with each nodel-local cpu having 2
>>>>> queues
>>>>> bound
>>>>> to it.   W/O adding support in iw_cxgb4 for ib_get_vector_affinity(),
>>>>> this
>>>>> works fine.   I assumed adding ib_get_vector_affinity() would allow
>>>>> this to
>>>>> all "just work" by default, but I'm running into this connection
>>>>> failure
>>>>> issue.
>>>>>
>>>>> I don't understand exactly what the blk_mq layer is trying to do,
>>>>> but I
>>>>> assume it has ingress event queues and processing that it trying
>>>>> to align
>>>>> with the drivers ingress cq event handling, so everybody stays on the
>>>>> same
>>>>> cpu (or at least node).   But something else is going on.  Is there
>>>>> documentation on how this works somewhere?
>>>>
>>>> Does this (untested) patch help?
>>>
>>> I'm not sure (I'll test it tomorrow) because the issue is the unmapped
>>> queues and not the cpus.
>>> for example, if the affinity of q=6 and q=12 returned the same cpumask
>>> than q=6 will not be mapped and will fail to connect.
>>>
>>
>> Attached is a patch that applies cleanly for me.  It has problems if
>> vectors have affinity to more than 1 cpu:
>>
>> [ 2031.91] iw_cxgb4: comp_vector 0, irq 203 mask 0xff00
>> [ 2031.994706] iw_cxgb4: comp_vector 1, irq 204 mask 0xff00
>> [ 2032.000348] iw_cxgb4: comp_vector 2, irq 205 mask 0xff00
>> [ 2032.005992] iw_cxgb4: comp_vector 3, irq 206 mask 0xff00
>> [ 2032.011629] iw_cxgb4: comp_vector 4, irq 207 mask 0xff00
>> [ 2032.017271] iw_cxgb4: comp_vector 5, irq 208 mask 0xff00
>> [ 2032.022901] iw_cxgb4: comp_vector 6, irq 209 mask 0xff00
>> [ 2032.028514] iw_cxgb4: comp_vector 7, irq 210 mask 0xff00
>> [ 2032.034110] iw_cxgb4: comp_vector 8, irq 211 mask 0xff00
>> [ 2032.039677] iw_cxgb4: comp_vector 9, irq 212 mask 0xff00
>> [ 2032.045244] iw_cxgb4: comp_vector 10, irq 213 mask 0xff00
>> [ 2032.050889] iw_cxgb4: comp_vector 11, irq 214 mask 0xff00
>> [ 2032.056531] iw_cxgb4: comp_vector 12, irq 215 mask 0xff00
>> [ 2032.062174] iw_cxgb4: comp_vector 13, irq 216 mask 0xff00
>> [ 2032.067817] iw_cxgb4: comp_vector 14, irq 217 mask 0xff00
>> [ 2032.073457] iw_cxgb4: comp_vector 15, irq 218 mask 0xff00
>> [ 2032.079102] blk_mq_rdma_map_queues: set->mq_map[0] queue 0 vector 0
>> [ 2032.085621] blk_mq_rdma_map_queues: set->mq_map[1] queue 1 vector 1
>> [ 2032.092139] blk_mq_rdma_map_queues: set->mq_map[2] queue 2 vector 2
>> [ 2032.098658] blk_mq_rdma_map_queues: set->mq_map[3] queue 3 vector 3
>> [ 2032.105177] blk_mq_rdma_map_queues: set->mq_map[4] queue 4 vector 4
>> [ 2032.111689] blk_mq_rdma_map_queues: set->mq_map[5] queue 5 vector 5
>> [ 2032.118208] blk_mq_rdma_map_queues: set->mq_map[6] queue 6 vector 6
>> [ 2032.124728] blk_mq_rdma_map_queues: set->mq_map[7] queue 7 vector 7
>> [ 2032.131246] blk_mq_rdma_map_queues: set->mq_map[8] queue 15 vector 15
>> [ 2032.137938] blk_mq_rdma_map_queues: set->mq_map[9] queue 15 vector 15
>> [ 2032.144629] blk_mq_rdma_map_queues: set->mq_map[10] queue 15
>> vector 15
>> [ 2032.151401] blk_mq_rdma_map_queues: set->mq_map[11] queue 15
>> vector 15
>> [ 2032.158172] blk_mq_rdma_map_queues: set->mq_map[12] queue 15
>> vector 15
>> [ 2032.164940] blk_mq_rdma_map_queues: set->mq_map[13] queue 15
>> vector 15
>> [ 2032.171709] blk_mq_rdma_map_queues: 

RE: [PATCH mlx5-next] RDMA/mlx5: Don't use cached IRQ affinity mask

2018-07-18 Thread Steve Wise

> 
> On 7/18/2018 2:38 PM, Sagi Grimberg wrote:
> >
> >>> IMO we must fulfil the user wish to connect to N queues and not reduce
> >>> it because of affinity overlaps. So in order to push Leon's patch we
> >>> must also fix the blk_mq_rdma_map_queues to do a best effort
> mapping
> >>> according the affinity and map the rest in naive way (in that way we
> >>> will *always* map all the queues).
> >>
> >> That is what I would expect also.   For example, in my node, where
> >> there are
> >> 16 cpus, and 2 numa nodes, I observe much better nvmf IOPS
> performance by
> >> setting up my 16 driver completion event queues such that each is
> >> bound to a
> >> node-local cpu.  So I end up with each nodel-local cpu having 2 queues
> >> bound
> >> to it.   W/O adding support in iw_cxgb4 for ib_get_vector_affinity(),
> >> this
> >> works fine.   I assumed adding ib_get_vector_affinity() would allow
> >> this to
> >> all "just work" by default, but I'm running into this connection failure
> >> issue.
> >>
> >> I don't understand exactly what the blk_mq layer is trying to do, but I
> >> assume it has ingress event queues and processing that it trying to align
> >> with the drivers ingress cq event handling, so everybody stays on the
> >> same
> >> cpu (or at least node).   But something else is going on.  Is there
> >> documentation on how this works somewhere?
> >
> > Does this (untested) patch help?
> 
> I'm not sure (I'll test it tomorrow) because the issue is the unmapped
> queues and not the cpus.
> for example, if the affinity of q=6 and q=12 returned the same cpumask
> than q=6 will not be mapped and will fail to connect.
>

Attached is a patch that applies cleanly for me.  It has problems if vectors 
have affinity to more than 1 cpu:

[ 2031.91] iw_cxgb4: comp_vector 0, irq 203 mask 0xff00
[ 2031.994706] iw_cxgb4: comp_vector 1, irq 204 mask 0xff00
[ 2032.000348] iw_cxgb4: comp_vector 2, irq 205 mask 0xff00
[ 2032.005992] iw_cxgb4: comp_vector 3, irq 206 mask 0xff00
[ 2032.011629] iw_cxgb4: comp_vector 4, irq 207 mask 0xff00
[ 2032.017271] iw_cxgb4: comp_vector 5, irq 208 mask 0xff00
[ 2032.022901] iw_cxgb4: comp_vector 6, irq 209 mask 0xff00
[ 2032.028514] iw_cxgb4: comp_vector 7, irq 210 mask 0xff00
[ 2032.034110] iw_cxgb4: comp_vector 8, irq 211 mask 0xff00
[ 2032.039677] iw_cxgb4: comp_vector 9, irq 212 mask 0xff00
[ 2032.045244] iw_cxgb4: comp_vector 10, irq 213 mask 0xff00
[ 2032.050889] iw_cxgb4: comp_vector 11, irq 214 mask 0xff00
[ 2032.056531] iw_cxgb4: comp_vector 12, irq 215 mask 0xff00
[ 2032.062174] iw_cxgb4: comp_vector 13, irq 216 mask 0xff00
[ 2032.067817] iw_cxgb4: comp_vector 14, irq 217 mask 0xff00
[ 2032.073457] iw_cxgb4: comp_vector 15, irq 218 mask 0xff00
[ 2032.079102] blk_mq_rdma_map_queues: set->mq_map[0] queue 0 vector 0
[ 2032.085621] blk_mq_rdma_map_queues: set->mq_map[1] queue 1 vector 1
[ 2032.092139] blk_mq_rdma_map_queues: set->mq_map[2] queue 2 vector 2
[ 2032.098658] blk_mq_rdma_map_queues: set->mq_map[3] queue 3 vector 3
[ 2032.105177] blk_mq_rdma_map_queues: set->mq_map[4] queue 4 vector 4
[ 2032.111689] blk_mq_rdma_map_queues: set->mq_map[5] queue 5 vector 5
[ 2032.118208] blk_mq_rdma_map_queues: set->mq_map[6] queue 6 vector 6
[ 2032.124728] blk_mq_rdma_map_queues: set->mq_map[7] queue 7 vector 7
[ 2032.131246] blk_mq_rdma_map_queues: set->mq_map[8] queue 15 vector 15
[ 2032.137938] blk_mq_rdma_map_queues: set->mq_map[9] queue 15 vector 15
[ 2032.144629] blk_mq_rdma_map_queues: set->mq_map[10] queue 15 vector 15
[ 2032.151401] blk_mq_rdma_map_queues: set->mq_map[11] queue 15 vector 15
[ 2032.158172] blk_mq_rdma_map_queues: set->mq_map[12] queue 15 vector 15
[ 2032.164940] blk_mq_rdma_map_queues: set->mq_map[13] queue 15 vector 15
[ 2032.171709] blk_mq_rdma_map_queues: set->mq_map[14] queue 15 vector 15
[ 2032.178477] blk_mq_rdma_map_queues: set->mq_map[15] queue 15 vector 15
[ 2032.187409] nvme nvme0: Connect command failed, error wo/DNR bit: -16402
[ 2032.194376] nvme nvme0: failed to connect queue: 9 ret=-18
 
But if I set all my vector affinities single cpus but only those in the same 
numa node, it now works:

[ 2311.884397] iw_cxgb4: comp_vector 0, irq 203 mask 0x100
[ 2311.890103] iw_cxgb4: comp_vector 1, irq 204 mask 0x200
[ 2311.895659] iw_cxgb4: comp_vector 2, irq 205 mask 0x400
[ 2311.901211] iw_cxgb4: comp_vector 3, irq 206 mask 0x800
[ 2311.906758] iw_cxgb4: comp_vector 4, irq 207 mask 0x1000
[ 2311.912390] iw_cxgb4: comp_vector 5, irq 208 mask 0x2000
[ 2311.918014] iw_cxgb4: comp_vector 6, irq 209 mask 0x4000
[ 2311.923627] iw_cxgb4: comp_vector 7, irq 210 mask 0x8000
[ 2311.929213] iw_cxgb4: comp_vector 8, irq 211 mask 0x100
[ 2311.934694] iw_cxgb4: comp_vector 9, irq 212 mask 0x200
[ 2311.940163] iw_cxgb4: comp_vector 10, irq 213 mask 0x400
[ 2311.945716] iw_cxgb4: comp_vector 11, irq 214 mask 0x800
[ 2311.951272] iw_cxgb4: comp_vector 12, irq 215 mask 0x1000
[ 2311.956914] iw_cxgb4: comp_vector 13, irq 216 mask 0x2000
[ 2311.962558] iw_cxgb4: comp_vec

RE: [PATCH mlx5-next] RDMA/mlx5: Don't use cached IRQ affinity mask

2018-07-18 Thread Steve Wise



> -Original Message-
> From: Max Gurtovoy 
> Sent: Wednesday, July 18, 2018 9:14 AM
> To: Sagi Grimberg ; Steve Wise
> ; 'Leon Romanovsky' 
> Cc: 'Doug Ledford' ; 'Jason Gunthorpe'
> ; 'RDMA mailing list' ;
> 'Saeed Mahameed' ; 'linux-netdev'
> 
> Subject: Re: [PATCH mlx5-next] RDMA/mlx5: Don't use cached IRQ affinity
> mask
> 
> 
> 
> On 7/18/2018 2:38 PM, Sagi Grimberg wrote:
> >
> >>> IMO we must fulfil the user wish to connect to N queues and not reduce
> >>> it because of affinity overlaps. So in order to push Leon's patch we
> >>> must also fix the blk_mq_rdma_map_queues to do a best effort
> mapping
> >>> according the affinity and map the rest in naive way (in that way we
> >>> will *always* map all the queues).
> >>
> >> That is what I would expect also.   For example, in my node, where
> >> there are
> >> 16 cpus, and 2 numa nodes, I observe much better nvmf IOPS
> performance by
> >> setting up my 16 driver completion event queues such that each is
> >> bound to a
> >> node-local cpu.  So I end up with each nodel-local cpu having 2 queues
> >> bound
> >> to it.   W/O adding support in iw_cxgb4 for ib_get_vector_affinity(),
> >> this
> >> works fine.   I assumed adding ib_get_vector_affinity() would allow
> >> this to
> >> all "just work" by default, but I'm running into this connection failure
> >> issue.
> >>
> >> I don't understand exactly what the blk_mq layer is trying to do, but I
> >> assume it has ingress event queues and processing that it trying to align
> >> with the drivers ingress cq event handling, so everybody stays on the
> >> same
> >> cpu (or at least node).   But something else is going on.  Is there
> >> documentation on how this works somewhere?
> >
> > Does this (untested) patch help?
> 
> I'm not sure (I'll test it tomorrow) because the issue is the unmapped
> queues and not the cpus.
> for example, if the affinity of q=6 and q=12 returned the same cpumask
> than q=6 will not be mapped and will fail to connect.
> 
> > --
> > diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> > index 3eb169f15842..dbe962cb537d 100644
> > --- a/block/blk-mq-cpumap.c
> > +++ b/block/blk-mq-cpumap.c
> > @@ -30,29 +30,34 @@ static int get_first_sibling(unsigned int cpu)
> >  return cpu;
> >   }
> >
> > -int blk_mq_map_queues(struct blk_mq_tag_set *set)
> > +void blk_mq_map_queue(struct blk_mq_tag_set *set, unsigned int cpu)
> >   {

There is already a static inline function named blk_mq_map_queue() in 
block/blk-mq.h.  Did you mean to replace that?  Or is this just a function name 
conflict?


> >  unsigned int *map = set->mq_map;
> >  unsigned int nr_queues = set->nr_hw_queues;
> > -   unsigned int cpu, first_sibling;
> > +   unsigned int first_sibling;
> >
> > -   for_each_possible_cpu(cpu) {
> > -   /*
> > -* First do sequential mapping between CPUs and queues.
> > -* In case we still have CPUs to map, and we have some
> > number of
> > -* threads per cores then map sibling threads to the
> > same queue for
> > -* performace optimizations.
> > -*/
> > -   if (cpu < nr_queues) {
> > +   /*
> > +* First do sequential mapping between CPUs and queues.
> > +* In case we still have CPUs to map, and we have some number of
> > +* threads per cores then map sibling threads to the same queue for
> > +* performace optimizations.
> > +*/
> > +   if (cpu < nr_queues) {
> > +   map[cpu] = cpu_to_queue_index(nr_queues, cpu);
> > +   } else {
> > +   first_sibling = get_first_sibling(cpu);
> > +   if (first_sibling == cpu)
> >  map[cpu] = cpu_to_queue_index(nr_queues, cpu);
> > -   } else {
> > -   first_sibling = get_first_sibling(cpu);
> > -   if (first_sibling == cpu)
> > -   map[cpu] = cpu_to_queue_index(nr_queues,
> > cpu);
> > -   else
> > -   map[cpu] = map[first_sibling];
> > -   }
> > +   else
> > +   map[cpu] = map[first_sibling];

RE: [PATCH mlx5-next] RDMA/mlx5: Don't use cached IRQ affinity mask

2018-07-17 Thread Steve Wise


> On 7/16/2018 8:08 PM, Steve Wise wrote:
> > Hey Max:
> >
> >
> 
> Hey,
> 
> > On 7/16/2018 11:46 AM, Max Gurtovoy wrote:
> >>
> >>
> >> On 7/16/2018 5:59 PM, Sagi Grimberg wrote:
> >>>
> >>>> Hi,
> >>>> I've tested this patch and seems problematic at this moment.
> >>>
> >>> Problematic how? what are you seeing?
> >>
> >> Connection failures and same error Steve saw:
> >>
> >> [Mon Jul 16 16:19:11 2018] nvme nvme0: Connect command failed, error
> >> wo/DNR bit: -16402
> >> [Mon Jul 16 16:19:11 2018] nvme nvme0: failed to connect queue: 2 ret=-
> 18
> >>
> >>
> >>>
> >>>> maybe this is because of the bug that Steve mentioned in the NVMe
> >>>> mailing list. Sagi mentioned that we should fix it in the NVMe/RDMA
> >>>> initiator and I'll run his suggestion as well.
> >>>
> >>> Is your device irq affinity linear?
> >>
> >> When it's linear and the balancer is stopped the patch works.
> >>
> >>>
> >>>> BTW, when I run the blk_mq_map_queues it works for every irq
> affinity.
> >>>
> >>> But its probably not aligned to the device vector affinity.
> >>
> >> but I guess it's better in some cases.
> >>
> >> I've checked the situation before Leon's patch and set all the vetcors
> >> to CPU 0. In this case (I think that this was the initial report by
> >> Steve), we use the affinity_hint (Israel's and Saeed's patches were we
> >> use dev->priv.irq_info[vector].mask) and it worked fine.
> >>
> >> Steve,
> >> Can you share your configuration (kernel, HCA, affinity map, connect
> >> command, lscpu) ?
> >> I want to repro it in my lab.
> >>
> >
> > - linux-4.18-rc1 + the nvme/nvmet inline_data_size patches + patches to
> > enable ib_get_vector_affinity() in cxgb4 + sagi's patch + leon's mlx5
> > patch so I can change the affinity via procfs.
> 
> ohh, now I understand that you where complaining regarding the affinity
> change reflection to mlx5_ib_get_vector_affinity and not regarding the
> failures on connecting while the affinity overlaps (that is working good
> before Leon's patch).
> So this is a known issue since we used a static hint that never changes
> from dev->priv.irq_info[vector].mask.
> 
> IMO we must fulfil the user wish to connect to N queues and not reduce
> it because of affinity overlaps. So in order to push Leon's patch we
> must also fix the blk_mq_rdma_map_queues to do a best effort mapping
> according the affinity and map the rest in naive way (in that way we
> will *always* map all the queues).

That is what I would expect also.   For example, in my node, where there are
16 cpus, and 2 numa nodes, I observe much better nvmf IOPS performance by
setting up my 16 driver completion event queues such that each is bound to a
node-local cpu.  So I end up with each nodel-local cpu having 2 queues bound
to it.   W/O adding support in iw_cxgb4 for ib_get_vector_affinity(), this
works fine.   I assumed adding ib_get_vector_affinity() would allow this to
all "just work" by default, but I'm running into this connection failure
issue. 

I don't understand exactly what the blk_mq layer is trying to do, but I
assume it has ingress event queues and processing that it trying to align
with the drivers ingress cq event handling, so everybody stays on the same
cpu (or at least node).   But something else is going on.  Is there
documentation on how this works somewhere?

Thanks,

Steve



Re: [PATCH mlx5-next] RDMA/mlx5: Don't use cached IRQ affinity mask

2018-07-16 Thread Steve Wise
Hey Max:


On 7/16/2018 11:46 AM, Max Gurtovoy wrote:
>
>
> On 7/16/2018 5:59 PM, Sagi Grimberg wrote:
>>
>>> Hi,
>>> I've tested this patch and seems problematic at this moment.
>>
>> Problematic how? what are you seeing?
>
> Connection failures and same error Steve saw:
>
> [Mon Jul 16 16:19:11 2018] nvme nvme0: Connect command failed, error
> wo/DNR bit: -16402
> [Mon Jul 16 16:19:11 2018] nvme nvme0: failed to connect queue: 2 ret=-18
>
>
>>
>>> maybe this is because of the bug that Steve mentioned in the NVMe
>>> mailing list. Sagi mentioned that we should fix it in the NVMe/RDMA
>>> initiator and I'll run his suggestion as well.
>>
>> Is your device irq affinity linear?
>
> When it's linear and the balancer is stopped the patch works.
>
>>
>>> BTW, when I run the blk_mq_map_queues it works for every irq affinity.
>>
>> But its probably not aligned to the device vector affinity.
>
> but I guess it's better in some cases.
>
> I've checked the situation before Leon's patch and set all the vetcors
> to CPU 0. In this case (I think that this was the initial report by
> Steve), we use the affinity_hint (Israel's and Saeed's patches were we
> use dev->priv.irq_info[vector].mask) and it worked fine.
>
> Steve,
> Can you share your configuration (kernel, HCA, affinity map, connect
> command, lscpu) ?
> I want to repro it in my lab.
>

- linux-4.18-rc1 + the nvme/nvmet inline_data_size patches + patches to
enable ib_get_vector_affinity() in cxgb4 + sagi's patch + leon's mlx5
patch so I can change the affinity via procfs. 

- mlx5 MT27700 RoCE card, cxgb4 T62100-CR iWARP card

- The system has 2 numa nodes with 8 real cpus in each == 16 cpus all
online.  HT disabled.

- i'm testing over HW loopback for simplicity, so the node is both the
nvme target and host.  Connecting one device like this: nvme connect -t
rdma -a 172.16.2.1 -n nvme-nullb0

- to reproduce the nvme-rdma bug, just map any two hca cq comp vectors
to the same cpu. 

- lscpu output:

[root@stevo1 linux]# lscpu
Architecture:  x86_64
CPU op-mode(s):    32-bit, 64-bit
Byte Order:    Little Endian
CPU(s):    16
On-line CPU(s) list:   0-15
Thread(s) per core:    1
Core(s) per socket:    8
Socket(s): 2
NUMA node(s):  2
Vendor ID: GenuineIntel
CPU family:    6
Model: 45
Model name:    Intel(R) Xeon(R) CPU E5-2687W 0 @ 3.10GHz
Stepping:  7
CPU MHz:   3400.057
CPU max MHz:   3800.
CPU min MHz:   1200.
BogoMIPS:  6200.10
Virtualization:    VT-x
L1d cache: 32K
L1i cache: 32K
L2 cache:  256K
L3 cache:  20480K
NUMA node0 CPU(s): 0-7
NUMA node1 CPU(s): 8-15
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr
pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe
syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good
nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor
ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2
x2apic popcnt tsc_deadline_timer aes xsave avx lahf_lm epb pti
tpr_shadow vnmi flexpriority ept vpid xsaveopt dtherm ida arat pln pts

Steve




Re: iproute2 won't compile without AF_VSOCK

2018-06-19 Thread Steve Wise



On 6/19/2018 3:29 PM, David Ahern wrote:
> On 6/19/18 2:27 PM, David Ahern wrote:
>> On 6/19/18 9:47 AM, Stephen Hemminger wrote:
>>> On Tue, 19 Jun 2018 10:17:45 -0500
>>> Steve Wise  wrote:
>>>
>>>> Hey David,
>>>>
>>>> I'm trying to compile the latest iproute2 on an RHEL-7.3 distro, and it
>>>> fails to compile because AF_VSOCK is not defined.  Should this
>>>> functionality be a configure option to disable it on older distros?
>>>>
>>>>
>>>> Thanks,
>>>>
>>>> Steve.
>>>>
>>>> 
>>>>
>>>> misc
>>>>     CC   ss.o
>>>> ss.c:301:27: error: ‘AF_VSOCK’ undeclared here (not in a function)
>>>>    .families = FAMILY_MASK(AF_VSOCK),
>>>>    ^
>>>> ss.c:252:46: note: in definition of macro ‘FAMILY_MASK’
>>>>  #define FAMILY_MASK(family) ((uint64_t)1 << (family))
>>>>   ^
>>>> ss.c:334:2: error: array index in initializer not of integer type
>>>>   [AF_VSOCK] = {
>>>>   ^
>>>> ss.c:334:2: error: (near initialization for ‘default_afs’)
>>>> make[1]: *** [ss.o] Error 1
>>>> make: *** [all] Error 2
>>>>
>>> Probably should just add an #ifdef to takeout that if not present
>>>
>> Most userspace tools have a compat header for cases like this.
>>
>> #ifndef AF_VSOCK
>> #define AF_VSOCK 40
>> #endif
>>
> Add the above to include//utils.h; AF_MPLS is already there.

I'll send out a patch.

Thanks,

Steve.


iproute2 won't compile without AF_VSOCK

2018-06-19 Thread Steve Wise
Hey David,

I'm trying to compile the latest iproute2 on an RHEL-7.3 distro, and it
fails to compile because AF_VSOCK is not defined.  Should this
functionality be a configure option to disable it on older distros?


Thanks,

Steve.



misc
    CC   ss.o
ss.c:301:27: error: ‘AF_VSOCK’ undeclared here (not in a function)
   .families = FAMILY_MASK(AF_VSOCK),
   ^
ss.c:252:46: note: in definition of macro ‘FAMILY_MASK’
 #define FAMILY_MASK(family) ((uint64_t)1 << (family))
  ^
ss.c:334:2: error: array index in initializer not of integer type
  [AF_VSOCK] = {
  ^
ss.c:334:2: error: (near initialization for ‘default_afs’)
make[1]: *** [ss.o] Error 1
make: *** [all] Error 2



[PATCH v3 iproute2-next 3/3] rdma: update man pages

2018-05-16 Thread Steve Wise
Update the man pages for the resource attributes as well
as the driver-specific attributes.

Signed-off-by: Steve Wise 
---
 man/man8/rdma-resource.8 | 29 ++---
 man/man8/rdma.8  |  2 +-
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/man/man8/rdma-resource.8 b/man/man8/rdma-resource.8
index ff5d25d..40b073d 100644
--- a/man/man8/rdma-resource.8
+++ b/man/man8/rdma-resource.8
@@ -7,13 +7,16 @@ rdma-resource \- rdma resource configuration
 .in +8
 .ti -8
 .B rdma
-.RI "[ " OPTIONS " ]"
-.B resource
-.RI  " { " COMMAND " | "
+.RI "[ " OPTIONS " ] " RESOURCE " { " COMMAND " | "
 .BR help " }"
 .sp
 
 .ti -8
+.IR RESOURCE " := { "
+.BR cm_id " | " cq " | " mr " | " pd " | " qp " }"
+.sp
+
+.ti -8
 .IR OPTIONS " := { "
 \fB\-j\fR[\fIson\fR] |
 \fB\-d\fR[\fIetails\fR] }
@@ -70,11 +73,31 @@ rdma res show qp link mlx5_4/- -d
 Detailed view.
 .RE
 .PP
+rdma res show qp link mlx5_4/- -dd
+.RS 4
+Detailed view including driver-specific details.
+.RE
+.PP
 rdma res show qp link mlx5_4/1 lqpn 0-6
 .RS 4
 Limit to specific Local QPNs.
 .RE
 .PP
+rdma resource show cm_id dst-port 7174
+.RS 4
+Show CM_IDs with destination ip port of 7174.
+.RE
+.PP
+rdma resource show cm_id src-addr 172.16.0.100
+.RS 4
+Show CM_IDs bound to local ip address 172.16.0.100
+.RE
+.PP
+rdma resource show cq pid 30489
+.RS 4
+Show CQs belonging to pid 30489
+.RE
+.PP
 
 .SH SEE ALSO
 .BR rdma (8),
diff --git a/man/man8/rdma.8 b/man/man8/rdma.8
index 6f88f37..12aa149 100644
--- a/man/man8/rdma.8
+++ b/man/man8/rdma.8
@@ -49,7 +49,7 @@ If there were any errors during execution of the commands, 
the application retur
 
 .TP
 .BR "\-d" , " --details"
-Output detailed information.
+Output detailed information.  Adding a second \-d includes driver-specific 
details.
 
 .TP
 .BR "\-p" , " --pretty"
-- 
1.8.3.1



[PATCH v3 iproute2-next 2/3] rdma: print driver resource attributes

2018-05-16 Thread Steve Wise
This enhancement allows printing rdma device-specific state, if provided
by the kernel. This is done in a generic manner, so rdma tool doesn't
need to know about the details of every type of rdma device.

Driver attributes for a rdma resource are in the form of  tuples, where the key is a string and the value can
be any supported driver attribute. The print_type attribute, if present,
provides a print format to use vs the standard print format for the type.
For example, the default print type for a PROVIDER_S32 value is "%d ",
but "0x%x " if the print_type of PRINT_TYPE_HEX is included inthe tuple.

Driver resources are only printed when the -dd flag is present.
If -p is present, then the output is formatted to not exceed 80 columns,
otherwise it is printed as a single row to be grep/awk friendly.

Example output:

# rdma resource show qp lqpn 1028 -dd -p
link cxgb4_0/- lqpn 1028 rqpn 0 type RC state RTS rq-psn 0 sq-psn 0 
path-mig-state MIGRATED pid 0 comm [nvme_rdma]
sqid 1028 flushed 0 memsize 123968 cidx 85 pidx 85 wq_pidx 106 flush_cidx 
85 in_use 0
size 386 flags 0x0 rqid 1029 memsize 16768 cidx 43 pidx 41 wq_pidx 171 msn 
44 rqt_hwaddr 0x2a8a5d00
rqt_size 256 in_use 128 size 130

Signed-off-by: Steve Wise 
---
 rdma/rdma.c  |   9 ++-
 rdma/rdma.h  |  11 
 rdma/res.c   |  30 +++--
 rdma/utils.c | 196 +++
 4 files changed, 224 insertions(+), 22 deletions(-)

diff --git a/rdma/rdma.c b/rdma/rdma.c
index b43e538..010e983 100644
--- a/rdma/rdma.c
+++ b/rdma/rdma.c
@@ -129,13 +129,14 @@ int main(int argc, char **argv)
{ "batch",  required_argument,  NULL, 'b' },
{ NULL, 0, NULL, 0 }
};
+   bool show_driver_details = false;
const char *batch_file = NULL;
bool pretty_output = false;
bool show_details = false;
bool json_output = false;
bool force = false;
-   char *filename;
struct rd rd = {};
+   char *filename;
int opt;
int err;
 
@@ -152,7 +153,10 @@ int main(int argc, char **argv)
pretty_output = true;
break;
case 'd':
-   show_details = true;
+   if (show_details)
+   show_driver_details = true;
+   else
+   show_details = true;
break;
case 'j':
json_output = true;
@@ -180,6 +184,7 @@ int main(int argc, char **argv)
argv += optind;
 
rd.show_details = show_details;
+   rd.show_driver_details = show_driver_details;
rd.json_output = json_output;
rd.pretty_output = pretty_output;
 
diff --git a/rdma/rdma.h b/rdma/rdma.h
index 1908fc4..fcaf9e6 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -55,6 +55,7 @@ struct rd {
char **argv;
char *filename;
bool show_details;
+   bool show_driver_details;
struct list_head dev_map_list;
uint32_t dev_idx;
uint32_t port_idx;
@@ -115,4 +116,14 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void 
*data, uint32_t seq);
 void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq, uint16_t 
flags);
 int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data);
 int rd_attr_cb(const struct nlattr *attr, void *data);
+int rd_attr_check(const struct nlattr *attr, int *typep);
+
+/*
+ * Print helpers
+ */
+void print_driver_table(struct rd *rd, struct nlattr *tb);
+void newline(struct rd *rd);
+void newline_indent(struct rd *rd);
+#define MAX_LINE_LENGTH 80
+
 #endif /* _RDMA_TOOL_H_ */
diff --git a/rdma/res.c b/rdma/res.c
index 1a0aab6..074b992 100644
--- a/rdma/res.c
+++ b/rdma/res.c
@@ -439,10 +439,8 @@ static int res_qp_parse_cb(const struct nlmsghdr *nlh, 
void *data)
if (nla_line[RDMA_NLDEV_ATTR_RES_PID])
free(comm);
 
-   if (rd->json_output)
-   jsonw_end_array(rd->jw);
-   else
-   pr_out("\n");
+   print_driver_table(rd, nla_line[RDMA_NLDEV_ATTR_DRIVER]);
+   newline(rd);
}
return MNL_CB_OK;
 }
@@ -678,10 +676,8 @@ static int res_cm_id_parse_cb(const struct nlmsghdr *nlh, 
void *data)
if (nla_line[RDMA_NLDEV_ATTR_RES_PID])
free(comm);
 
-   if (rd->json_output)
-   jsonw_end_array(rd->jw);
-   else
-   pr_out("\n");
+   print_driver_table(rd, nla_line[RDMA_NLDEV_ATTR_DRIVER]);
+   newline(rd);
}
return MNL_CB_OK;
 }
@@ -804,10 +800,8 @@ static int res_cq_parse_cb(const struct nlmsghdr *nlh, 
void *data)
if (nla_line[RD

[PATCH v3 iproute2-next 1/3] rdma: update rdma_netlink.h to get new driver attributes

2018-05-16 Thread Steve Wise
Pull in the rdma_netlink.h changes from kernel
commits:

25a0ad85156a ("RDMA/nldev: Add explicit pad attribute")
da5c85078215 ("RDMA/nldev: add driver-specific resource tracking)"
0d52d803767e ("RDMA/uapi: Fix uapi breakage")

Signed-off-by: Steve Wise 
---
 rdma/include/uapi/rdma/rdma_netlink.h | 30 +-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/rdma/include/uapi/rdma/rdma_netlink.h 
b/rdma/include/uapi/rdma/rdma_netlink.h
index 60416ed..6513fb8 100644
--- a/rdma/include/uapi/rdma/rdma_netlink.h
+++ b/rdma/include/uapi/rdma/rdma_netlink.h
@@ -249,10 +249,22 @@ enum rdma_nldev_command {
RDMA_NLDEV_NUM_OPS
 };
 
+enum {
+   RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16,
+};
+
+enum rdma_nldev_print_type {
+   RDMA_NLDEV_PRINT_TYPE_UNSPEC,
+   RDMA_NLDEV_PRINT_TYPE_HEX,
+};
+
 enum rdma_nldev_attr {
/* don't change the order or add anything between, this is ABI! */
RDMA_NLDEV_ATTR_UNSPEC,
 
+   /* Pad attribute for 64b alignment */
+   RDMA_NLDEV_ATTR_PAD = RDMA_NLDEV_ATTR_UNSPEC,
+
/* Identifier for ib_device */
RDMA_NLDEV_ATTR_DEV_INDEX,  /* u32 */
 
@@ -387,7 +399,6 @@ enum rdma_nldev_attr {
RDMA_NLDEV_ATTR_RES_PD_ENTRY,   /* nested table */
RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, /* u32 */
RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, /* u32 */
-
/*
 * Provides logical name and index of netdevice which is
 * connected to physical port. This information is relevant
@@ -400,7 +411,24 @@ enum rdma_nldev_attr {
 */
RDMA_NLDEV_ATTR_NDEV_INDEX, /* u32 */
RDMA_NLDEV_ATTR_NDEV_NAME,  /* string */
+   /*
+* driver-specific attributes.
+*/
+   RDMA_NLDEV_ATTR_DRIVER, /* nested table */
+   RDMA_NLDEV_ATTR_DRIVER_ENTRY,   /* nested table */
+   RDMA_NLDEV_ATTR_DRIVER_STRING,  /* string */
+   /*
+* u8 values from enum rdma_nldev_print_type
+*/
+   RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE,  /* u8 */
+   RDMA_NLDEV_ATTR_DRIVER_S32, /* s32 */
+   RDMA_NLDEV_ATTR_DRIVER_U32, /* u32 */
+   RDMA_NLDEV_ATTR_DRIVER_S64, /* s64 */
+   RDMA_NLDEV_ATTR_DRIVER_U64, /* u64 */
 
+   /*
+* Always the end
+*/
RDMA_NLDEV_ATTR_MAX
 };
 #endif /* _RDMA_NETLINK_H */
-- 
1.8.3.1



[PATCH v3 iproute2-next 0/3] RDMA tool driver-specific resource tracking

2018-05-16 Thread Steve Wise
This series enhances the iproute2 rdma tool to include displaying
driver-specific resource attributes.  It is the user-space part of the
kernel driver resource tracking series that has been accepted for merging
into linux-4.18 [1]

If there are no additional review comments, it can now be merged, I think.

Changes since v2:
- resync rdma_netlink.h to fix uapi break

Changes since v1:
- commit log editorial fixes
- cite kernel commits that updated rdma_netlink.h in the 
  iproute2 commit syncing this header
- reorder stack definitions ala "reverse christmas tree"
- correctly handle unknown driver attributes when printing

Changes since v0/rfc:
- changed "provider" to "driver" based on kernel side changes
- updated man pages
- removed "RFC" tag

Thanks,

Steve.

[1] https://www.spinics.net/lists/linux-rdma/msg64199.html

Steve Wise (3):
  rdma: update rdma_netlink.h to get new driver attributes
  rdma: print driver resource attributes
  rdma: update man pages

 man/man8/rdma-resource.8  |  29 -
 man/man8/rdma.8   |   2 +-
 rdma/include/uapi/rdma/rdma_netlink.h |  30 +-
 rdma/rdma.c   |   9 +-
 rdma/rdma.h   |  11 ++
 rdma/res.c|  30 ++
 rdma/utils.c  | 196 ++
 7 files changed, 280 insertions(+), 27 deletions(-)

-- 
1.8.3.1



RE: [PATCH v2 iproute2-next 1/3] rdma: update rdma_netlink.h to get new driver attributes

2018-05-15 Thread Steve Wise


> -Original Message-
> From: David Ahern 
> Sent: Tuesday, May 15, 2018 3:02 PM
> To: Doug Ledford ; Steve Wise
> ; l...@kernel.org
> Cc: step...@networkplumber.org; netdev@vger.kernel.org; linux-
> r...@vger.kernel.org
> Subject: Re: [PATCH v2 iproute2-next 1/3] rdma: update rdma_netlink.h to
> get new driver attributes
> 
> On 5/15/18 2:00 PM, Doug Ledford wrote:
> > I just sent an incremental fix to the list under separate cover.  You
> > can squash that fix into Steve's patch and it should resolve the issue.
> > Or Steve can respin the set.  Either way.
> 
> Once the patch has been committed to the upstream repo, spin a new set
> with the correct header update.

Will do!

Steve.



RE: [PATCH v2 iproute2-next 1/3] rdma: update rdma_netlink.h to get new driver attributes

2018-05-15 Thread Steve Wise


> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org  ow...@vger.kernel.org> On Behalf Of Doug Ledford
> Sent: Tuesday, May 15, 2018 2:53 PM
> To: David Ahern ; Steve Wise
> ; l...@kernel.org
> Cc: step...@networkplumber.org; netdev@vger.kernel.org; linux-
> r...@vger.kernel.org
> Subject: Re: [PATCH v2 iproute2-next 1/3] rdma: update rdma_netlink.h to
> get new driver attributes
> 
> On Tue, 2018-05-15 at 13:37 -0600, David Ahern wrote:
> > On 5/14/18 9:42 AM, Steve Wise wrote:
> > > diff --git a/rdma/include/uapi/rdma/rdma_netlink.h
> b/rdma/include/uapi/rdma/rdma_netlink.h
> > > index 60416ed..40be0d8 100644
> > > --- a/rdma/include/uapi/rdma/rdma_netlink.h
> > > +++ b/rdma/include/uapi/rdma/rdma_netlink.h
> > >
> > > @@ -387,6 +399,20 @@ enum rdma_nldev_attr {
> > >   RDMA_NLDEV_ATTR_RES_PD_ENTRY,   /* nested table */
> > >   RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, /* u32 */
> > >   RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, /* u32 */
> > > + /*
> > > +  * driver-specific attributes.
> > > +  */
> > > + RDMA_NLDEV_ATTR_DRIVER, /* nested table */
> > > + RDMA_NLDEV_ATTR_DRIVER_ENTRY,   /* nested table */
> > > + RDMA_NLDEV_ATTR_DRIVER_STRING,  /* string */
> > > + /*
> > > +  * u8 values from enum rdma_nldev_print_type
> > > +  */
> > > + RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE,  /* u8 */
> > > + RDMA_NLDEV_ATTR_DRIVER_S32, /* s32 */
> > > + RDMA_NLDEV_ATTR_DRIVER_U32, /* u32 */
> > > + RDMA_NLDEV_ATTR_DRIVER_S64, /* s64 */
> > > + RDMA_NLDEV_ATTR_DRIVER_U64, /* u64 */
> >
> > and again here.
> 
> This chunk, however, is a problem.  We'll need to fix that in the kernel
> and in this patch too.

I'll fix this series once I fix the kernel side.  Doug, should I send a patch 
that basically moves the DRIVER attributes to the bottom?

Sorry about missing this!

Steve




RE: [PATCH v2 iproute2-next 1/3] rdma: update rdma_netlink.h to get new driver attributes

2018-05-15 Thread Steve Wise
> On Tue, 2018-05-15 at 13:37 -0600, David Ahern wrote:
> > On 5/14/18 9:42 AM, Steve Wise wrote:
> > > diff --git a/rdma/include/uapi/rdma/rdma_netlink.h
> b/rdma/include/uapi/rdma/rdma_netlink.h
> > > index 60416ed..40be0d8 100644
> > > --- a/rdma/include/uapi/rdma/rdma_netlink.h
> > > +++ b/rdma/include/uapi/rdma/rdma_netlink.h
> > > @@ -249,10 +249,22 @@ enum rdma_nldev_command {
> > >   RDMA_NLDEV_NUM_OPS
> > >  };
> > >
> > > +enum {
> > > + RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16,
> > > +};
> > > +
> > > +enum rdma_nldev_print_type {
> > > + RDMA_NLDEV_PRINT_TYPE_UNSPEC,
> > > + RDMA_NLDEV_PRINT_TYPE_HEX,
> > > +};
> > > +
> > >  enum rdma_nldev_attr {
> > >   /* don't change the order or add anything between, this is ABI! */
> >
> > I asked this before and did not get a response. As the comment above
> > states with an emphasis (!) ...

Sorry David, I missed your question previously. ☹

> >
> > >   RDMA_NLDEV_ATTR_UNSPEC,
> > >
> > > + /* Pad attribute for 64b alignment */
> > > + RDMA_NLDEV_ATTR_PAD = RDMA_NLDEV_ATTR_UNSPEC,
> > > +
> >
> > ... are you really adding new attributes in the middle?
> 
> Not really.  The new item is being explicitly set to the same value as
> the item above it.  It therefore becomes two entries with the same enum
> value.  The rest of the enum is all unchanged.

Correct.

The reason this was done was because a kernel had already been released where 
64b nlattrs were being padded with 0 instead of defining an explicit pad 
attribute.  Jason thought the kernel side should define an explicit PAD 
attribute and use it.  To preserve the ABI we defined it but set it to 0 (aka 
ATTR_UNSPEC aka the first in the enum).  


> 
> > >   /* Identifier for ib_device */
> > >   RDMA_NLDEV_ATTR_DEV_INDEX,  /* u32 */
> > >
> > > @@ -387,6 +399,20 @@ enum rdma_nldev_attr {
> > >   RDMA_NLDEV_ATTR_RES_PD_ENTRY,   /* nested table */
> > >   RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, /* u32 */
> > >   RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, /* u32 */
> > > + /*
> > > +  * driver-specific attributes.
> > > +  */
> > > + RDMA_NLDEV_ATTR_DRIVER, /* nested table */
> > > + RDMA_NLDEV_ATTR_DRIVER_ENTRY,   /* nested table */
> > > + RDMA_NLDEV_ATTR_DRIVER_STRING,  /* string */
> > > + /*
> > > +  * u8 values from enum rdma_nldev_print_type
> > > +  */
> > > + RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE,  /* u8 */
> > > + RDMA_NLDEV_ATTR_DRIVER_S32, /* s32 */
> > > + RDMA_NLDEV_ATTR_DRIVER_U32, /* u32 */
> > > + RDMA_NLDEV_ATTR_DRIVER_S64, /* s64 */
> > > + RDMA_NLDEV_ATTR_DRIVER_U64, /* u64 */
> >
> > and again here.
> >

Ugh, this looks like a mistake maybe due to me rebasing and not noticing this 
commit added the name/index attrs.

5b2cc79de878 leo...@mellanox.com RDMA/nldev: Provide netdevice name and index 

Both of these are in -next to be merged upstream together.  

Should I do anything?




RE: [PATCH v1 iproute2-next 2/3] rdma: print driver resource attributes

2018-05-15 Thread Steve Wise
> 
> On Tue, May 15, 2018 at 12:35:34PM -0400, Doug Ledford wrote:
> > On Mon, 2018-05-14 at 09:51 -0500, Steve Wise wrote:
> > >
> > > On 5/13/2018 8:24 AM, Leon Romanovsky wrote:
> > > > On Mon, May 07, 2018 at 08:53:16AM -0700, Steve Wise wrote:
> > > > > This enhancement allows printing rdma device-specific state, if
> provided
> > > > > by the kernel.  This is done in a generic manner, so rdma tool doesn't
> > > >
> > > > Double space between "." and "This".
> > > >
> > > > > need to know about the details of every type of rdma device.
> > > > >
> > > > > Driver attributes for a rdma resource are in the form of  > > > > [print_type], value> tuples, where the key is a string and the value 
> > > > > can
> > > > > be any supported driver attribute.  The print_type attribute, if
> present,
> > > >
> > > > ditto
> > >
> > > I'll fix these.
> >
> > Fix it if you want, but don't do it because Leon told you to.  A double
> > space after period is perfectly acceptable.
> 
> It is very controversial thing [1],
> 
> "Most style guides indicate that single sentence spacing is proper for
> final or published work today, and most publishers require manuscripts
> to be submitted as they will appear in publication—single
> sentence spaced."
> 
> [1] https://en.wikipedia.org/wiki/Sentence_spacing

We're not writing  a manuscript. 😉  Regardless, I made the changes and they are 
in v2 of the patch series, which I think is probably ready to merge.

Steve.



RE: [PATCH v1 iproute2-next 2/3] rdma: print driver resource attributes

2018-05-15 Thread Steve Wise
> On Tue, May 15, 2018 at 09:31:27AM -0500, Steve Wise wrote:
> > > cap net admin is not high enough privledge to see unhashed kernel
> > > pointers. CAP_RAW_IO? Or follow what printk does?
> > >
> >
> > Do you mean CAP_NET_RAW?  Here's the comments for it:
> 
> Nope..
> 
> > Func restricted_pointer() from lib/vsprintf.c uses CAP_SYSLOG.  The
> comment for CAP_SYSLOG:
> 
> Yikes, yes, that is probably the required logic here, including the
> kptr_restrict = 0 thing
> 

Let's defer the ktpr_restrict issue for now; I want to finish the initial
work this cycle, and adding that will likely take too much time.   I'll use
CAP_SYSLOG and add a FIXME comment.  Ok? 

Steve.



RE: [PATCH v1 iproute2-next 2/3] rdma: print driver resource attributes

2018-05-15 Thread Steve Wise

> From: Jason Gunthorpe 
> On Tue, May 15, 2018 at 08:18:51AM -0500, Steve Wise wrote:
> >
> > > > On Mon, May 14, 2018 at 05:04:26PM -0500, Steve Wise wrote:
> > > > >
> > > > >
> > > > > On 5/14/2018 3:41 PM, Jason Gunthorpe wrote:
> > > > > > On Mon, May 07, 2018 at 08:53:16AM -0700, Steve Wise wrote:
> > > > > >> This enhancement allows printing rdma device-specific state, if
> > > provided
> > > > > >> by the kernel.  This is done in a generic manner, so rdma tool
> > doesn't
> > > > > >> need to know about the details of every type of rdma device.
> > > > > >>
> > > > > >> Driver attributes for a rdma resource are in the form of  > > > > >> [print_type], value> tuples, where the key is a string and the
> > value can
> > > > > >> be any supported driver attribute.  The print_type attribute, if
> > present,
> > > > > >> provides a print format to use vs the standard print format for the
> > > type.
> > > > > >> For example, the default print type for a PROVIDER_S32 value is
> "%d
> > ",
> > > > > >> but "0x%x " if the print_type of PRINT_TYPE_HEX is included inthe
> > > tuple.
> > > > > >>
> > > > > >> Driver resources are only printed when the -dd flag is present.
> > > > > >> If -p is present, then the output is formatted to not exceed 80
> > > columns,
> > > > > >> otherwise it is printed as a single row to be grep/awk friendly.
> > > > > >>
> > > > > >> Example output:
> > > > > >>
> > > > > >> # rdma resource show qp lqpn 1028 -dd -p
> > > > > >> link cxgb4_0/- lqpn 1028 rqpn 0 type RC state RTS rq-psn 0 sq-psn 0
> > > > path-mig-state MIGRATED pid 0 comm [nvme_rdma]
> > > > > >> sqid 1028 flushed 0 memsize 123968 cidx 85 pidx 85 wq_pidx 106
> > > > flush_cidx 85 in_use 0
> > > > > >> size 386 flags 0x0 rqid 1029 memsize 16768 cidx 43 pidx 41
> > wq_pidx
> > > > 171 msn 44 rqt_hwaddr 0x2a8a5d00
> > > > > >> rqt_size 256 in_use 128 size 130 idx 43 wr_id
> > 0x881057c03408 idx
> > > > 40 wr_id 0x881057c033f0
> > > > > > Hey some of these look like kernel pointers.. That is a no-no.. What
> > > > > > is up there?
> > > > >
> > > > > Nothing is defined as a kernel pointer.  But wr_id is often a pointer
> > to
> > > > > the kernel rdma application's context...
> > > > >
> > > > > > The wr_id often contains a pointer, right? So we cannot just pass it
> > > > > > to user space..
> > > > >
> > > > > Hmm.  It is useful for debugging kernel rdma applications.  Perhaps
> > > > > these attrs can be only be sent up by the kernel if the capabilities
> > > > > allow.  But previous review comments of the kernel series, which is
> > now
> > > > > merged, forced me to remove passing the capabilities information to
> > the
> > > > > driver resource fill functions.
> > > > >
> > > > > So what's the right way to do this?
> > > >
> > > > The reviewer asked do not pass to drivers whole CAP_.. bits, because
> > > > they anyway don't need such granularity.
> > > >
> > >
> > > Ok thanks.
> >
> > How's this?
> >
> > diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h
> > index 6379685..2cf9c5c 100644
> > +++ b/include/rdma/restrack.h
> > @@ -66,7 +66,8 @@ struct rdma_restrack_root {
> >  * Allows rdma drivers to add their own restrack attributes.
> >  */
> > int (*fill_res_entry)(struct sk_buff *msg,
> > - struct rdma_restrack_entry *entry);
> > + struct rdma_restrack_entry *entry,
> > + bool net_admin_capable);
> >  };
> 
> cap net admin is not high enough privledge to see unhashed kernel
> pointers. CAP_RAW_IO? Or follow what printk does?
> 

Do you mean CAP_NET_RAW?  Here's the comments for it:

/* Allow use of RAW sockets */
/* Allow use of PACKET sockets */
/* Allow binding to any address for transparent proxying (also via NET_ADMIN) */


Func restricted_pointer() from lib/vsprintf.c uses CAP_SYSLOG.  The comment for 
CAP_SYSLOG:

/* Allow configuring the kernel's syslog (printk behaviour) */

Func kallsyms_show_value() also uses CAP_SYSLOG.  And there is a non-exported 
global kptr_restrict that they use apparently to allow overriding all this for 
profiling.

Here is NET_ADMIN's comments:

/* Allow interface configuration */
/* Allow administration of IP firewall, masquerading and accounting */
/* Allow setting debug option on sockets */
/* Allow modification of routing tables */
/* Allow setting arbitrary process / process group ownership on
   sockets */
/* Allow binding to any address for transparent proxying (also via NET_RAW) */
/* Allow setting TOS (type of service) */
/* Allow setting promiscuous mode */
/* Allow clearing driver statistics */
/* Allow multicasting */
/* Allow read/write of device-specific registers */
/* Allow activation of ATM control sockets */



RE: [PATCH v1 iproute2-next 2/3] rdma: print driver resource attributes

2018-05-15 Thread Steve Wise
> On Mon, May 14, 2018 at 05:04:26PM -0500, Steve Wise wrote:
> >
> >
> > On 5/14/2018 3:41 PM, Jason Gunthorpe wrote:
> > > On Mon, May 07, 2018 at 08:53:16AM -0700, Steve Wise wrote:
> > >> This enhancement allows printing rdma device-specific state, if
provided
> > >> by the kernel.  This is done in a generic manner, so rdma tool
doesn't
> > >> need to know about the details of every type of rdma device.
> > >>
> > >> Driver attributes for a rdma resource are in the form of  > >> [print_type], value> tuples, where the key is a string and the value
can
> > >> be any supported driver attribute.  The print_type attribute, if
present,
> > >> provides a print format to use vs the standard print format for the
type.
> > >> For example, the default print type for a PROVIDER_S32 value is "%d
",
> > >> but "0x%x " if the print_type of PRINT_TYPE_HEX is included inthe
tuple.
> > >>
> > >> Driver resources are only printed when the -dd flag is present.
> > >> If -p is present, then the output is formatted to not exceed 80
columns,
> > >> otherwise it is printed as a single row to be grep/awk friendly.
> > >>
> > >> Example output:
> > >>
> > >> # rdma resource show qp lqpn 1028 -dd -p
> > >> link cxgb4_0/- lqpn 1028 rqpn 0 type RC state RTS rq-psn 0 sq-psn 0
> path-mig-state MIGRATED pid 0 comm [nvme_rdma]
> > >> sqid 1028 flushed 0 memsize 123968 cidx 85 pidx 85 wq_pidx 106
> flush_cidx 85 in_use 0
> > >> size 386 flags 0x0 rqid 1029 memsize 16768 cidx 43 pidx 41
wq_pidx
> 171 msn 44 rqt_hwaddr 0x2a8a5d00
> > >> rqt_size 256 in_use 128 size 130 idx 43 wr_id 0x881057c03408
idx
> 40 wr_id 0x881057c033f0
> > > Hey some of these look like kernel pointers.. That is a no-no.. What
> > > is up there?
> >
> > Nothing is defined as a kernel pointer.  But wr_id is often a pointer to
> > the kernel rdma application's context...
> >
> > > The wr_id often contains a pointer, right? So we cannot just pass it
> > > to user space..
> >
> > Hmm.  It is useful for debugging kernel rdma applications.  Perhaps
> > these attrs can be only be sent up by the kernel if the capabilities
> > allow.  But previous review comments of the kernel series, which is now
> > merged, forced me to remove passing the capabilities information to the
> > driver resource fill functions.
> >
> > So what's the right way to do this?
> 
> The reviewer asked do not pass to drivers whole CAP_.. bits, because
> they anyway don't need such granularity.
> 

Ok thanks.



RE: [PATCH v1 iproute2-next 2/3] rdma: print driver resource attributes

2018-05-15 Thread Steve Wise
 
> > On Mon, May 14, 2018 at 05:04:26PM -0500, Steve Wise wrote:
> > >
> > >
> > > On 5/14/2018 3:41 PM, Jason Gunthorpe wrote:
> > > > On Mon, May 07, 2018 at 08:53:16AM -0700, Steve Wise wrote:
> > > >> This enhancement allows printing rdma device-specific state, if
> provided
> > > >> by the kernel.  This is done in a generic manner, so rdma tool
doesn't
> > > >> need to know about the details of every type of rdma device.
> > > >>
> > > >> Driver attributes for a rdma resource are in the form of  > > >> [print_type], value> tuples, where the key is a string and the
value can
> > > >> be any supported driver attribute.  The print_type attribute, if
present,
> > > >> provides a print format to use vs the standard print format for the
> type.
> > > >> For example, the default print type for a PROVIDER_S32 value is "%d
",
> > > >> but "0x%x " if the print_type of PRINT_TYPE_HEX is included inthe
> tuple.
> > > >>
> > > >> Driver resources are only printed when the -dd flag is present.
> > > >> If -p is present, then the output is formatted to not exceed 80
> columns,
> > > >> otherwise it is printed as a single row to be grep/awk friendly.
> > > >>
> > > >> Example output:
> > > >>
> > > >> # rdma resource show qp lqpn 1028 -dd -p
> > > >> link cxgb4_0/- lqpn 1028 rqpn 0 type RC state RTS rq-psn 0 sq-psn 0
> > path-mig-state MIGRATED pid 0 comm [nvme_rdma]
> > > >> sqid 1028 flushed 0 memsize 123968 cidx 85 pidx 85 wq_pidx 106
> > flush_cidx 85 in_use 0
> > > >> size 386 flags 0x0 rqid 1029 memsize 16768 cidx 43 pidx 41
wq_pidx
> > 171 msn 44 rqt_hwaddr 0x2a8a5d00
> > > >> rqt_size 256 in_use 128 size 130 idx 43 wr_id
0x881057c03408 idx
> > 40 wr_id 0x881057c033f0
> > > > Hey some of these look like kernel pointers.. That is a no-no.. What
> > > > is up there?
> > >
> > > Nothing is defined as a kernel pointer.  But wr_id is often a pointer
to
> > > the kernel rdma application's context...
> > >
> > > > The wr_id often contains a pointer, right? So we cannot just pass it
> > > > to user space..
> > >
> > > Hmm.  It is useful for debugging kernel rdma applications.  Perhaps
> > > these attrs can be only be sent up by the kernel if the capabilities
> > > allow.  But previous review comments of the kernel series, which is
now
> > > merged, forced me to remove passing the capabilities information to
the
> > > driver resource fill functions.
> > >
> > > So what's the right way to do this?
> >
> > The reviewer asked do not pass to drivers whole CAP_.. bits, because
> > they anyway don't need such granularity.
> >
> 
> Ok thanks.

How's this?

diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h
index 6379685..2cf9c5c 100644
--- a/include/rdma/restrack.h
+++ b/include/rdma/restrack.h
@@ -66,7 +66,8 @@ struct rdma_restrack_root {
 * Allows rdma drivers to add their own restrack attributes.
 */
int (*fill_res_entry)(struct sk_buff *msg,
- struct rdma_restrack_entry *entry);
+ struct rdma_restrack_entry *entry,
+ bool net_admin_capable);
 };



Re: [PATCH v1 iproute2-next 2/3] rdma: print driver resource attributes

2018-05-14 Thread Steve Wise


On 5/14/2018 3:41 PM, Jason Gunthorpe wrote:
> On Mon, May 07, 2018 at 08:53:16AM -0700, Steve Wise wrote:
>> This enhancement allows printing rdma device-specific state, if provided
>> by the kernel.  This is done in a generic manner, so rdma tool doesn't
>> need to know about the details of every type of rdma device.
>>
>> Driver attributes for a rdma resource are in the form of > [print_type], value> tuples, where the key is a string and the value can
>> be any supported driver attribute.  The print_type attribute, if present,
>> provides a print format to use vs the standard print format for the type.
>> For example, the default print type for a PROVIDER_S32 value is "%d ",
>> but "0x%x " if the print_type of PRINT_TYPE_HEX is included inthe tuple.
>>
>> Driver resources are only printed when the -dd flag is present.
>> If -p is present, then the output is formatted to not exceed 80 columns,
>> otherwise it is printed as a single row to be grep/awk friendly.
>>
>> Example output:
>>
>> # rdma resource show qp lqpn 1028 -dd -p
>> link cxgb4_0/- lqpn 1028 rqpn 0 type RC state RTS rq-psn 0 sq-psn 0 
>> path-mig-state MIGRATED pid 0 comm [nvme_rdma]
>> sqid 1028 flushed 0 memsize 123968 cidx 85 pidx 85 wq_pidx 106 
>> flush_cidx 85 in_use 0
>> size 386 flags 0x0 rqid 1029 memsize 16768 cidx 43 pidx 41 wq_pidx 171 
>> msn 44 rqt_hwaddr 0x2a8a5d00
>> rqt_size 256 in_use 128 size 130 idx 43 wr_id 0x881057c03408 idx 40 
>> wr_id 0x881057c033f0
> Hey some of these look like kernel pointers.. That is a no-no.. What
> is up there?

Nothing is defined as a kernel pointer.  But wr_id is often a pointer to
the kernel rdma application's context...

> The wr_id often contains a pointer, right? So we cannot just pass it
> to user space..

Hmm.  It is useful for debugging kernel rdma applications.  Perhaps
these attrs can be only be sent up by the kernel if the capabilities
allow.  But previous review comments of the kernel series, which is now
merged, forced me to remove passing the capabilities information to the
driver resource fill functions. 

So what's the right way to do this?

Steve.




[PATCH v2 iproute2-next 3/3] rdma: update man pages

2018-05-14 Thread Steve Wise
Update the man pages for the resource attributes as well
as the driver-specific attributes.

Signed-off-by: Steve Wise 
---
 man/man8/rdma-resource.8 | 29 ++---
 man/man8/rdma.8  |  2 +-
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/man/man8/rdma-resource.8 b/man/man8/rdma-resource.8
index ff5d25d..40b073d 100644
--- a/man/man8/rdma-resource.8
+++ b/man/man8/rdma-resource.8
@@ -7,13 +7,16 @@ rdma-resource \- rdma resource configuration
 .in +8
 .ti -8
 .B rdma
-.RI "[ " OPTIONS " ]"
-.B resource
-.RI  " { " COMMAND " | "
+.RI "[ " OPTIONS " ] " RESOURCE " { " COMMAND " | "
 .BR help " }"
 .sp
 
 .ti -8
+.IR RESOURCE " := { "
+.BR cm_id " | " cq " | " mr " | " pd " | " qp " }"
+.sp
+
+.ti -8
 .IR OPTIONS " := { "
 \fB\-j\fR[\fIson\fR] |
 \fB\-d\fR[\fIetails\fR] }
@@ -70,11 +73,31 @@ rdma res show qp link mlx5_4/- -d
 Detailed view.
 .RE
 .PP
+rdma res show qp link mlx5_4/- -dd
+.RS 4
+Detailed view including driver-specific details.
+.RE
+.PP
 rdma res show qp link mlx5_4/1 lqpn 0-6
 .RS 4
 Limit to specific Local QPNs.
 .RE
 .PP
+rdma resource show cm_id dst-port 7174
+.RS 4
+Show CM_IDs with destination ip port of 7174.
+.RE
+.PP
+rdma resource show cm_id src-addr 172.16.0.100
+.RS 4
+Show CM_IDs bound to local ip address 172.16.0.100
+.RE
+.PP
+rdma resource show cq pid 30489
+.RS 4
+Show CQs belonging to pid 30489
+.RE
+.PP
 
 .SH SEE ALSO
 .BR rdma (8),
diff --git a/man/man8/rdma.8 b/man/man8/rdma.8
index 6f88f37..12aa149 100644
--- a/man/man8/rdma.8
+++ b/man/man8/rdma.8
@@ -49,7 +49,7 @@ If there were any errors during execution of the commands, 
the application retur
 
 .TP
 .BR "\-d" , " --details"
-Output detailed information.
+Output detailed information.  Adding a second \-d includes driver-specific 
details.
 
 .TP
 .BR "\-p" , " --pretty"
-- 
1.8.3.1



[PATCH v2 iproute2-next 0/3] RDMA tool driver-specific resource tracking

2018-05-14 Thread Steve Wise
Hello,

This series enhances the iproute2 rdma tool to include displaying
driver-specific resource attributes.  It is the user-space part of the
kernel driver resource tracking series that has been accepted for merging
into linux-4.18 [1]

If there are no additional review comments, it can now be merged, I think.

Changes since v1:
- commit log editorial fixes
- cite kernel commits that updated rdma_netlink.h in the 
  iproute2 commit syncing this header
- reorder stack definitions ala "reverse christmas tree"
- correctly handle unknown driver attributes when printing

Changes since v0/rfc:
- changed "provider" to "driver" based on kernel side changes
- updated man pages
- removed "RFC" tag

Thanks,

Steve.

[1] https://www.spinics.net/lists/linux-rdma/msg64199.html

Steve Wise (3):
  rdma: update rdma_netlink.h to get new driver attributes
  rdma: print driver resource attributes
  rdma: update man pages

 man/man8/rdma-resource.8  |  29 -
 man/man8/rdma.8   |   2 +-
 rdma/include/uapi/rdma/rdma_netlink.h |  26 +
 rdma/rdma.c   |   9 +-
 rdma/rdma.h   |  11 ++
 rdma/res.c|  30 ++
 rdma/utils.c  | 196 ++
 7 files changed, 277 insertions(+), 26 deletions(-)

-- 
1.8.3.1



[PATCH v2 iproute2-next 1/3] rdma: update rdma_netlink.h to get new driver attributes

2018-05-14 Thread Steve Wise
Pull in the rdma_netlink.h changes from kernel
commits:

25a0ad85156a ("RDMA/nldev: Add explicit pad attribute")
da5c85078215 ("RDMA/nldev: add driver-specific resource tracking)"

Signed-off-by: Steve Wise 
---
 rdma/include/uapi/rdma/rdma_netlink.h | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/rdma/include/uapi/rdma/rdma_netlink.h 
b/rdma/include/uapi/rdma/rdma_netlink.h
index 60416ed..40be0d8 100644
--- a/rdma/include/uapi/rdma/rdma_netlink.h
+++ b/rdma/include/uapi/rdma/rdma_netlink.h
@@ -249,10 +249,22 @@ enum rdma_nldev_command {
RDMA_NLDEV_NUM_OPS
 };
 
+enum {
+   RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16,
+};
+
+enum rdma_nldev_print_type {
+   RDMA_NLDEV_PRINT_TYPE_UNSPEC,
+   RDMA_NLDEV_PRINT_TYPE_HEX,
+};
+
 enum rdma_nldev_attr {
/* don't change the order or add anything between, this is ABI! */
RDMA_NLDEV_ATTR_UNSPEC,
 
+   /* Pad attribute for 64b alignment */
+   RDMA_NLDEV_ATTR_PAD = RDMA_NLDEV_ATTR_UNSPEC,
+
/* Identifier for ib_device */
RDMA_NLDEV_ATTR_DEV_INDEX,  /* u32 */
 
@@ -387,6 +399,20 @@ enum rdma_nldev_attr {
RDMA_NLDEV_ATTR_RES_PD_ENTRY,   /* nested table */
RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, /* u32 */
RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, /* u32 */
+   /*
+* driver-specific attributes.
+*/
+   RDMA_NLDEV_ATTR_DRIVER, /* nested table */
+   RDMA_NLDEV_ATTR_DRIVER_ENTRY,   /* nested table */
+   RDMA_NLDEV_ATTR_DRIVER_STRING,  /* string */
+   /*
+* u8 values from enum rdma_nldev_print_type
+*/
+   RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE,  /* u8 */
+   RDMA_NLDEV_ATTR_DRIVER_S32, /* s32 */
+   RDMA_NLDEV_ATTR_DRIVER_U32, /* u32 */
+   RDMA_NLDEV_ATTR_DRIVER_S64, /* s64 */
+   RDMA_NLDEV_ATTR_DRIVER_U64, /* u64 */
 
/*
 * Provides logical name and index of netdevice which is
-- 
1.8.3.1



[PATCH v2 iproute2-next 2/3] rdma: print driver resource attributes

2018-05-14 Thread Steve Wise
This enhancement allows printing rdma device-specific state, if provided
by the kernel. This is done in a generic manner, so rdma tool doesn't
need to know about the details of every type of rdma device.

Driver attributes for a rdma resource are in the form of  tuples, where the key is a string and the value can
be any supported driver attribute. The print_type attribute, if present,
provides a print format to use vs the standard print format for the type.
For example, the default print type for a PROVIDER_S32 value is "%d ",
but "0x%x " if the print_type of PRINT_TYPE_HEX is included inthe tuple.

Driver resources are only printed when the -dd flag is present.
If -p is present, then the output is formatted to not exceed 80 columns,
otherwise it is printed as a single row to be grep/awk friendly.

Example output:

# rdma resource show qp lqpn 1028 -dd -p
link cxgb4_0/- lqpn 1028 rqpn 0 type RC state RTS rq-psn 0 sq-psn 0 
path-mig-state MIGRATED pid 0 comm [nvme_rdma]
sqid 1028 flushed 0 memsize 123968 cidx 85 pidx 85 wq_pidx 106 flush_cidx 
85 in_use 0
size 386 flags 0x0 rqid 1029 memsize 16768 cidx 43 pidx 41 wq_pidx 171 msn 
44 rqt_hwaddr 0x2a8a5d00
rqt_size 256 in_use 128 size 130 idx 43 wr_id 0x881057c03408 idx 40 
wr_id 0x881057c033f0

Signed-off-by: Steve Wise 
---
 rdma/rdma.c  |   9 ++-
 rdma/rdma.h  |  11 
 rdma/res.c   |  30 +++--
 rdma/utils.c | 196 +++
 4 files changed, 224 insertions(+), 22 deletions(-)

diff --git a/rdma/rdma.c b/rdma/rdma.c
index b43e538..010e983 100644
--- a/rdma/rdma.c
+++ b/rdma/rdma.c
@@ -129,13 +129,14 @@ int main(int argc, char **argv)
{ "batch",  required_argument,  NULL, 'b' },
{ NULL, 0, NULL, 0 }
};
+   bool show_driver_details = false;
const char *batch_file = NULL;
bool pretty_output = false;
bool show_details = false;
bool json_output = false;
bool force = false;
-   char *filename;
struct rd rd = {};
+   char *filename;
int opt;
int err;
 
@@ -152,7 +153,10 @@ int main(int argc, char **argv)
pretty_output = true;
break;
case 'd':
-   show_details = true;
+   if (show_details)
+   show_driver_details = true;
+   else
+   show_details = true;
break;
case 'j':
json_output = true;
@@ -180,6 +184,7 @@ int main(int argc, char **argv)
argv += optind;
 
rd.show_details = show_details;
+   rd.show_driver_details = show_driver_details;
rd.json_output = json_output;
rd.pretty_output = pretty_output;
 
diff --git a/rdma/rdma.h b/rdma/rdma.h
index 1908fc4..fcaf9e6 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -55,6 +55,7 @@ struct rd {
char **argv;
char *filename;
bool show_details;
+   bool show_driver_details;
struct list_head dev_map_list;
uint32_t dev_idx;
uint32_t port_idx;
@@ -115,4 +116,14 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void 
*data, uint32_t seq);
 void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq, uint16_t 
flags);
 int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data);
 int rd_attr_cb(const struct nlattr *attr, void *data);
+int rd_attr_check(const struct nlattr *attr, int *typep);
+
+/*
+ * Print helpers
+ */
+void print_driver_table(struct rd *rd, struct nlattr *tb);
+void newline(struct rd *rd);
+void newline_indent(struct rd *rd);
+#define MAX_LINE_LENGTH 80
+
 #endif /* _RDMA_TOOL_H_ */
diff --git a/rdma/res.c b/rdma/res.c
index 1a0aab6..074b992 100644
--- a/rdma/res.c
+++ b/rdma/res.c
@@ -439,10 +439,8 @@ static int res_qp_parse_cb(const struct nlmsghdr *nlh, 
void *data)
if (nla_line[RDMA_NLDEV_ATTR_RES_PID])
free(comm);
 
-   if (rd->json_output)
-   jsonw_end_array(rd->jw);
-   else
-   pr_out("\n");
+   print_driver_table(rd, nla_line[RDMA_NLDEV_ATTR_DRIVER]);
+   newline(rd);
}
return MNL_CB_OK;
 }
@@ -678,10 +676,8 @@ static int res_cm_id_parse_cb(const struct nlmsghdr *nlh, 
void *data)
if (nla_line[RDMA_NLDEV_ATTR_RES_PID])
free(comm);
 
-   if (rd->json_output)
-   jsonw_end_array(rd->jw);
-   else
-   pr_out("\n");
+   print_driver_table(rd, nla_line[RDMA_NLDEV_ATTR_DRIVER]);
+   newline(rd);
}
return MNL_CB_OK;
 }
@@ -804,10 +800,8 @@ static int res_cq_parse_cb(const struct nlmsghdr *nlh,

Re: [PATCH v1 iproute2-next 1/3] rdma: update rdma_netlink.h to get driver attrs

2018-05-14 Thread Steve Wise


On 5/13/2018 8:15 AM, Leon Romanovsky wrote:
> On Mon, May 07, 2018 at 08:53:10AM -0700, Steve Wise wrote:
>> Signed-off-by: Steve Wise 
>> ---
>>  rdma/include/uapi/rdma/rdma_netlink.h | 37 
>> ++-
>>  1 file changed, 36 insertions(+), 1 deletion(-)
> Please write in commit message something like: "Based on kernel commit
> ", so we will be able to track changes.

Sure.

Thanks,

Steve.


Re: [PATCH v1 iproute2-next 2/3] rdma: print driver resource attributes

2018-05-14 Thread Steve Wise


On 5/13/2018 8:24 AM, Leon Romanovsky wrote:
> On Mon, May 07, 2018 at 08:53:16AM -0700, Steve Wise wrote:
>> This enhancement allows printing rdma device-specific state, if provided
>> by the kernel.  This is done in a generic manner, so rdma tool doesn't
> Double space between "." and "This".
>
>> need to know about the details of every type of rdma device.
>>
>> Driver attributes for a rdma resource are in the form of > [print_type], value> tuples, where the key is a string and the value can
>> be any supported driver attribute.  The print_type attribute, if present,
> ditto

I'll fix these.

>> provides a print format to use vs the standard print format for the type.
>> For example, the default print type for a PROVIDER_S32 value is "%d ",
>> but "0x%x " if the print_type of PRINT_TYPE_HEX is included inthe tuple.
>>
>> Driver resources are only printed when the -dd flag is present.
>> If -p is present, then the output is formatted to not exceed 80 columns,
>> otherwise it is printed as a single row to be grep/awk friendly.
>>
>> Example output:
>>
>> # rdma resource show qp lqpn 1028 -dd -p
>> link cxgb4_0/- lqpn 1028 rqpn 0 type RC state RTS rq-psn 0 sq-psn 0 
>> path-mig-state MIGRATED pid 0 comm [nvme_rdma]
>> sqid 1028 flushed 0 memsize 123968 cidx 85 pidx 85 wq_pidx 106 
>> flush_cidx 85 in_use 0
>> size 386 flags 0x0 rqid 1029 memsize 16768 cidx 43 pidx 41 wq_pidx 171 
>> msn 44 rqt_hwaddr 0x2a8a5d00
>> rqt_size 256 in_use 128 size 130 idx 43 wr_id 0x881057c03408 idx 40 
>> wr_id 0x881057c033f0
>>
>> Signed-off-by: Steve Wise 
>> ---
>>  rdma/rdma.c  |   7 ++-
>>  rdma/rdma.h  |  11 
>>  rdma/res.c   |  30 +++--
>>  rdma/utils.c | 194 
>> +++
>>  4 files changed, 221 insertions(+), 21 deletions(-)
>>
>> diff --git a/rdma/rdma.c b/rdma/rdma.c
>> index b43e538..0155627 100644
>> --- a/rdma/rdma.c
>> +++ b/rdma/rdma.c
>> @@ -132,6 +132,7 @@ int main(int argc, char **argv)
>>  const char *batch_file = NULL;
>>  bool pretty_output = false;
>>  bool show_details = false;
>> +bool show_driver_details = false;
> Reversed Christmas tree please.

Sure. 

>>  bool json_output = false;
>>  bool force = false;
>>  char *filename;
>> @@ -152,7 +153,10 @@ int main(int argc, char **argv)
>>  pretty_output = true;
>>  break;
>>  case 'd':
>> -show_details = true;
>> +if (show_details)
>> +show_driver_details = true;
>> +else
>> +show_details = true;
>>  break;
>>  case 'j':
>>  json_output = true;
>> @@ -180,6 +184,7 @@ int main(int argc, char **argv)
>>  argv += optind;
>>
>>  rd.show_details = show_details;
>> +rd.show_driver_details = show_driver_details;
>>  rd.json_output = json_output;
>>  rd.pretty_output = pretty_output;
>>
>> diff --git a/rdma/rdma.h b/rdma/rdma.h
>> index 1908fc4..fcaf9e6 100644
>> --- a/rdma/rdma.h
>> +++ b/rdma/rdma.h
>> @@ -55,6 +55,7 @@ struct rd {
>>  char **argv;
>>  char *filename;
>>  bool show_details;
>> +bool show_driver_details;
>>  struct list_head dev_map_list;
>>  uint32_t dev_idx;
>>  uint32_t port_idx;
>> @@ -115,4 +116,14 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void 
>> *data, uint32_t seq);
>>  void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq, uint16_t 
>> flags);
>>  int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data);
>>  int rd_attr_cb(const struct nlattr *attr, void *data);
>> +int rd_attr_check(const struct nlattr *attr, int *typep);
>> +
>> +/*
>> + * Print helpers
>> + */
>> +void print_driver_table(struct rd *rd, struct nlattr *tb);
>> +void newline(struct rd *rd);
>> +void newline_indent(struct rd *rd);
>> +#define MAX_LINE_LENGTH 80
>> +
>>  #endif /* _RDMA_TOOL_H_ */
>> diff --git a/rdma/res.c b/rdma/res.c
>> index 1a0aab6..074b992 100644
>> --- a/rdma/res.c
>> +++ b/rdma/res.c
>> @@ -439,10 +439,8 @@ static int res_qp_parse_cb(const struct nlmsghdr *nlh, 
>> void *data)
>>  if (nla_line[RDMA_NLDEV_ATTR_RES_PID])
>>   

Re: [PATCH v1 iproute2-next 2/3] rdma: print driver resource attributes

2018-05-10 Thread Steve Wise

On 5/9/2018 11:08 PM, David Ahern wrote:
> On 5/7/18 9:53 AM, Steve Wise wrote:
>> @@ -152,7 +153,10 @@ int main(int argc, char **argv)
>>  pretty_output = true;
>>  break;
>>  case 'd':
>> -show_details = true;
>> +if (show_details)
>> +show_driver_details = true;
>> +else
>> +show_details = true;
>>  break;
>>  case 'j':
>>  json_output = true;
> The above change should be reflected in the man page.

I did mention it in the man page:

   -d, --details
  Output detailed information.  Adding a second -d includes
driver-specific details.

But I wasn't sure how to show it in the syntax.  Maybe this?

 OPTIONS := { -V[ersion] | -d[etails] [-d[etails]] } -j[son] } -p[retty] }


> Also, the set needs to be respun after I merged master where Stephen
> brought in updates to the uapi files.

Will do.  Thanks for reviewing.

Steve.


[PATCH v1 iproute2-next 3/3] rdma: update man pages

2018-05-07 Thread Steve Wise
Update the man pages for the resource attributes as well
as the driver-specific attributes.

Signed-off-by: Steve Wise 
---
 man/man8/rdma-resource.8 | 29 ++---
 man/man8/rdma.8  |  2 +-
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/man/man8/rdma-resource.8 b/man/man8/rdma-resource.8
index ff5d25d..40b073d 100644
--- a/man/man8/rdma-resource.8
+++ b/man/man8/rdma-resource.8
@@ -7,13 +7,16 @@ rdma-resource \- rdma resource configuration
 .in +8
 .ti -8
 .B rdma
-.RI "[ " OPTIONS " ]"
-.B resource
-.RI  " { " COMMAND " | "
+.RI "[ " OPTIONS " ] " RESOURCE " { " COMMAND " | "
 .BR help " }"
 .sp
 
 .ti -8
+.IR RESOURCE " := { "
+.BR cm_id " | " cq " | " mr " | " pd " | " qp " }"
+.sp
+
+.ti -8
 .IR OPTIONS " := { "
 \fB\-j\fR[\fIson\fR] |
 \fB\-d\fR[\fIetails\fR] }
@@ -70,11 +73,31 @@ rdma res show qp link mlx5_4/- -d
 Detailed view.
 .RE
 .PP
+rdma res show qp link mlx5_4/- -dd
+.RS 4
+Detailed view including driver-specific details.
+.RE
+.PP
 rdma res show qp link mlx5_4/1 lqpn 0-6
 .RS 4
 Limit to specific Local QPNs.
 .RE
 .PP
+rdma resource show cm_id dst-port 7174
+.RS 4
+Show CM_IDs with destination ip port of 7174.
+.RE
+.PP
+rdma resource show cm_id src-addr 172.16.0.100
+.RS 4
+Show CM_IDs bound to local ip address 172.16.0.100
+.RE
+.PP
+rdma resource show cq pid 30489
+.RS 4
+Show CQs belonging to pid 30489
+.RE
+.PP
 
 .SH SEE ALSO
 .BR rdma (8),
diff --git a/man/man8/rdma.8 b/man/man8/rdma.8
index 6f88f37..12aa149 100644
--- a/man/man8/rdma.8
+++ b/man/man8/rdma.8
@@ -49,7 +49,7 @@ If there were any errors during execution of the commands, 
the application retur
 
 .TP
 .BR "\-d" , " --details"
-Output detailed information.
+Output detailed information.  Adding a second \-d includes driver-specific 
details.
 
 .TP
 .BR "\-p" , " --pretty"
-- 
1.8.3.1



[PATCH v1 iproute2-next 2/3] rdma: print driver resource attributes

2018-05-07 Thread Steve Wise
This enhancement allows printing rdma device-specific state, if provided
by the kernel.  This is done in a generic manner, so rdma tool doesn't
need to know about the details of every type of rdma device.

Driver attributes for a rdma resource are in the form of  tuples, where the key is a string and the value can
be any supported driver attribute.  The print_type attribute, if present,
provides a print format to use vs the standard print format for the type.
For example, the default print type for a PROVIDER_S32 value is "%d ",
but "0x%x " if the print_type of PRINT_TYPE_HEX is included inthe tuple.

Driver resources are only printed when the -dd flag is present.
If -p is present, then the output is formatted to not exceed 80 columns,
otherwise it is printed as a single row to be grep/awk friendly.

Example output:

# rdma resource show qp lqpn 1028 -dd -p
link cxgb4_0/- lqpn 1028 rqpn 0 type RC state RTS rq-psn 0 sq-psn 0 
path-mig-state MIGRATED pid 0 comm [nvme_rdma]
sqid 1028 flushed 0 memsize 123968 cidx 85 pidx 85 wq_pidx 106 flush_cidx 
85 in_use 0
size 386 flags 0x0 rqid 1029 memsize 16768 cidx 43 pidx 41 wq_pidx 171 msn 
44 rqt_hwaddr 0x2a8a5d00
rqt_size 256 in_use 128 size 130 idx 43 wr_id 0x881057c03408 idx 40 
wr_id 0x881057c033f0

Signed-off-by: Steve Wise 
---
 rdma/rdma.c  |   7 ++-
 rdma/rdma.h  |  11 
 rdma/res.c   |  30 +++--
 rdma/utils.c | 194 +++
 4 files changed, 221 insertions(+), 21 deletions(-)

diff --git a/rdma/rdma.c b/rdma/rdma.c
index b43e538..0155627 100644
--- a/rdma/rdma.c
+++ b/rdma/rdma.c
@@ -132,6 +132,7 @@ int main(int argc, char **argv)
const char *batch_file = NULL;
bool pretty_output = false;
bool show_details = false;
+   bool show_driver_details = false;
bool json_output = false;
bool force = false;
char *filename;
@@ -152,7 +153,10 @@ int main(int argc, char **argv)
pretty_output = true;
break;
case 'd':
-   show_details = true;
+   if (show_details)
+   show_driver_details = true;
+   else
+   show_details = true;
break;
case 'j':
json_output = true;
@@ -180,6 +184,7 @@ int main(int argc, char **argv)
argv += optind;
 
rd.show_details = show_details;
+   rd.show_driver_details = show_driver_details;
rd.json_output = json_output;
rd.pretty_output = pretty_output;
 
diff --git a/rdma/rdma.h b/rdma/rdma.h
index 1908fc4..fcaf9e6 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -55,6 +55,7 @@ struct rd {
char **argv;
char *filename;
bool show_details;
+   bool show_driver_details;
struct list_head dev_map_list;
uint32_t dev_idx;
uint32_t port_idx;
@@ -115,4 +116,14 @@ int rd_recv_msg(struct rd *rd, mnl_cb_t callback, void 
*data, uint32_t seq);
 void rd_prepare_msg(struct rd *rd, uint32_t cmd, uint32_t *seq, uint16_t 
flags);
 int rd_dev_init_cb(const struct nlmsghdr *nlh, void *data);
 int rd_attr_cb(const struct nlattr *attr, void *data);
+int rd_attr_check(const struct nlattr *attr, int *typep);
+
+/*
+ * Print helpers
+ */
+void print_driver_table(struct rd *rd, struct nlattr *tb);
+void newline(struct rd *rd);
+void newline_indent(struct rd *rd);
+#define MAX_LINE_LENGTH 80
+
 #endif /* _RDMA_TOOL_H_ */
diff --git a/rdma/res.c b/rdma/res.c
index 1a0aab6..074b992 100644
--- a/rdma/res.c
+++ b/rdma/res.c
@@ -439,10 +439,8 @@ static int res_qp_parse_cb(const struct nlmsghdr *nlh, 
void *data)
if (nla_line[RDMA_NLDEV_ATTR_RES_PID])
free(comm);
 
-   if (rd->json_output)
-   jsonw_end_array(rd->jw);
-   else
-   pr_out("\n");
+   print_driver_table(rd, nla_line[RDMA_NLDEV_ATTR_DRIVER]);
+   newline(rd);
}
return MNL_CB_OK;
 }
@@ -678,10 +676,8 @@ static int res_cm_id_parse_cb(const struct nlmsghdr *nlh, 
void *data)
if (nla_line[RDMA_NLDEV_ATTR_RES_PID])
free(comm);
 
-   if (rd->json_output)
-   jsonw_end_array(rd->jw);
-   else
-   pr_out("\n");
+   print_driver_table(rd, nla_line[RDMA_NLDEV_ATTR_DRIVER]);
+   newline(rd);
}
return MNL_CB_OK;
 }
@@ -804,10 +800,8 @@ static int res_cq_parse_cb(const struct nlmsghdr *nlh, 
void *data)
if (nla_line[RDMA_NLDEV_ATTR_RES_PID])
free(comm);
 
-   if (rd->json_output)
-   jsonw_end_array(rd->jw);
-   else
- 

[PATCH v1 iproute2-next 0/3] RDMA tool driver-specific resource tracking

2018-05-07 Thread Steve Wise
Hello,

This series enhances the iproute2 rdma tool to include displaying
driver-specific resource attributes.  It is the user-space part of the
kernel driver resource tracking series that has been accepted for merging
into linux-4.18 [1]

If there are no additional review comments, it can now be merged, I think.

Changes since v0/rfc:
- changed "provider" to "driver" based on kernel side changes
- updated man pages
- removed "RFC" tag

Thanks,

Steve.

[1] https://www.spinics.net/lists/linux-rdma/msg64199.html

Steve Wise (3):
  rdma: update rdma_netlink.h to get driver attrs
  rdma: print driver resource attributes
  rdma: update man pages

 man/man8/rdma-resource.8  |  29 -
 man/man8/rdma.8   |   2 +-
 rdma/include/uapi/rdma/rdma_netlink.h |  37 ++-
 rdma/rdma.c   |   7 +-
 rdma/rdma.h   |  11 ++
 rdma/res.c|  30 ++
 rdma/utils.c  | 194 ++
 7 files changed, 284 insertions(+), 26 deletions(-)

-- 
1.8.3.1



[PATCH v1 iproute2-next 1/3] rdma: update rdma_netlink.h to get driver attrs

2018-05-07 Thread Steve Wise
Signed-off-by: Steve Wise 
---
 rdma/include/uapi/rdma/rdma_netlink.h | 37 ++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/rdma/include/uapi/rdma/rdma_netlink.h 
b/rdma/include/uapi/rdma/rdma_netlink.h
index 45474f1..40be0d8 100644
--- a/rdma/include/uapi/rdma/rdma_netlink.h
+++ b/rdma/include/uapi/rdma/rdma_netlink.h
@@ -249,10 +249,22 @@ enum rdma_nldev_command {
RDMA_NLDEV_NUM_OPS
 };
 
+enum {
+   RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16,
+};
+
+enum rdma_nldev_print_type {
+   RDMA_NLDEV_PRINT_TYPE_UNSPEC,
+   RDMA_NLDEV_PRINT_TYPE_HEX,
+};
+
 enum rdma_nldev_attr {
/* don't change the order or add anything between, this is ABI! */
RDMA_NLDEV_ATTR_UNSPEC,
 
+   /* Pad attribute for 64b alignment */
+   RDMA_NLDEV_ATTR_PAD = RDMA_NLDEV_ATTR_UNSPEC,
+
/* Identifier for ib_device */
RDMA_NLDEV_ATTR_DEV_INDEX,  /* u32 */
 
@@ -387,8 +399,31 @@ enum rdma_nldev_attr {
RDMA_NLDEV_ATTR_RES_PD_ENTRY,   /* nested table */
RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, /* u32 */
RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, /* u32 */
+   /*
+* driver-specific attributes.
+*/
+   RDMA_NLDEV_ATTR_DRIVER, /* nested table */
+   RDMA_NLDEV_ATTR_DRIVER_ENTRY,   /* nested table */
+   RDMA_NLDEV_ATTR_DRIVER_STRING,  /* string */
+   /*
+* u8 values from enum rdma_nldev_print_type
+*/
+   RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE,  /* u8 */
+   RDMA_NLDEV_ATTR_DRIVER_S32, /* s32 */
+   RDMA_NLDEV_ATTR_DRIVER_U32, /* u32 */
+   RDMA_NLDEV_ATTR_DRIVER_S64, /* s64 */
+   RDMA_NLDEV_ATTR_DRIVER_U64, /* u64 */
 
-   /* Netdev information for relevant protocols, like RoCE and iWARP */
+   /*
+* Provides logical name and index of netdevice which is
+* connected to physical port. This information is relevant
+* for RoCE and iWARP.
+*
+* The netdevices which are associated with containers are
+* supposed to be exported together with GID table once it
+* will be exposed through the netlink. Because the
+* associated netdevices are properties of GIDs.
+*/
RDMA_NLDEV_ATTR_NDEV_INDEX, /* u32 */
RDMA_NLDEV_ATTR_NDEV_NAME,  /* string */
 
-- 
1.8.3.1



  1   2   3   4   5   6   >