Re: [PATCH net-next RFC v4 01/15] devlink: Add reload action option to devlink reload command

2020-09-15 Thread Jiri Pirko
Tue, Sep 15, 2020 at 02:56:48PM CEST, mo...@nvidia.com wrote:
>
>On 9/15/2020 12:33 AM, Jakub Kicinski wrote:
>> External email: Use caution opening links or attachments
>> 
>> 
>> On Mon, 14 Sep 2020 09:07:48 +0300 Moshe Shemesh wrote:
>> > @@ -3011,12 +3060,41 @@ static int devlink_nl_cmd_reload(struct sk_buff 
>> > *skb, struct genl_info *info)
>> >return PTR_ERR(dest_net);
>> >}
>> > 
>> > - err = devlink_reload(devlink, dest_net, info->extack);
>> > + if (info->attrs[DEVLINK_ATTR_RELOAD_ACTION])
>> > + action = nla_get_u8(info->attrs[DEVLINK_ATTR_RELOAD_ACTION]);
>> > + else
>> > + action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT;
>> > +
>> > + if (action == DEVLINK_RELOAD_ACTION_UNSPEC || action > 
>> > DEVLINK_RELOAD_ACTION_MAX) {
>> > + NL_SET_ERR_MSG_MOD(info->extack, "Invalid reload action");
>> > + return -EINVAL;
>> > + } else if (!devlink_reload_action_is_supported(devlink, action)) {
>> > + NL_SET_ERR_MSG_MOD(info->extack, "Requested reload action is 
>> > not supported");
>> > + return -EOPNOTSUPP;
>> > + }
>> > +
>> > + err = devlink_reload(devlink, dest_net, action, info->extack, 
>> > &actions_performed);
>> > 
>> >if (dest_net)
>> >put_net(dest_net);
>> > 
>> > - return err;
>> > + if (err)
>> > + return err;
>> > +
>> > + WARN_ON(!actions_performed);
>> > + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
>> > + if (!msg)
>> > + return -ENOMEM;
>> > +
>> > + err = devlink_nl_reload_actions_performed_fill(msg, devlink, 
>> > actions_performed,
>> > +DEVLINK_CMD_RELOAD, 
>> > info->snd_portid,
>> > +info->snd_seq, 0);
>> > + if (err) {
>> > + nlmsg_free(msg);
>> > + return err;
>> > + }
>> > +
>> > + return genlmsg_reply(msg, info);
>> I think generating the reply may break existing users. Only generate
>> the reply if request contained DEVLINK_ATTR_RELOAD_ACTION (or any other
>> new attribute which existing users can't pass).
>
>
>OK, I can do that. But I update stats and generate devlink notification
>anyway, that should be fine, right?

Yes.

>


Re: [PATCH net-next RFC v4 01/15] devlink: Add reload action option to devlink reload command

2020-09-15 Thread Jiri Pirko
Tue, Sep 15, 2020 at 02:12:25PM CEST, mo...@nvidia.com wrote:
>
>On 9/14/2020 3:27 PM, Jiri Pirko wrote:
>> Mon, Sep 14, 2020 at 08:07:48AM CEST, mo...@mellanox.com wrote:

[..]

>> > @@ -7392,6 +7485,11 @@ struct devlink *devlink_alloc(const struct 
>> > devlink_ops *ops, size_t priv_size)
>> >if (!devlink)
>> >return NULL;
>> >devlink->ops = ops;
>> > +  if (devlink_reload_actions_verify(devlink)) {
>> Move this check to the beginning. You don't need devlink instance for
>> the check, just ops.
>
>
>Right, will fix.
>
>> also, your devlink_reload_actions_verify() function returns
>> 0/-ESOMETHING. Treat it accordingly here.
>
>
>Well, yes, but I rather return NULL here since devlink_alloc() failed. If
>devlink_reload_actions_verify() fails it has WARN_ON which will lead the
>driver developer to his bug.

So let the verify() return bool.
My point is, if a function returns 0/-ESOMETHING, you should not check
the return value directly but you should use int err/ret.

>
>> 
>> > +  kfree(devlink);
>> > +  return NULL;
>> > +  }

[...]


Re: [PATCH net-next RFC v4 01/15] devlink: Add reload action option to devlink reload command

2020-09-15 Thread Jakub Kicinski
On Tue, 15 Sep 2020 15:56:48 +0300 Moshe Shemesh wrote:
> On 9/15/2020 12:33 AM, Jakub Kicinski wrote:
> >> + if (err)
> >> + return err;
> >> +
> >> + WARN_ON(!actions_performed);
> >> + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
> >> + if (!msg)
> >> + return -ENOMEM;
> >> +
> >> + err = devlink_nl_reload_actions_performed_fill(msg, devlink, 
> >> actions_performed,
> >> +DEVLINK_CMD_RELOAD, 
> >> info->snd_portid,
> >> +info->snd_seq, 0);
> >> + if (err) {
> >> + nlmsg_free(msg);
> >> + return err;
> >> + }
> >> +
> >> + return genlmsg_reply(msg, info);  
> > I think generating the reply may break existing users. Only generate
> > the reply if request contained DEVLINK_ATTR_RELOAD_ACTION (or any other
> > new attribute which existing users can't pass).  
> 
> OK, I can do that. But I update stats and generate devlink notification 
> anyway, that should be fine, right?

Yes, that should be fine.



Re: [PATCH net-next RFC v4 01/15] devlink: Add reload action option to devlink reload command

2020-09-15 Thread Moshe Shemesh



On 9/15/2020 4:26 PM, Jiri Pirko wrote:

Tue, Sep 15, 2020 at 02:12:25PM CEST, mo...@nvidia.com wrote:

On 9/14/2020 3:27 PM, Jiri Pirko wrote:

Mon, Sep 14, 2020 at 08:07:48AM CEST, mo...@mellanox.com wrote:

[..]


@@ -7392,6 +7485,11 @@ struct devlink *devlink_alloc(const struct devlink_ops 
*ops, size_t priv_size)
if (!devlink)
return NULL;
devlink->ops = ops;
+   if (devlink_reload_actions_verify(devlink)) {

Move this check to the beginning. You don't need devlink instance for
the check, just ops.


Right, will fix.


also, your devlink_reload_actions_verify() function returns
0/-ESOMETHING. Treat it accordingly here.


Well, yes, but I rather return NULL here since devlink_alloc() failed. If
devlink_reload_actions_verify() fails it has WARN_ON which will lead the
driver developer to his bug.

So let the verify() return bool.
My point is, if a function returns 0/-ESOMETHING, you should not check
the return value directly but you should use int err/ret.



OK, will fix.


+   kfree(devlink);
+   return NULL;
+   }

[...]


Re: [PATCH net-next RFC v4 01/15] devlink: Add reload action option to devlink reload command

2020-09-15 Thread Moshe Shemesh



On 9/15/2020 12:33 AM, Jakub Kicinski wrote:

External email: Use caution opening links or attachments


On Mon, 14 Sep 2020 09:07:48 +0300 Moshe Shemesh wrote:

@@ -3011,12 +3060,41 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, 
struct genl_info *info)
   return PTR_ERR(dest_net);
   }

- err = devlink_reload(devlink, dest_net, info->extack);
+ if (info->attrs[DEVLINK_ATTR_RELOAD_ACTION])
+ action = nla_get_u8(info->attrs[DEVLINK_ATTR_RELOAD_ACTION]);
+ else
+ action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT;
+
+ if (action == DEVLINK_RELOAD_ACTION_UNSPEC || action > 
DEVLINK_RELOAD_ACTION_MAX) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Invalid reload action");
+ return -EINVAL;
+ } else if (!devlink_reload_action_is_supported(devlink, action)) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Requested reload action is not 
supported");
+ return -EOPNOTSUPP;
+ }
+
+ err = devlink_reload(devlink, dest_net, action, info->extack, 
&actions_performed);

   if (dest_net)
   put_net(dest_net);

- return err;
+ if (err)
+ return err;
+
+ WARN_ON(!actions_performed);
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_reload_actions_performed_fill(msg, devlink, 
actions_performed,
+DEVLINK_CMD_RELOAD, 
info->snd_portid,
+info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);

I think generating the reply may break existing users. Only generate
the reply if request contained DEVLINK_ATTR_RELOAD_ACTION (or any other
new attribute which existing users can't pass).



OK, I can do that. But I update stats and generate devlink notification 
anyway, that should be fine, right?




Re: [PATCH net-next RFC v4 01/15] devlink: Add reload action option to devlink reload command

2020-09-15 Thread Moshe Shemesh



On 9/14/2020 3:27 PM, Jiri Pirko wrote:

Mon, Sep 14, 2020 at 08:07:48AM CEST, mo...@mellanox.com wrote:

Add devlink reload action to allow the user to request a specific reload
action. The action parameter is optional, if not specified then devlink
driver re-init action is used (backward compatible).
Note that when required to do firmware activation some drivers may need
to reload the driver. On the other hand some drivers may need to reset
the firmware to reinitialize the driver entities. Therefore, the devlink
reload command returns the actions which were actually performed.
Reload actions supported are:
driver_reinit: driver entities re-initialization, applying devlink-param
   and devlink-resource values.
fw_activate: firmware activate.

command examples:
$devlink dev reload pci/0000:82:00.0 action driver_reinit
reload_actions_performed:
  driver_reinit

$devlink dev reload pci/0000:82:00.0 action fw_activate
reload_actions_performed:
  driver_reinit fw_activate

Signed-off-by: Moshe Shemesh 
---
v3 -> v4:
- Removed fw_activate_no_reset as an action (next patch adds limit
  levels instead).
- Renamed actions_done to actions_performed
v2 -> v3:
- Replace fw_live_patch action by fw_activate_no_reset
- Devlink reload returns the actions done over netlink reply
v1 -> v2:
- Instead of reload levels driver,fw_reset,fw_live_patch have reload
  actions driver_reinit,fw_activate,fw_live_patch
- Remove driver default level, the action driver_reinit is the default
  action for all drivers
---
drivers/net/ethernet/mellanox/mlx4/main.c |  14 ++-
.../net/ethernet/mellanox/mlx5/core/devlink.c |  15 ++-
drivers/net/ethernet/mellanox/mlxsw/core.c|  25 ++--
drivers/net/netdevsim/dev.c   |  16 ++-
include/net/devlink.h |   7 +-
include/uapi/linux/devlink.h  |  19 +++
net/core/devlink.c| 111 +-
7 files changed, 180 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c 
b/drivers/net/ethernet/mellanox/mlx4/main.c
index 70cf24ba71e4..aadf1676a0ed 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -3946,6 +3946,7 @@ static int mlx4_restart_one_up(struct pci_dev *pdev, bool 
reload,
   struct devlink *devlink);

static int mlx4_devlink_reload_down(struct devlink *devlink, bool netns_change,
+   enum devlink_reload_action action,
struct netlink_ext_ack *extack)
{
struct mlx4_priv *priv = devlink_priv(devlink);
@@ -3962,8 +3963,8 @@ static int mlx4_devlink_reload_down(struct devlink 
*devlink, bool netns_change,
return 0;
}

-static int mlx4_devlink_reload_up(struct devlink *devlink,
- struct netlink_ext_ack *extack)
+static int mlx4_devlink_reload_up(struct devlink *devlink, enum 
devlink_reload_action action,
+ struct netlink_ext_ack *extack, unsigned long 
*actions_performed)
{
struct mlx4_priv *priv = devlink_priv(devlink);
struct mlx4_dev *dev = &priv->dev;
@@ -3971,15 +3972,20 @@ static int mlx4_devlink_reload_up(struct devlink 
*devlink,
int err;

err = mlx4_restart_one_up(persist->pdev, true, devlink);
-   if (err)
+   if (err) {
mlx4_err(persist->dev, "mlx4_restart_one_up failed, ret=%d\n",
 err);
+   return err;
+   }
+   if (actions_performed)

Nit, pass the unsigned long always (even when it would be unused) and
avoid check in every driver.



Ack.


+   *actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT);

-   return err;
+   return 0;
}

static const struct devlink_ops mlx4_devlink_ops = {
.port_type_set  = mlx4_devlink_port_type_set,
+   .supported_reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT),
.reload_down= mlx4_devlink_reload_down,
.reload_up  = mlx4_devlink_reload_up,
};

[..]



@@ -2969,29 +2975,72 @@ bool devlink_is_reload_failed(const struct devlink 
*devlink)
EXPORT_SYMBOL_GPL(devlink_is_reload_failed);

static int devlink_reload(struct devlink *devlink, struct net *dest_net,
- struct netlink_ext_ack *extack)
+ enum devlink_reload_action action, struct 
netlink_ext_ack *extack,
+ unsigned long *actions_performed)
{
int err;

if (!devlink->reload_enabled)
return -EOPNOTSUPP;

-   err = devlink->ops->reload_down(devlink, !!dest_net, extack);
+   err = devlink->ops->reload_down(devlink, !!dest_net, action, extack);
if (err)
return err;

if (dest_net && !net_eq(dest_net, devlink_net(devlink)))
devlink_reload_netns_change(devlink, dest_net);

-   err = devlink->ops->reload_up(devlink, extack);
+   

Re: [PATCH net-next RFC v4 01/15] devlink: Add reload action option to devlink reload command

2020-09-15 Thread Jiri Pirko
Tue, Sep 15, 2020 at 12:06:19AM CEST, michael.c...@broadcom.com wrote:
>On Mon, Sep 14, 2020 at 2:31 PM Jakub Kicinski  wrote:
>>
>> On Mon, 14 Sep 2020 13:28:29 +0200 Jiri Pirko wrote:
>> > >> Instead, why don't you block in reload_up() until the reset is complete?
>> > >
>> > >Though user initiate "devlink dev reload" event on a single interface,
>> > >all driver entities undergo reset and all entities recover
>> > >independently. I don't think we can block the reload_up() on the
>> > >interface(that user initiated the command), until whole reset is
>> > >complete.
>> >
>> > Why not? mlxsw reset takes up to like 10 seconds for example.
>>
>> +1, why?
>
>Yes, we should be able to block until the reset sequence is complete.
>I don't see any problem.  I will work with Vasundhara on this.

Could you please also remove fw_reset as it is apparently misuse of
devlink health mechanism?

Thanks!


Re: [PATCH net-next RFC v4 01/15] devlink: Add reload action option to devlink reload command

2020-09-14 Thread Michael Chan
On Mon, Sep 14, 2020 at 2:31 PM Jakub Kicinski  wrote:
>
> On Mon, 14 Sep 2020 13:28:29 +0200 Jiri Pirko wrote:
> > >> Instead, why don't you block in reload_up() until the reset is complete?
> > >
> > >Though user initiate "devlink dev reload" event on a single interface,
> > >all driver entities undergo reset and all entities recover
> > >independently. I don't think we can block the reload_up() on the
> > >interface(that user initiated the command), until whole reset is
> > >complete.
> >
> > Why not? mlxsw reset takes up to like 10 seconds for example.
>
> +1, why?

Yes, we should be able to block until the reset sequence is complete.
I don't see any problem.  I will work with Vasundhara on this.


Re: [PATCH net-next RFC v4 01/15] devlink: Add reload action option to devlink reload command

2020-09-14 Thread Jakub Kicinski
On Mon, 14 Sep 2020 09:07:48 +0300 Moshe Shemesh wrote:
> @@ -3011,12 +3060,41 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, 
> struct genl_info *info)
>   return PTR_ERR(dest_net);
>   }
>  
> - err = devlink_reload(devlink, dest_net, info->extack);
> + if (info->attrs[DEVLINK_ATTR_RELOAD_ACTION])
> + action = nla_get_u8(info->attrs[DEVLINK_ATTR_RELOAD_ACTION]);
> + else
> + action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT;
> +
> + if (action == DEVLINK_RELOAD_ACTION_UNSPEC || action > 
> DEVLINK_RELOAD_ACTION_MAX) {
> + NL_SET_ERR_MSG_MOD(info->extack, "Invalid reload action");
> + return -EINVAL;
> + } else if (!devlink_reload_action_is_supported(devlink, action)) {
> + NL_SET_ERR_MSG_MOD(info->extack, "Requested reload action is 
> not supported");
> + return -EOPNOTSUPP;
> + }
> +
> + err = devlink_reload(devlink, dest_net, action, info->extack, 
> &actions_performed);
>  
>   if (dest_net)
>   put_net(dest_net);
>  
> - return err;
> + if (err)
> + return err;
> +
> + WARN_ON(!actions_performed);
> + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
> + if (!msg)
> + return -ENOMEM;
> +
> + err = devlink_nl_reload_actions_performed_fill(msg, devlink, 
> actions_performed,
> +DEVLINK_CMD_RELOAD, 
> info->snd_portid,
> +info->snd_seq, 0);
> + if (err) {
> + nlmsg_free(msg);
> + return err;
> + }
> +
> + return genlmsg_reply(msg, info);

I think generating the reply may break existing users. Only generate
the reply if request contained DEVLINK_ATTR_RELOAD_ACTION (or any other
new attribute which existing users can't pass).


Re: [PATCH net-next RFC v4 01/15] devlink: Add reload action option to devlink reload command

2020-09-14 Thread Jakub Kicinski
On Mon, 14 Sep 2020 13:28:29 +0200 Jiri Pirko wrote:
> Mon, Sep 14, 2020 at 11:54:55AM CEST, vasundhara-v.vo...@broadcom.com wrote:
> >On Mon, Sep 14, 2020 at 3:02 PM Jiri Pirko  wrote:  
> >> >> +mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink, enum 
> >> >> devlink_reload_action action,
> >> >> +   struct netlink_ext_ack *extack,
> >> >> +   unsigned long 
> >> >> *actions_performed)  
> >> >Sorry for repeating again, for fw_activate action on our device, all
> >> >the driver entities undergo reset asynchronously once user initiates
> >> >"devlink dev reload action fw_activate" and reload_up does not have
> >> >much to do except reporting actions that will be/being performed.
> >> >
> >> >Once reset is complete, the health reporter will be notified using  
> >>
> >> Hmm, how is the fw reset related to health reporter recovery? Recovery
> >> happens after some error event. I don't believe it is wise to mix it.  
> >Our device has a fw_reset health reporter, which is updated on reset
> >events and firmware activation is one among them. All non-fatal
> >firmware reset events are reported on fw_reset health reporter.  
> 
> Hmm, interesting. In that case, assuming this is fine, should we have
> some standard in this. I mean, if the driver supports reset, should it
> also define the "fw_reset" reporter to report such events?
> 
> Jakub, what is your take here?

Sounds doubly wrong to me.

As you say health reporters should trigger on error events,
communicating completion of an action requested by the user
seems very wrong. IIUC operators should monitor and collect
health failures. In this case looks like all events from fw_reset 
would need to be discarded, since they are not meaningful
without the context of what triggered them.

And secondly, reporting the completion via some async mechanism
that user has to monitor is just plain lazy. That's pushing out
the work that has to be done out to user space. Wait for the 
completion in the driver.

> >> Instead, why don't you block in reload_up() until the reset is complete?  
> >
> >Though user initiate "devlink dev reload" event on a single interface,
> >all driver entities undergo reset and all entities recover
> >independently. I don't think we can block the reload_up() on the
> >interface(that user initiated the command), until whole reset is
> >complete.  
> 
> Why not? mlxsw reset takes up to like 10 seconds for example.

+1, why?


Re: [PATCH net-next RFC v4 01/15] devlink: Add reload action option to devlink reload command

2020-09-14 Thread Jiri Pirko
Mon, Sep 14, 2020 at 08:07:48AM CEST, mo...@mellanox.com wrote:
>Add devlink reload action to allow the user to request a specific reload
>action. The action parameter is optional, if not specified then devlink
>driver re-init action is used (backward compatible).
>Note that when required to do firmware activation some drivers may need
>to reload the driver. On the other hand some drivers may need to reset
>the firmware to reinitialize the driver entities. Therefore, the devlink
>reload command returns the actions which were actually performed.
>Reload actions supported are:
>driver_reinit: driver entities re-initialization, applying devlink-param
>   and devlink-resource values.
>fw_activate: firmware activate.
>
>command examples:
>$devlink dev reload pci/0000:82:00.0 action driver_reinit
>reload_actions_performed:
>  driver_reinit
>
>$devlink dev reload pci/0000:82:00.0 action fw_activate
>reload_actions_performed:
>  driver_reinit fw_activate
>
>Signed-off-by: Moshe Shemesh 
>---
>v3 -> v4:
>- Removed fw_activate_no_reset as an action (next patch adds limit
>  levels instead).
>- Renamed actions_done to actions_performed
>v2 -> v3:
>- Replace fw_live_patch action by fw_activate_no_reset
>- Devlink reload returns the actions done over netlink reply
>v1 -> v2:
>- Instead of reload levels driver,fw_reset,fw_live_patch have reload
>  actions driver_reinit,fw_activate,fw_live_patch
>- Remove driver default level, the action driver_reinit is the default
>  action for all drivers
>---
> drivers/net/ethernet/mellanox/mlx4/main.c |  14 ++-
> .../net/ethernet/mellanox/mlx5/core/devlink.c |  15 ++-
> drivers/net/ethernet/mellanox/mlxsw/core.c|  25 ++--
> drivers/net/netdevsim/dev.c   |  16 ++-
> include/net/devlink.h |   7 +-
> include/uapi/linux/devlink.h  |  19 +++
> net/core/devlink.c| 111 +-
> 7 files changed, 180 insertions(+), 27 deletions(-)
>
>diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c 
>b/drivers/net/ethernet/mellanox/mlx4/main.c
>index 70cf24ba71e4..aadf1676a0ed 100644
>--- a/drivers/net/ethernet/mellanox/mlx4/main.c
>+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
>@@ -3946,6 +3946,7 @@ static int mlx4_restart_one_up(struct pci_dev *pdev, 
>bool reload,
>  struct devlink *devlink);
> 
> static int mlx4_devlink_reload_down(struct devlink *devlink, bool 
> netns_change,
>+  enum devlink_reload_action action,
>   struct netlink_ext_ack *extack)
> {
>   struct mlx4_priv *priv = devlink_priv(devlink);
>@@ -3962,8 +3963,8 @@ static int mlx4_devlink_reload_down(struct devlink 
>*devlink, bool netns_change,
>   return 0;
> }
> 
>-static int mlx4_devlink_reload_up(struct devlink *devlink,
>-struct netlink_ext_ack *extack)
>+static int mlx4_devlink_reload_up(struct devlink *devlink, enum 
>devlink_reload_action action,
>+struct netlink_ext_ack *extack, unsigned long 
>*actions_performed)
> {
>   struct mlx4_priv *priv = devlink_priv(devlink);
>   struct mlx4_dev *dev = &priv->dev;
>@@ -3971,15 +3972,20 @@ static int mlx4_devlink_reload_up(struct devlink 
>*devlink,
>   int err;
> 
>   err = mlx4_restart_one_up(persist->pdev, true, devlink);
>-  if (err)
>+  if (err) {
>   mlx4_err(persist->dev, "mlx4_restart_one_up failed, ret=%d\n",
>err);
>+  return err;
>+  }
>+  if (actions_performed)

Nit, pass the unsigned long always (even when it would be unused) and
avoid check in every driver.


>+  *actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
> 
>-  return err;
>+  return 0;
> }
> 
> static const struct devlink_ops mlx4_devlink_ops = {
>   .port_type_set  = mlx4_devlink_port_type_set,
>+  .supported_reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT),
>   .reload_down= mlx4_devlink_reload_down,
>   .reload_up  = mlx4_devlink_reload_up,
> };

[..]


>@@ -2969,29 +2975,72 @@ bool devlink_is_reload_failed(const struct devlink 
>*devlink)
> EXPORT_SYMBOL_GPL(devlink_is_reload_failed);
> 
> static int devlink_reload(struct devlink *devlink, struct net *dest_net,
>-struct netlink_ext_ack *extack)
>+enum devlink_reload_action action, struct 
>netlink_ext_ack *extack,
>+unsigned long *actions_performed)
> {
>   int err;
> 
>   if (!devlink->reload_enabled)
>   return -EOPNOTSUPP;
> 
>-  err = devlink->ops->reload_down(devlink, !!dest_net, extack);
>+  err = devlink->ops->reload_down(devlink, !!dest_net, action, extack);
>   if (err)
>   return err;
> 
>   if (dest_net && !net_eq(dest_net, devlink_net(devlink)))
>   devlink_reload_netns_change(devlink, dest_net);
> 

Re: [PATCH net-next RFC v4 01/15] devlink: Add reload action option to devlink reload command

2020-09-14 Thread Jiri Pirko
Mon, Sep 14, 2020 at 11:54:55AM CEST, vasundhara-v.vo...@broadcom.com wrote:
>On Mon, Sep 14, 2020 at 3:02 PM Jiri Pirko  wrote:
>>
>> Mon, Sep 14, 2020 at 09:08:58AM CEST, vasundhara-v.vo...@broadcom.com wrote:
>> >On Mon, Sep 14, 2020 at 11:39 AM Moshe Shemesh  wrote:
>>
>> [...]
>>
>>
>> >> @@ -1126,15 +1126,24 @@ mlxsw_devlink_core_bus_device_reload_down(struct 
>> >> devlink *devlink,
>> >>  }
>> >>
>> >>  static int
>> >> -mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink,
>> >> -   struct netlink_ext_ack *extack)
>> >> +mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink, enum 
>> >> devlink_reload_action action,
>> >> +   struct netlink_ext_ack *extack,
>> >> +   unsigned long *actions_performed)
>> >Sorry for repeating again, for fw_activate action on our device, all
>> >the driver entities undergo reset asynchronously once user initiates
>> >"devlink dev reload action fw_activate" and reload_up does not have
>> >much to do except reporting actions that will be/being performed.
>> >
>> >Once reset is complete, the health reporter will be notified using
>>
>> Hmm, how is the fw reset related to health reporter recovery? Recovery
>> happens after some error event. I don't believe it is wise to mix it.
>Our device has a fw_reset health reporter, which is updated on reset
>events and firmware activation is one among them. All non-fatal
>firmware reset events are reported on fw_reset health reporter.

Hmm, interesting. In that case, assuming this is fine, should we have
some standard in this. I mean, if the driver supports reset, should it
also define the "fw_reset" reporter to report such events?

Jakub, what is your take here?


>
>>
>> Instead, why don't you block in reload_up() until the reset is complete?
>
>Though user initiate "devlink dev reload" event on a single interface,
>all driver entities undergo reset and all entities recover
>independently. I don't think we can block the reload_up() on the
>interface(that user initiated the command), until whole reset is
>complete.

Why not? mlxsw reset takes up to like 10 seconds for example.


>>
>>
>> >devlink_health_reporter_recovery_done(). Returning from reload_up does
>> >not guarantee successful activation of firmware. Status of reset will
>> >be notified to the health reporter via
>> >devlink_health_reporter_state_update().
>> >
>> >I am just repeating this, so I want to know if I am on the same page.
>> >
>> >Thanks.
>>
>> [...]


Re: [PATCH net-next RFC v4 01/15] devlink: Add reload action option to devlink reload command

2020-09-14 Thread Vasundhara Volam
On Mon, Sep 14, 2020 at 3:02 PM Jiri Pirko  wrote:
>
> Mon, Sep 14, 2020 at 09:08:58AM CEST, vasundhara-v.vo...@broadcom.com wrote:
> >On Mon, Sep 14, 2020 at 11:39 AM Moshe Shemesh  wrote:
>
> [...]
>
>
> >> @@ -1126,15 +1126,24 @@ mlxsw_devlink_core_bus_device_reload_down(struct 
> >> devlink *devlink,
> >>  }
> >>
> >>  static int
> >> -mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink,
> >> -   struct netlink_ext_ack *extack)
> >> +mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink, enum 
> >> devlink_reload_action action,
> >> +   struct netlink_ext_ack *extack,
> >> +   unsigned long *actions_performed)
> >Sorry for repeating again, for fw_activate action on our device, all
> >the driver entities undergo reset asynchronously once user initiates
> >"devlink dev reload action fw_activate" and reload_up does not have
> >much to do except reporting actions that will be/being performed.
> >
> >Once reset is complete, the health reporter will be notified using
>
> Hmm, how is the fw reset related to health reporter recovery? Recovery
> happens after some error event. I don't believe it is wise to mix it.
Our device has a fw_reset health reporter, which is updated on reset
events and firmware activation is one among them. All non-fatal
firmware reset events are reported on fw_reset health reporter.

>
> Instead, why don't you block in reload_up() until the reset is complete?

Though the user initiates the "devlink dev reload" event on a single
interface, all driver entities undergo reset and all entities recover
independently. I don't think we can block reload_up() on the
interface (that the user initiated the command on) until the whole reset
is complete.
>
>
> >devlink_health_reporter_recovery_done(). Returning from reload_up does
> >not guarantee successful activation of firmware. Status of reset will
> >be notified to the health reporter via
> >devlink_health_reporter_state_update().
> >
> >I am just repeating this, so I want to know if I am on the same page.
> >
> >Thanks.
>
> [...]


Re: [PATCH net-next RFC v4 01/15] devlink: Add reload action option to devlink reload command

2020-09-14 Thread Jiri Pirko
Mon, Sep 14, 2020 at 09:08:58AM CEST, vasundhara-v.vo...@broadcom.com wrote:
>On Mon, Sep 14, 2020 at 11:39 AM Moshe Shemesh  wrote:

[...]


>> @@ -1126,15 +1126,24 @@ mlxsw_devlink_core_bus_device_reload_down(struct 
>> devlink *devlink,
>>  }
>>
>>  static int
>> -mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink,
>> -   struct netlink_ext_ack *extack)
>> +mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink, enum 
>> devlink_reload_action action,
>> +   struct netlink_ext_ack *extack,
>> +   unsigned long *actions_performed)
>Sorry for repeating again, for fw_activate action on our device, all
>the driver entities undergo reset asynchronously once user initiates
>"devlink dev reload action fw_activate" and reload_up does not have
>much to do except reporting actions that will be/being performed.
>
>Once reset is complete, the health reporter will be notified using

Hmm, how is the fw reset related to health reporter recovery? Recovery
happens after some error event. I don't believe it is wise to mix it.

Instead, why don't you block in reload_up() until the reset is complete?


>devlink_health_reporter_recovery_done(). Returning from reload_up does
>not guarantee successful activation of firmware. Status of reset will
>be notified to the health reporter via
>devlink_health_reporter_state_update().
>
>I am just repeating this, so I want to know if I am on the same page.
>
>Thanks.

[...]


Re: [PATCH net-next RFC v4 01/15] devlink: Add reload action option to devlink reload command

2020-09-14 Thread Vasundhara Volam
On Mon, Sep 14, 2020 at 11:39 AM Moshe Shemesh  wrote:
>
> Add devlink reload action to allow the user to request a specific reload
> action. The action parameter is optional, if not specified then devlink
> driver re-init action is used (backward compatible).
> Note that when required to do firmware activation some drivers may need
> to reload the driver. On the other hand some drivers may need to reset
> the firmware to reinitialize the driver entities. Therefore, the devlink
> reload command returns the actions which were actually performed.
> Reload actions supported are:
> driver_reinit: driver entities re-initialization, applying devlink-param
>and devlink-resource values.
> fw_activate: firmware activate.
>
> command examples:
> $devlink dev reload pci/0000:82:00.0 action driver_reinit
> reload_actions_performed:
>   driver_reinit
>
> $devlink dev reload pci/0000:82:00.0 action fw_activate
> reload_actions_performed:
>   driver_reinit fw_activate
>
> Signed-off-by: Moshe Shemesh 
> ---
> v3 -> v4:
> - Removed fw_activate_no_reset as an action (next patch adds limit
>   levels instead).
> - Renamed actions_done to actions_performed
> v2 -> v3:
> - Replace fw_live_patch action by fw_activate_no_reset
> - Devlink reload returns the actions done over netlink reply
> v1 -> v2:
> - Instead of reload levels driver,fw_reset,fw_live_patch have reload
>   actions driver_reinit,fw_activate,fw_live_patch
> - Remove driver default level, the action driver_reinit is the default
>   action for all drivers
> ---
>  drivers/net/ethernet/mellanox/mlx4/main.c |  14 ++-
>  .../net/ethernet/mellanox/mlx5/core/devlink.c |  15 ++-
>  drivers/net/ethernet/mellanox/mlxsw/core.c|  25 ++--
>  drivers/net/netdevsim/dev.c   |  16 ++-
>  include/net/devlink.h |   7 +-
>  include/uapi/linux/devlink.h  |  19 +++
>  net/core/devlink.c| 111 +-
>  7 files changed, 180 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c 
> b/drivers/net/ethernet/mellanox/mlx4/main.c
> index 70cf24ba71e4..aadf1676a0ed 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/main.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/main.c
> @@ -3946,6 +3946,7 @@ static int mlx4_restart_one_up(struct pci_dev *pdev, 
> bool reload,
>struct devlink *devlink);
>
>  static int mlx4_devlink_reload_down(struct devlink *devlink, bool 
> netns_change,
> +   enum devlink_reload_action action,
> struct netlink_ext_ack *extack)
>  {
> struct mlx4_priv *priv = devlink_priv(devlink);
> @@ -3962,8 +3963,8 @@ static int mlx4_devlink_reload_down(struct devlink 
> *devlink, bool netns_change,
> return 0;
>  }
>
> -static int mlx4_devlink_reload_up(struct devlink *devlink,
> - struct netlink_ext_ack *extack)
> +static int mlx4_devlink_reload_up(struct devlink *devlink, enum 
> devlink_reload_action action,
> + struct netlink_ext_ack *extack, unsigned 
> long *actions_performed)
>  {
> struct mlx4_priv *priv = devlink_priv(devlink);
> struct mlx4_dev *dev = &priv->dev;
> @@ -3971,15 +3972,20 @@ static int mlx4_devlink_reload_up(struct devlink 
> *devlink,
> int err;
>
> err = mlx4_restart_one_up(persist->pdev, true, devlink);
> -   if (err)
> +   if (err) {
> mlx4_err(persist->dev, "mlx4_restart_one_up failed, ret=%d\n",
>  err);
> +   return err;
> +   }
> +   if (actions_performed)
> +   *actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
>
> -   return err;
> +   return 0;
>  }
>
>  static const struct devlink_ops mlx4_devlink_ops = {
> .port_type_set  = mlx4_devlink_port_type_set,
> +   .supported_reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT),
> .reload_down= mlx4_devlink_reload_down,
> .reload_up  = mlx4_devlink_reload_up,
>  };
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c 
> b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
> index c709e9a385f6..9cd6b6c884e3 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
> @@ -89,6 +89,7 @@ mlx5_devlink_info_get(struct devlink *devlink, struct 
> devlink_info_req *req,
>  }
>
>  static int mlx5_devlink_reload_down(struct devlink *devlink, bool 
> netns_change,
> +   enum devlink_reload_action action,
> struct netlink_ext_ack *extack)
>  {
> struct mlx5_core_dev *dev = devlink_priv(devlink);
> @@ -97,12 +98,19 @@ static int mlx5_devlink_reload_down(struct devlink 
> *devlink, bool netns_change,
> return 0;
>  }
>
> -static int mlx5_devlink_reload_up(struct devlink