Re: [PATCH net-next v2 03/16] devlink: Add devlink reload limit option

2020-10-07 Thread Vasundhara Volam
On Wed, Oct 7, 2020 at 11:32 AM Moshe Shemesh  wrote:
>
> Add reload limit to demand restrictions on reload actions.
> Reload limits supported:
> no_reset: No reset allowed, no down time allowed, no link flap and no
>   configuration is lost.
>
> By default reload limit is unspecified and so no constraints on reload
> actions are required.
>
> Some combinations of action and limit are invalid. For example, driver
> can not reinitialize its entities without any downtime.
>
> The no_reset reload limit will have usecase in this patchset to
> implement restricted fw_activate on mlx5.
>
> Have the uapi parameter of reload limit ready for future support of
> multiselection.
>
> Signed-off-by: Moshe Shemesh 
> ---
> v1 -> v2:
> - Changed limit uapi parameter to bitfield32 for future support of
>   multiselection
> - Fixed reverse xmas tree
> RFCv5 -> v1:
> - Renamed supported_reload_actions_limit_levels to reload_limits
> - Renamed reload_action_limit_level to reload_limit
> - Change RELOAD_LIMIT_NONE to unspecified RELOAD_LIMIT_UNSPEC,
>   drivers don't need to declare support limits if they support only
>   no limitation
> - Use nla_poilcy range validation and remove the range check in
> devlink_nl_cmd_reload
> RFCv4 -> RFCv5:
> - Remove check DEVLINK_RELOAD_ACTION_LIMIT_LEVEL_MAX
> - Added list of invalid action-limit_level combinations and add check to
>   supported actions and levels and check user request
> RFCv3 -> RFCv4:
> - New patch
> ---
>  drivers/net/ethernet/mellanox/mlx4/main.c |  4 +-
>  .../net/ethernet/mellanox/mlx5/core/devlink.c |  4 +-
>  drivers/net/ethernet/mellanox/mlxsw/core.c|  4 +-
>  drivers/net/netdevsim/dev.c   |  6 +-
>  include/net/devlink.h |  8 +-
>  include/uapi/linux/devlink.h  | 14 +++
>  net/core/devlink.c| 92 +--
>  7 files changed, 119 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c 
> b/drivers/net/ethernet/mellanox/mlx4/main.c
> index 649c5323cf9f..c326b434734e 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/main.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/main.c
> @@ -3947,6 +3947,7 @@ static int mlx4_restart_one_up(struct pci_dev *pdev, 
> bool reload,
>
>  static int mlx4_devlink_reload_down(struct devlink *devlink, bool 
> netns_change,
> enum devlink_reload_action action,
> +   enum devlink_reload_limit limit,
> struct netlink_ext_ack *extack)
>  {
> struct mlx4_priv *priv = devlink_priv(devlink);
I don't see any check for limit. If users provide a limit that is not
supported by the driver, it seems to be simply ignored. Is it checked
somewhere else?


> @@ -3964,7 +3965,8 @@ static int mlx4_devlink_reload_down(struct devlink 
> *devlink, bool netns_change,
>  }
>
>  static int mlx4_devlink_reload_up(struct devlink *devlink, enum 
> devlink_reload_action action,
> - u32 *actions_performed, struct 
> netlink_ext_ack *extack)
> + enum devlink_reload_limit limit, u32 
> *actions_performed,
> + struct netlink_ext_ack *extack)
>  {
> struct mlx4_priv *priv = devlink_priv(devlink);
> struct mlx4_dev *dev = >dev;
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c 
> b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
> index 1b248c01a209..0016041e8779 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
> @@ -86,6 +86,7 @@ mlx5_devlink_info_get(struct devlink *devlink, struct 
> devlink_info_req *req,
>
>  static int mlx5_devlink_reload_down(struct devlink *devlink, bool 
> netns_change,
> enum devlink_reload_action action,
> +   enum devlink_reload_limit limit,
> struct netlink_ext_ack *extack)
>  {
> struct mlx5_core_dev *dev = devlink_priv(devlink);
> @@ -95,7 +96,8 @@ static int mlx5_devlink_reload_down(struct devlink 
> *devlink, bool netns_change,
>  }
>
>  static int mlx5_devlink_reload_up(struct devlink *devlink, enum 
> devlink_reload_action action,
> - u32 *actions_performed, struct 
> netlink_ext_ack *extack)
> + enum devlink_reload_limit limit, u32 
> *actions_performed,
> + struct netlink_ext_ack *extack)
>  {
> struct mlx5_core_dev *dev = devlink_priv(devlink);
>
> diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c 
> b/drivers/net/ethernet/mellanox/mlxsw/core.c
> index cd9f56c73827..7f77c2a71d1c 100644
> --- a/drivers/net/ethernet/mellanox/mlxsw/core.c
> +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
> @@ -1415,6 +1415,7 @@ mlxsw_devlink_info_get(struct devlink *devlink, struct 

Re: [PATCH net-next RFC v4 01/15] devlink: Add reload action option to devlink reload command

2020-09-14 Thread Vasundhara Volam
On Mon, Sep 14, 2020 at 3:02 PM Jiri Pirko  wrote:
>
> Mon, Sep 14, 2020 at 09:08:58AM CEST, vasundhara-v.vo...@broadcom.com wrote:
> >On Mon, Sep 14, 2020 at 11:39 AM Moshe Shemesh  wrote:
>
> [...]
>
>
> >> @@ -1126,15 +1126,24 @@ mlxsw_devlink_core_bus_device_reload_down(struct 
> >> devlink *devlink,
> >>  }
> >>
> >>  static int
> >> -mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink,
> >> -   struct netlink_ext_ack *extack)
> >> +mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink, enum 
> >> devlink_reload_action action,
> >> +   struct netlink_ext_ack *extack,
> >> +   unsigned long *actions_performed)
> >Sorry for repeating again, for fw_activate action on our device, all
> >the driver entities undergo reset asynchronously once user initiates
> >"devlink dev reload action fw_activate" and reload_up does not have
> >much to do except reporting actions that will be/being performed.
> >
> >Once reset is complete, the health reporter will be notified using
>
> Hmm, how is the fw reset related to health reporter recovery? Recovery
> happens after some error event. I don't believe it is wise to mix it.
Our device has a fw_reset health reporter, which is updated on reset
events and firmware activation is one among them. All non-fatal
firmware reset events are reported on fw_reset health reporter.

>
> Instead, why don't you block in reload_up() until the reset is complete?

Though user initiate "devlink dev reload" event on a single interface,
all driver entities undergo reset and all entities recover
independently. I don't think we can block the reload_up() on the
interface(that user initiated the command), until whole reset is
complete.
>
>
> >devlink_health_reporter_recovery_done(). Returning from reload_up does
> >not guarantee successful activation of firmware. Status of reset will
> >be notified to the health reporter via
> >devlink_health_reporter_state_update().
> >
> >I am just repeating this, so I want to know if I am on the same page.
> >
> >Thanks.
>
> [...]


Re: [PATCH net-next RFC v4 01/15] devlink: Add reload action option to devlink reload command

2020-09-14 Thread Vasundhara Volam
On Mon, Sep 14, 2020 at 11:39 AM Moshe Shemesh  wrote:
>
> Add devlink reload action to allow the user to request a specific reload
> action. The action parameter is optional, if not specified then devlink
> driver re-init action is used (backward compatible).
> Note that when required to do firmware activation some drivers may need
> to reload the driver. On the other hand some drivers may need to reset
> the firmware to reinitialize the driver entities. Therefore, the devlink
> reload command returns the actions which were actually performed.
> Reload actions supported are:
> driver_reinit: driver entities re-initialization, applying devlink-param
>and devlink-resource values.
> fw_activate: firmware activate.
>
> command examples:
> $devlink dev reload pci/:82:00.0 action driver_reinit
> reload_actions_performed:
>   driver_reinit
>
> $devlink dev reload pci/:82:00.0 action fw_activate
> reload_actions_performed:
>   driver_reinit fw_activate
>
> Signed-off-by: Moshe Shemesh 
> ---
> v3 -> v4:
> - Removed fw_activate_no_reset as an action (next patch adds limit
>   levels instead).
> - Renamed actions_done to actions_performed
> v2 -> v3:
> - Replace fw_live_patch action by fw_activate_no_reset
> - Devlink reload returns the actions done over netlink reply
> v1 -> v2:
> - Instead of reload levels driver,fw_reset,fw_live_patch have reload
>   actions driver_reinit,fw_activate,fw_live_patch
> - Remove driver default level, the action driver_reinit is the default
>   action for all drivers
> ---
>  drivers/net/ethernet/mellanox/mlx4/main.c |  14 ++-
>  .../net/ethernet/mellanox/mlx5/core/devlink.c |  15 ++-
>  drivers/net/ethernet/mellanox/mlxsw/core.c|  25 ++--
>  drivers/net/netdevsim/dev.c   |  16 ++-
>  include/net/devlink.h |   7 +-
>  include/uapi/linux/devlink.h  |  19 +++
>  net/core/devlink.c| 111 +-
>  7 files changed, 180 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c 
> b/drivers/net/ethernet/mellanox/mlx4/main.c
> index 70cf24ba71e4..aadf1676a0ed 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/main.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/main.c
> @@ -3946,6 +3946,7 @@ static int mlx4_restart_one_up(struct pci_dev *pdev, 
> bool reload,
>struct devlink *devlink);
>
>  static int mlx4_devlink_reload_down(struct devlink *devlink, bool 
> netns_change,
> +   enum devlink_reload_action action,
> struct netlink_ext_ack *extack)
>  {
> struct mlx4_priv *priv = devlink_priv(devlink);
> @@ -3962,8 +3963,8 @@ static int mlx4_devlink_reload_down(struct devlink 
> *devlink, bool netns_change,
> return 0;
>  }
>
> -static int mlx4_devlink_reload_up(struct devlink *devlink,
> - struct netlink_ext_ack *extack)
> +static int mlx4_devlink_reload_up(struct devlink *devlink, enum 
> devlink_reload_action action,
> + struct netlink_ext_ack *extack, unsigned 
> long *actions_performed)
>  {
> struct mlx4_priv *priv = devlink_priv(devlink);
> struct mlx4_dev *dev = >dev;
> @@ -3971,15 +3972,20 @@ static int mlx4_devlink_reload_up(struct devlink 
> *devlink,
> int err;
>
> err = mlx4_restart_one_up(persist->pdev, true, devlink);
> -   if (err)
> +   if (err) {
> mlx4_err(persist->dev, "mlx4_restart_one_up failed, ret=%d\n",
>  err);
> +   return err;
> +   }
> +   if (actions_performed)
> +   *actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
>
> -   return err;
> +   return 0;
>  }
>
>  static const struct devlink_ops mlx4_devlink_ops = {
> .port_type_set  = mlx4_devlink_port_type_set,
> +   .supported_reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT),
> .reload_down= mlx4_devlink_reload_down,
> .reload_up  = mlx4_devlink_reload_up,
>  };
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c 
> b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
> index c709e9a385f6..9cd6b6c884e3 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
> @@ -89,6 +89,7 @@ mlx5_devlink_info_get(struct devlink *devlink, struct 
> devlink_info_req *req,
>  }
>
>  static int mlx5_devlink_reload_down(struct devlink *devlink, bool 
> netns_change,
> +   enum devlink_reload_action action,
> struct netlink_ext_ack *extack)
>  {
> struct mlx5_core_dev *dev = devlink_priv(devlink);
> @@ -97,12 +98,19 @@ static int mlx5_devlink_reload_down(struct devlink 
> *devlink, bool netns_change,
> return 0;
>  }
>
> -static int mlx5_devlink_reload_up(struct devlink 

Re: [PATCH net-next RFC v3 01/14] devlink: Add reload action option to devlink reload command

2020-09-09 Thread Vasundhara Volam
On Wed, Sep 9, 2020 at 10:51 PM Moshe Shemesh  wrote:
>
>
> On 9/7/2020 8:58 PM, Jakub Kicinski wrote:
> > On Mon, 7 Sep 2020 16:46:01 +0300 Moshe Shemesh wrote:
> >>> In that sense I don't like --live because it doesn't really say much.
> >>> AFAIU it means 1) no link flap; 2) < 2 sec datapath downtime; 3) no
> >>> configuration is lost in kernel or device (including netdev config,
> >>> link config, flow rules, counters etc.). I was hoping at least the
> >>> documentation in patch 14 would be more precise.
> >> Actually, while writing "no-reset" or "live-patching" I meant also no
> >> downtime at all and nothing resets (config, rules ... anything), that
> >> fits mlx5 live-patching.
> >>
> >> However, to make it more generic,  I can allow few seconds downtime and
> >> add similar constrains as you mentioned here to "no-reset". I will add
> >> that to the documentation patch.
> > Oh! If your device supports no downtime and packet loss at all that's
> > great. You don't have to weaken the definition now, whoever needs a
> > weaker definition can add a different constraint level later, no?
>
>
> Yes, but if we are thinking there will be more levels, maybe the flag
> "--live" or "--no_reset" is less extendable, we may need new attr. I
> mean should I have uAPI command line like:
>
> $ devlink dev reload DEV [ netns { PID | NAME | ID } ] [ action {
> driver_reinit | fw_activate } [ limit_level  no_reset ] ]
>
This sounds good. As coming to our device, user can issue

$devlink dev reload DEV action fw_activate

which resets both firmware and driver entities to activate the new
firmware (either pending flashed firmware or reset current firmware).

Thanks for the patch series.


Re: [PATCH net-next RFC 00/13] Add devlink reload level option

2020-08-12 Thread Vasundhara Volam
On Wed, Aug 5, 2020 at 1:51 PM Moshe Shemesh  wrote:
>
>
> On 8/5/2020 9:55 AM, Vasundhara Volam wrote:
> > On Wed, Aug 5, 2020 at 12:02 PM Moshe Shemesh  wrote:
> >>
> >> On 8/4/2020 1:13 PM, Vasundhara Volam wrote:
> >>> On Mon, Aug 3, 2020 at 7:23 PM Moshe Shemesh  wrote:
> >>>> On 8/3/2020 3:47 PM, Vasundhara Volam wrote:
> >>>>> On Mon, Aug 3, 2020 at 5:47 PM Moshe Shemesh  wrote:
> >>>>>> On 8/3/2020 1:24 PM, Vasundhara Volam wrote:
> >>>>>>> On Tue, Jul 28, 2020 at 10:13 PM Jacob Keller 
> >>>>>>>  wrote:
> >>>>>>>> On 7/27/2020 10:25 PM, Vasundhara Volam wrote:
> >>>>>>>>> On Mon, Jul 27, 2020 at 4:36 PM Moshe Shemesh  
> >>>>>>>>> wrote:
> >>>>>>>>>> Introduce new option on devlink reload API to enable the user to 
> >>>>>>>>>> select the
> >>>>>>>>>> reload level required. Complete support for all levels in mlx5.
> >>>>>>>>>> The following reload levels are supported:
> >>>>>>>>>>   driver: Driver entities re-instantiation only.
> >>>>>>>>>>   fw_reset: Firmware reset and driver entities 
> >>>>>>>>>> re-instantiation.
> >>>>>>>>> The Name is a little confusing. I think it should be renamed to
> >>>>>>>>> fw_live_reset (in which both firmware and driver entities are
> >>>>>>>>> re-instantiated).  For only fw_reset, the driver should not undergo
> >>>>>>>>> reset (it requires a driver reload for firmware to undergo reset).
> >>>>>>>>>
> >>>>>>>> So, I think the differentiation here is that "live_patch" doesn't 
> >>>>>>>> reset
> >>>>>>>> anything.
> >>>>>>> This seems similar to flashing the firmware and does not reset 
> >>>>>>> anything.
> >>>>>> The live patch is activating fw change without reset.
> >>>>>>
> >>>>>> It is not suitable for any fw change but fw gaps which don't require 
> >>>>>> reset.
> >>>>>>
> >>>>>> I can query the fw to check if the pending image change is suitable or
> >>>>>> require fw reset.
> >>>>> Okay.
> >>>>>>>>>>   fw_live_patch: Firmware live patching only.
> >>>>>>>>> This level is not clear. Is this similar to flashing??
> >>>>>>>>>
> >>>>>>>>> Also I have a basic query. The reload command is split into
> >>>>>>>>> reload_up/reload_down handlers (Please correct me if this behaviour 
> >>>>>>>>> is
> >>>>>>>>> changed with this patchset). What if the vendor specific driver does
> >>>>>>>>> not support up/down and needs only a single handler to fire a 
> >>>>>>>>> firmware
> >>>>>>>>> reset or firmware live reset command?
> >>>>>>>> In the "reload_down" handler, they would trigger the appropriate 
> >>>>>>>> reset,
> >>>>>>>> and quiesce anything that needs to be done. Then on reload up, it 
> >>>>>>>> would
> >>>>>>>> restore and bring up anything quiesced in the first stage.
> >>>>>>> Yes, I got the "reload_down" and "reload_up". Similar to the device
> >>>>>>> "remove" and "re-probe" respectively.
> >>>>>>>
> >>>>>>> But our requirement is a similar "ethtool reset" command, where
> >>>>>>> ethtool calls a single callback in driver and driver just sends a
> >>>>>>> firmware command for doing the reset. Once firmware receives the
> >>>>>>> command, it will initiate the reset of driver and firmware entities
> >>>>>>> asynchronously.
> >>>>>> It is similar to mlx5 case here for fw_reset. The driver triggers the 
> >>>>>> fw
> >>>>>> command to reset and all PFs drivers gets events to handle and do
> >

Re: [PATCH net-next RFC 00/13] Add devlink reload level option

2020-08-05 Thread Vasundhara Volam
On Wed, Aug 5, 2020 at 12:02 PM Moshe Shemesh  wrote:
>
>
> On 8/4/2020 1:13 PM, Vasundhara Volam wrote:
> > On Mon, Aug 3, 2020 at 7:23 PM Moshe Shemesh  wrote:
> >>
> >> On 8/3/2020 3:47 PM, Vasundhara Volam wrote:
> >>> On Mon, Aug 3, 2020 at 5:47 PM Moshe Shemesh  wrote:
> >>>> On 8/3/2020 1:24 PM, Vasundhara Volam wrote:
> >>>>> On Tue, Jul 28, 2020 at 10:13 PM Jacob Keller 
> >>>>>  wrote:
> >>>>>> On 7/27/2020 10:25 PM, Vasundhara Volam wrote:
> >>>>>>> On Mon, Jul 27, 2020 at 4:36 PM Moshe Shemesh  
> >>>>>>> wrote:
> >>>>>>>> Introduce new option on devlink reload API to enable the user to 
> >>>>>>>> select the
> >>>>>>>> reload level required. Complete support for all levels in mlx5.
> >>>>>>>> The following reload levels are supported:
> >>>>>>>>  driver: Driver entities re-instantiation only.
> >>>>>>>>  fw_reset: Firmware reset and driver entities re-instantiation.
> >>>>>>> The Name is a little confusing. I think it should be renamed to
> >>>>>>> fw_live_reset (in which both firmware and driver entities are
> >>>>>>> re-instantiated).  For only fw_reset, the driver should not undergo
> >>>>>>> reset (it requires a driver reload for firmware to undergo reset).
> >>>>>>>
> >>>>>> So, I think the differentiation here is that "live_patch" doesn't reset
> >>>>>> anything.
> >>>>> This seems similar to flashing the firmware and does not reset anything.
> >>>> The live patch is activating fw change without reset.
> >>>>
> >>>> It is not suitable for any fw change but fw gaps which don't require 
> >>>> reset.
> >>>>
> >>>> I can query the fw to check if the pending image change is suitable or
> >>>> require fw reset.
> >>> Okay.
> >>>>>>>>  fw_live_patch: Firmware live patching only.
> >>>>>>> This level is not clear. Is this similar to flashing??
> >>>>>>>
> >>>>>>> Also I have a basic query. The reload command is split into
> >>>>>>> reload_up/reload_down handlers (Please correct me if this behaviour is
> >>>>>>> changed with this patchset). What if the vendor specific driver does
> >>>>>>> not support up/down and needs only a single handler to fire a firmware
> >>>>>>> reset or firmware live reset command?
> >>>>>> In the "reload_down" handler, they would trigger the appropriate reset,
> >>>>>> and quiesce anything that needs to be done. Then on reload up, it would
> >>>>>> restore and bring up anything quiesced in the first stage.
> >>>>> Yes, I got the "reload_down" and "reload_up". Similar to the device
> >>>>> "remove" and "re-probe" respectively.
> >>>>>
> >>>>> But our requirement is a similar "ethtool reset" command, where
> >>>>> ethtool calls a single callback in driver and driver just sends a
> >>>>> firmware command for doing the reset. Once firmware receives the
> >>>>> command, it will initiate the reset of driver and firmware entities
> >>>>> asynchronously.
> >>>> It is similar to mlx5 case here for fw_reset. The driver triggers the fw
> >>>> command to reset and all PFs drivers gets events to handle and do
> >>>> re-initialization.  To fit it to the devlink reload_down and reload_up,
> >>>> I wait for the event handler to complete and it stops at driver unload
> >>>> to have the driver up by devlink reload_up. See patch 8 in this patchset.
> >>>>
> >>> Yes, I see reload_down is triggering the reset. In our driver, after
> >>> triggering the reset through a firmware command, reset is done in
> >>> another context as the driver initiates the reset only after receiving
> >>> an ASYNC event from the firmware.
> >>
> >> Same here.
> >>
> >>> Probably, we have to use reload_down() to send firmware command to
> >>> trigger reset and do nothing in reload_up.
> >> I had that in previous version, but it

Re: [PATCH net-next RFC 00/13] Add devlink reload level option

2020-08-04 Thread Vasundhara Volam
On Mon, Aug 3, 2020 at 7:23 PM Moshe Shemesh  wrote:
>
>
> On 8/3/2020 3:47 PM, Vasundhara Volam wrote:
> > On Mon, Aug 3, 2020 at 5:47 PM Moshe Shemesh  wrote:
> >>
> >> On 8/3/2020 1:24 PM, Vasundhara Volam wrote:
> >>> On Tue, Jul 28, 2020 at 10:13 PM Jacob Keller  
> >>> wrote:
> >>>>
> >>>> On 7/27/2020 10:25 PM, Vasundhara Volam wrote:
> >>>>> On Mon, Jul 27, 2020 at 4:36 PM Moshe Shemesh  
> >>>>> wrote:
> >>>>>> Introduce new option on devlink reload API to enable the user to 
> >>>>>> select the
> >>>>>> reload level required. Complete support for all levels in mlx5.
> >>>>>> The following reload levels are supported:
> >>>>>> driver: Driver entities re-instantiation only.
> >>>>>> fw_reset: Firmware reset and driver entities re-instantiation.
> >>>>> The Name is a little confusing. I think it should be renamed to
> >>>>> fw_live_reset (in which both firmware and driver entities are
> >>>>> re-instantiated).  For only fw_reset, the driver should not undergo
> >>>>> reset (it requires a driver reload for firmware to undergo reset).
> >>>>>
> >>>> So, I think the differentiation here is that "live_patch" doesn't reset
> >>>> anything.
> >>> This seems similar to flashing the firmware and does not reset anything.
> >>
> >> The live patch is activating fw change without reset.
> >>
> >> It is not suitable for any fw change but fw gaps which don't require reset.
> >>
> >> I can query the fw to check if the pending image change is suitable or
> >> require fw reset.
> > Okay.
> >>>>>> fw_live_patch: Firmware live patching only.
> >>>>> This level is not clear. Is this similar to flashing??
> >>>>>
> >>>>> Also I have a basic query. The reload command is split into
> >>>>> reload_up/reload_down handlers (Please correct me if this behaviour is
> >>>>> changed with this patchset). What if the vendor specific driver does
> >>>>> not support up/down and needs only a single handler to fire a firmware
> >>>>> reset or firmware live reset command?
> >>>> In the "reload_down" handler, they would trigger the appropriate reset,
> >>>> and quiesce anything that needs to be done. Then on reload up, it would
> >>>> restore and bring up anything quiesced in the first stage.
> >>> Yes, I got the "reload_down" and "reload_up". Similar to the device
> >>> "remove" and "re-probe" respectively.
> >>>
> >>> But our requirement is a similar "ethtool reset" command, where
> >>> ethtool calls a single callback in driver and driver just sends a
> >>> firmware command for doing the reset. Once firmware receives the
> >>> command, it will initiate the reset of driver and firmware entities
> >>> asynchronously.
> >>
> >> It is similar to mlx5 case here for fw_reset. The driver triggers the fw
> >> command to reset and all PFs drivers gets events to handle and do
> >> re-initialization.  To fit it to the devlink reload_down and reload_up,
> >> I wait for the event handler to complete and it stops at driver unload
> >> to have the driver up by devlink reload_up. See patch 8 in this patchset.
> >>
> > Yes, I see reload_down is triggering the reset. In our driver, after
> > triggering the reset through a firmware command, reset is done in
> > another context as the driver initiates the reset only after receiving
> > an ASYNC event from the firmware.
>
>
> Same here.
>
> >
> > Probably, we have to use reload_down() to send firmware command to
> > trigger reset and do nothing in reload_up.
> I had that in previous version, but its wrong to use devlink reload this
> way, so I added wait with timeout for the event handling to complete
> before unload_down function ends. See mlx5_fw_wait_fw_reset_done(). Also
> the event handler stops before load back to have that done by devlink
> reload_up.
But "devlink dev reload" will be invoked by the user only on a single
dev handler and all function drivers will be re-instantiated upon the
ASYNC event. reload_down and reload_up are invoked only the function
which the user invoked.

Take an example of a 2-port (PF0 and PF1) adapter on a single host and
with some VFs loaded on the device. User invokes "devlink dev reload"
on PF0, ASYNC event is received on 2 PFs and VFs for reset. All the
function drivers will be re-instantiated including PF0.

If we wait for some time in reload_down() of PF0 and then call load in
reload_up(), this code will be different from other function drivers.

> >   And returning from reload
> > does not mean that reset is complete as it is done in another context
> > and the driver notifies the health reporter once the reset is
> > complete. devlink framework may have to allow drivers to implement
> > reload_down only to look more clean or call reload_up only if the
> > driver notifies the devlink once reset is completed from another
> > context. Please suggest.


Re: [PATCH net-next RFC 00/13] Add devlink reload level option

2020-08-03 Thread Vasundhara Volam
On Mon, Aug 3, 2020 at 5:47 PM Moshe Shemesh  wrote:
>
>
> On 8/3/2020 1:24 PM, Vasundhara Volam wrote:
> > On Tue, Jul 28, 2020 at 10:13 PM Jacob Keller  
> > wrote:
> >>
> >>
> >> On 7/27/2020 10:25 PM, Vasundhara Volam wrote:
> >>> On Mon, Jul 27, 2020 at 4:36 PM Moshe Shemesh  wrote:
> >>>> Introduce new option on devlink reload API to enable the user to select 
> >>>> the
> >>>> reload level required. Complete support for all levels in mlx5.
> >>>> The following reload levels are supported:
> >>>>driver: Driver entities re-instantiation only.
> >>>>fw_reset: Firmware reset and driver entities re-instantiation.
> >>> The Name is a little confusing. I think it should be renamed to
> >>> fw_live_reset (in which both firmware and driver entities are
> >>> re-instantiated).  For only fw_reset, the driver should not undergo
> >>> reset (it requires a driver reload for firmware to undergo reset).
> >>>
> >> So, I think the differentiation here is that "live_patch" doesn't reset
> >> anything.
> > This seems similar to flashing the firmware and does not reset anything.
>
>
> The live patch is activating fw change without reset.
>
> It is not suitable for any fw change but fw gaps which don't require reset.
>
> I can query the fw to check if the pending image change is suitable or
> require fw reset.
Okay.
>
> >>>>fw_live_patch: Firmware live patching only.
> >>> This level is not clear. Is this similar to flashing??
> >>>
> >>> Also I have a basic query. The reload command is split into
> >>> reload_up/reload_down handlers (Please correct me if this behaviour is
> >>> changed with this patchset). What if the vendor specific driver does
> >>> not support up/down and needs only a single handler to fire a firmware
> >>> reset or firmware live reset command?
> >> In the "reload_down" handler, they would trigger the appropriate reset,
> >> and quiesce anything that needs to be done. Then on reload up, it would
> >> restore and bring up anything quiesced in the first stage.
> > Yes, I got the "reload_down" and "reload_up". Similar to the device
> > "remove" and "re-probe" respectively.
> >
> > But our requirement is a similar "ethtool reset" command, where
> > ethtool calls a single callback in driver and driver just sends a
> > firmware command for doing the reset. Once firmware receives the
> > command, it will initiate the reset of driver and firmware entities
> > asynchronously.
>
>
> It is similar to mlx5 case here for fw_reset. The driver triggers the fw
> command to reset and all PFs drivers gets events to handle and do
> re-initialization.  To fit it to the devlink reload_down and reload_up,
> I wait for the event handler to complete and it stops at driver unload
> to have the driver up by devlink reload_up. See patch 8 in this patchset.
>
Yes, I see reload_down is triggering the reset. In our driver, after
triggering the reset through a firmware command, reset is done in
another context as the driver initiates the reset only after receiving
an ASYNC event from the firmware.

Probably, we have to use reload_down() to send firmware command to
trigger reset and do nothing in reload_up. And returning from reload
does not mean that reset is complete as it is done in another context
and the driver notifies the health reporter once the reset is
complete. devlink framework may have to allow drivers to implement
reload_down only to look more clean or call reload_up only if the
driver notifies the devlink once reset is completed from another
context. Please suggest.


Re: [PATCH net-next RFC 00/13] Add devlink reload level option

2020-08-03 Thread Vasundhara Volam
On Tue, Jul 28, 2020 at 10:13 PM Jacob Keller  wrote:
>
>
>
> On 7/27/2020 10:25 PM, Vasundhara Volam wrote:
> > On Mon, Jul 27, 2020 at 4:36 PM Moshe Shemesh  wrote:
> >>
> >> Introduce new option on devlink reload API to enable the user to select the
> >> reload level required. Complete support for all levels in mlx5.
> >> The following reload levels are supported:
> >>   driver: Driver entities re-instantiation only.
> >>   fw_reset: Firmware reset and driver entities re-instantiation.
> > The Name is a little confusing. I think it should be renamed to
> > fw_live_reset (in which both firmware and driver entities are
> > re-instantiated).  For only fw_reset, the driver should not undergo
> > reset (it requires a driver reload for firmware to undergo reset).
> >
>
> So, I think the differentiation here is that "live_patch" doesn't reset
> anything.
This seems similar to flashing the firmware and does not reset anything.

>
> >>   fw_live_patch: Firmware live patching only.
> > This level is not clear. Is this similar to flashing??
> >
> > Also I have a basic query. The reload command is split into
> > reload_up/reload_down handlers (Please correct me if this behaviour is
> > changed with this patchset). What if the vendor specific driver does
> > not support up/down and needs only a single handler to fire a firmware
> > reset or firmware live reset command?
>
> In the "reload_down" handler, they would trigger the appropriate reset,
> and quiesce anything that needs to be done. Then on reload up, it would
> restore and bring up anything quiesced in the first stage.
Yes, I got the "reload_down" and "reload_up". Similar to the device
"remove" and "re-probe" respectively.

But our requirement is a similar "ethtool reset" command, where
ethtool calls a single callback in driver and driver just sends a
firmware command for doing the reset. Once firmware receives the
command, it will initiate the reset of driver and firmware entities
asynchronously.


Re: [PATCH net-next RFC 00/13] Add devlink reload level option

2020-07-27 Thread Vasundhara Volam
On Mon, Jul 27, 2020 at 4:36 PM Moshe Shemesh  wrote:
>
> Introduce new option on devlink reload API to enable the user to select the
> reload level required. Complete support for all levels in mlx5.
> The following reload levels are supported:
>   driver: Driver entities re-instantiation only.
>   fw_reset: Firmware reset and driver entities re-instantiation.
The Name is a little confusing. I think it should be renamed to
fw_live_reset (in which both firmware and driver entities are
re-instantiated).  For only fw_reset, the driver should not undergo
reset (it requires a driver reload for firmware to undergo reset).

>   fw_live_patch: Firmware live patching only.
This level is not clear. Is this similar to flashing??

Also I have a basic query. The reload command is split into
reload_up/reload_down handlers (Please correct me if this behaviour is
changed with this patchset). What if the vendor specific driver does
not support up/down and needs only a single handler to fire a firmware
reset or firmware live reset command?
>
> Each driver which support this command should expose the reload levels
> supported and the driver's default reload level.
> The uAPI is backward compatible, if the reload level option is omitted
> from the reload command, the driver's default reload level will be used.
>
> Patch 1 adds the new API reload level option to devlink.
> Patch 2 exposes the supported reload levels and default level on devlink
> dev get.
> Patches 3-8 add support on mlx5 for devlink reload level fw-reset and
> handle the firmware reset events.
> Patches 9-10 add devlink enable remote dev reset parameter and use it
>  in mlx5.
> Patches 11-12 mlx5 add devlink reload live patch support and event
>   handling.
> Patch 13 adds documentation file devlink-reload.rst
>
> Command examples:
>
> # Run reload command with fw-reset reload level:
> $ devlink dev reload pci/:82:00.0 level fw-reset
>
> # Run reload command with driver reload level:
> $ devlink dev reload pci/:82:00.0 level driver
>
> # Run reload command with driver's default level (backward compatible):
> $ devlink dev reload pci/:82:00.0
>
>
> Moshe Shemesh (13):
>   devlink: Add reload level option to devlink reload command
>   devlink: Add reload levels data to dev get
>   net/mlx5: Add functions to set/query MFRL register
>   net/mlx5: Set cap for pci sync for fw update event
>   net/mlx5: Handle sync reset request event
>   net/mlx5: Handle sync reset now event
>   net/mlx5: Handle sync reset abort event
>   net/mlx5: Add support for devlink reload level fw reset
>   devlink: Add enable_remote_dev_reset generic parameter
>   net/mlx5: Add devlink param enable_remote_dev_reset support
>   net/mlx5: Add support for fw live patch event
>   net/mlx5: Add support for devlink reload level live patch
>   devlink: Add Documentation/networking/devlink/devlink-reload.rst
>
>  .../networking/devlink/devlink-params.rst |   6 +
>  .../networking/devlink/devlink-reload.rst |  56 +++
>  Documentation/networking/devlink/index.rst|   1 +
>  drivers/net/ethernet/mellanox/mlx4/main.c |   6 +-
>  .../net/ethernet/mellanox/mlx5/core/Makefile  |   2 +-
>  .../net/ethernet/mellanox/mlx5/core/devlink.c | 114 +-
>  .../mellanox/mlx5/core/diag/fw_tracer.c   |  31 ++
>  .../mellanox/mlx5/core/diag/fw_tracer.h   |   1 +
>  .../ethernet/mellanox/mlx5/core/fw_reset.c| 328 ++
>  .../ethernet/mellanox/mlx5/core/fw_reset.h|  17 +
>  .../net/ethernet/mellanox/mlx5/core/health.c  |  74 +++-
>  .../net/ethernet/mellanox/mlx5/core/main.c|  13 +
>  drivers/net/ethernet/mellanox/mlxsw/core.c|   6 +-
>  drivers/net/netdevsim/dev.c   |   6 +-
>  include/linux/mlx5/device.h   |   1 +
>  include/linux/mlx5/driver.h   |  12 +
>  include/net/devlink.h |  10 +-
>  include/uapi/linux/devlink.h  |  22 ++
>  net/core/devlink.c|  95 -
>  19 files changed, 764 insertions(+), 37 deletions(-)
>  create mode 100644 Documentation/networking/devlink/devlink-reload.rst
>  create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
>  create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h
>
> --
> 2.17.1
>


Re: [PATCH v2 04/15] bnxt: use new module_firmware_crashed()

2020-05-15 Thread Vasundhara Volam
On Sat, May 16, 2020 at 3:00 AM Luis Chamberlain  wrote:
>
> This makes use of the new module_firmware_crashed() to help
> annotate when firmware for device drivers crash. When firmware
> crashes devices can sometimes become unresponsive, and recovery
> sometimes requires a driver unload / reload and in the worst cases
> a reboot.
>
> Using a taint flag allows us to annotate when this happens clearly.
>
> Cc: Michael Chan 
> Signed-off-by: Luis Chamberlain 
> ---
>  drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c 
> b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
> index dd0c3f227009..5ba1bd0734e9 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
> @@ -3503,6 +3503,7 @@ static int bnxt_get_dump_data(struct net_device *dev, 
> struct ethtool_dump *dump,
>
> dump->flag = bp->dump_flag;
> if (dump->flag == BNXT_DUMP_CRASH) {
> +   module_firmware_crashed();
This is not the right place to annotate the taint flag.

Here the driver is just copying the dump after error recovery which is collected
by firmware to DDR, when firmware detects fatal conditions. Driver and firmware
will be healthy when the user calls this command.

Also, users can call this command a thousand times when there is no crash.

I will propose a patch to use this wrapper in the error recovery path,
where the driver
may not be able to recover.

>  #ifdef CONFIG_TEE_BNXT_FW
>     return tee_bnxt_copy_coredump(buf, 0, dump->len);
>  #endif
> --
> 2.26.2
>
Nacked-by: Vasundhara Volam 


Re: [PATCH V2 3/3] bnxt_en: Add support to collect crash dump via ethtool

2019-10-20 Thread Vasundhara Volam
On Fri, Oct 18, 2019 at 10:31 PM Jakub Kicinski
 wrote:
>
> On Fri, 18 Oct 2019 12:04:35 +0530, Vasundhara Volam wrote:
> > On Fri, Oct 18, 2019 at 12:52 AM Jakub Kicinski wrote:
> > > On Thu, 17 Oct 2019 17:31:22 +0530, Sheetal Tigadoli wrote:
> > > > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c 
> > > > b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
> > > > index 51c1404..1596221 100644
> > > > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
> > > > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
> > > > @@ -3311,6 +3311,23 @@ static int bnxt_get_coredump(struct bnxt *bp, 
> > > > void *buf, u32 *dump_len)
> > > >   return rc;
> > > >  }
> > > >
> > > > +static int bnxt_set_dump(struct net_device *dev, struct ethtool_dump 
> > > > *dump)
> > > > +{
> > > > + struct bnxt *bp = netdev_priv(dev);
> > > > +
> > > > +#ifndef CONFIG_TEE_BNXT_FW
> > > > + return -EOPNOTSUPP;
> > > > +#endif
> > >
> > > if (!IS_ENABLED(...))
> > > return x;
> > >
> > > reads better IMHO
> > Okay.
> >
> > >
> > > But also you seem to be breaking live dump for systems with
> > > CONFIG_TEE_BNXT_FW=n
> > Yes, we are supporting set_dump only if crash dump is supported.
>
> It's wrong.
Sorry not very clear. You are saying that support set_dump all the
time and return
error, if the config option is not enabled? If yes, I will modify the
same way as it
makes sense.

>
> > > > + if (dump->flag > BNXT_DUMP_CRASH) {
> > > > + netdev_err(dev, "Supports only Live(0) and Crash(1) 
> > > > dumps.\n");
> > >
> > > more of an _info than _err, if at all
> > I made this err, as we are returning error on invalid flag value. I
> > can modify the log to
> > something like "Invalid dump flag. Supports only Live(0) and Crash(1)
> > dumps.\n" to make
> > it more like error log.
>
> Not an error.
Okay.


Re: [PATCH V2 3/3] bnxt_en: Add support to collect crash dump via ethtool

2019-10-18 Thread Vasundhara Volam
On Fri, Oct 18, 2019 at 12:52 AM Jakub Kicinski
 wrote:
>
> On Thu, 17 Oct 2019 17:31:22 +0530, Sheetal Tigadoli wrote:
> > From: Vasundhara Volam 
> >
> > Driver supports 2 types of core dumps.
> >
> > 1. Live dump - Firmware dump when system is up and running.
> > 2. Crash dump - Dump which is collected during firmware crash
> > that can be retrieved after recovery.
> > Crash dump is currently supported only on specific 58800 chips
> > which can be retrieved using OP-TEE API only, as firmware cannot
> > access this region directly.
> >
> > User needs to set the dump flag using following command before
> > initiating the dump collection:
> >
> > $ ethtool -W|--set-dump eth0 N
> >
> > Where N is "0" for live dump and "1" for crash dump
> >
> > Command to collect the dump after setting the flag:
> >
> > $ ethtool -w eth0 data Filename
> >
> > Cc: Michael Chan 
> > Signed-off-by: Vasundhara Volam 
> > Signed-off-by: Sheetal Tigadoli 
> > ---
> >  drivers/net/ethernet/broadcom/bnxt/bnxt.h |  3 ++
> >  drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 36 
> > +--
> >  drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.h |  2 ++
> >  3 files changed, 39 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h 
> > b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > index 0943715..3e7d1fb 100644
> > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > @@ -1807,6 +1807,9 @@ struct bnxt {
> >
> >   u8  num_leds;
> >   struct bnxt_led_infoleds[BNXT_MAX_LED];
> > + u16 dump_flag;
> > +#define BNXT_DUMP_LIVE   0
> > +#define BNXT_DUMP_CRASH  1
> >
> >   struct bpf_prog *xdp_prog;
> >
> > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c 
> > b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
> > index 51c1404..1596221 100644
> > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
> > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
> > @@ -3311,6 +3311,23 @@ static int bnxt_get_coredump(struct bnxt *bp, void 
> > *buf, u32 *dump_len)
> >   return rc;
> >  }
> >
> > +static int bnxt_set_dump(struct net_device *dev, struct ethtool_dump *dump)
> > +{
> > + struct bnxt *bp = netdev_priv(dev);
> > +
> > +#ifndef CONFIG_TEE_BNXT_FW
> > + return -EOPNOTSUPP;
> > +#endif
>
> if (!IS_ENABLED(...))
> return x;
>
> reads better IMHO
Okay.

>
> But also you seem to be breaking live dump for systems with
> CONFIG_TEE_BNXT_FW=n
Yes, we are supporting set_dump only if crash dump is supported.

>
> > + if (dump->flag > BNXT_DUMP_CRASH) {
> > + netdev_err(dev, "Supports only Live(0) and Crash(1) 
> > dumps.\n");
>
> more of an _info than _err, if at all
I made this err, as we are returning error on invalid flag value. I
can modify the log to
something like "Invalid dump flag. Supports only Live(0) and Crash(1)
dumps.\n" to make
it more like error log.

>
> > + return -EINVAL;
> > + }
> > +
> > + bp->dump_flag = dump->flag;
> > + return 0;
> > +}
> > +
> >  static int bnxt_get_dump_flag(struct net_device *dev, struct ethtool_dump 
> > *dump)
> >  {
> >   struct bnxt *bp = netdev_priv(dev);
> > @@ -3323,7 +3340,12 @@ static int bnxt_get_dump_flag(struct net_device 
> > *dev, struct ethtool_dump *dump)
> >   bp->ver_resp.hwrm_fw_bld_8b << 8 |
> >   bp->ver_resp.hwrm_fw_rsvd_8b;
> >
> > - return bnxt_get_coredump(bp, NULL, >len);
> > + dump->flag = bp->dump_flag;
> > + if (bp->dump_flag == BNXT_DUMP_CRASH)
> > + dump->len = BNXT_CRASH_DUMP_LEN;
> > + else
> > + bnxt_get_coredump(bp, NULL, >len);
> > + return 0;
> >  }
> >
> >  static int bnxt_get_dump_data(struct net_device *dev, struct ethtool_dump 
> > *dump,
> > @@ -3336,7 +3358,16 @@ static int bnxt_get_dump_data(struct net_device 
> > *dev, struct ethtool_dump *dump,
> >
> >   memset(buf, 0, dump->len);
> >
> > - return bnxt_get_coredump(bp, buf, >len);
> > + dump->flag = bp->dump_flag;
> > + if (dump->flag == BNXT_DUM