On Thu, Nov 17, 2022 at 04:09:36PM -0500, Alexander Motin wrote:
> Previous code reported "smart" state based on number of bits
> set in the module health field.  But actually any single bit
> set there already means critical failure.  Rework the logic
> according to specification, properly reporting non-critical
> state in case of warning threshold reached, critical in case
> of any module health bit set or error threshold reached and
> fatal if NVDIMM exhausted its life time.  In attempt to
> report the cause of failure in absence of better methods,
> report reached thresholds as more or less matching alarms.
> 
> While there clean up the code, making it more uniform with
> others and allowing more methods to be implemented later.

Hi Alexander,

Perhaps this would be better presented in 2 patches:
1)the cleanup and then 2) improvement of smart state reporting.

Alison

> 
> Signed-off-by: Alexander Motin <[email protected]>
> ---
>  ndctl/lib/msft.c | 111 ++++++++++++++++++++++++++++++++---------------
>  ndctl/lib/msft.h |  13 ++----
>  2 files changed, 79 insertions(+), 45 deletions(-)
> 
> diff --git a/ndctl/lib/msft.c b/ndctl/lib/msft.c
> index 3112799..8f66c97 100644
> --- a/ndctl/lib/msft.c
> +++ b/ndctl/lib/msft.c
> @@ -2,6 +2,7 @@
>  // Copyright (C) 2016-2017 Dell, Inc.
>  // Copyright (C) 2016 Hewlett Packard Enterprise Development LP
>  // Copyright (C) 2016-2020, Intel Corporation.
> +/* Copyright (C) 2022 iXsystems, Inc. */
>  #include <stdlib.h>
>  #include <limits.h>
>  #include <util/log.h>
> @@ -12,12 +13,39 @@
>  #define CMD_MSFT(_c) ((_c)->msft)
>  #define CMD_MSFT_SMART(_c) (CMD_MSFT(_c)->u.smart.data)
>  
> +static const char *msft_cmd_desc(int fn)
> +{
> +     static const char * const descs[] = {
> +             [NDN_MSFT_CMD_CHEALTH] = "critical_health",
> +             [NDN_MSFT_CMD_NHEALTH] = "nvdimm_health",
> +             [NDN_MSFT_CMD_EHEALTH] = "es_health",
> +     };
> +     const char *desc;
> +
> +     if (fn >= (int) ARRAY_SIZE(descs))
> +             return "unknown";
> +     desc = descs[fn];
> +     if (!desc)
> +             return "unknown";
> +     return desc;
> +}
> +
> +static bool msft_cmd_is_supported(struct ndctl_dimm *dimm, int cmd)
> +{
> +     /* Handle this separately to support monitor mode */
> +     if (cmd == ND_CMD_SMART)
> +             return true;
> +
> +     return !!(dimm->cmd_mask & (1ULL << cmd));
> +}
> +
>  static u32 msft_get_firmware_status(struct ndctl_cmd *cmd)
>  {
>       return cmd->msft->u.smart.status;
>  }
>  
> -static struct ndctl_cmd *msft_dimm_cmd_new_smart(struct ndctl_dimm *dimm)
> +static struct ndctl_cmd *alloc_msft_cmd(struct ndctl_dimm *dimm,
> +             unsigned int func, size_t in_size, size_t out_size)
>  {
>       struct ndctl_bus *bus = ndctl_dimm_get_bus(dimm);
>       struct ndctl_ctx *ctx = ndctl_bus_get_ctx(bus);
> @@ -30,12 +58,12 @@ static struct ndctl_cmd *msft_dimm_cmd_new_smart(struct 
> ndctl_dimm *dimm)
>               return NULL;
>       }
>  
> -     if (test_dimm_dsm(dimm, NDN_MSFT_CMD_SMART) == DIMM_DSM_UNSUPPORTED) {
> +     if (test_dimm_dsm(dimm, func) == DIMM_DSM_UNSUPPORTED) {
>               dbg(ctx, "unsupported function\n");
>               return NULL;
>       }
>  
> -     size = sizeof(*cmd) + sizeof(struct ndn_pkg_msft);
> +     size = sizeof(*cmd) + sizeof(struct nd_cmd_pkg) + in_size + out_size;
>       cmd = calloc(1, size);
>       if (!cmd)
>               return NULL;
> @@ -45,25 +73,30 @@ static struct ndctl_cmd *msft_dimm_cmd_new_smart(struct 
> ndctl_dimm *dimm)
>       cmd->type = ND_CMD_CALL;
>       cmd->size = size;
>       cmd->status = 1;
> +     cmd->get_firmware_status = msft_get_firmware_status;
>  
>       msft = CMD_MSFT(cmd);
>       msft->gen.nd_family = NVDIMM_FAMILY_MSFT;
> -     msft->gen.nd_command = NDN_MSFT_CMD_SMART;
> +     msft->gen.nd_command = func;
>       msft->gen.nd_fw_size = 0;
> -     msft->gen.nd_size_in = offsetof(struct ndn_msft_smart, status);
> -     msft->gen.nd_size_out = sizeof(msft->u.smart);
> +     msft->gen.nd_size_in = in_size;
> +     msft->gen.nd_size_out = out_size;
>       msft->u.smart.status = 0;
> -     cmd->get_firmware_status = msft_get_firmware_status;
>  
>       return cmd;
>  }
>  
> +static struct ndctl_cmd *msft_dimm_cmd_new_smart(struct ndctl_dimm *dimm)
> +{
> +     return (alloc_msft_cmd(dimm, NDN_MSFT_CMD_NHEALTH,
> +                     0, sizeof(struct ndn_msft_smart)));
> +}
> +
>  static int msft_smart_valid(struct ndctl_cmd *cmd)
>  {
>       if (cmd->type != ND_CMD_CALL ||
> -         cmd->size != sizeof(*cmd) + sizeof(struct ndn_pkg_msft) ||
>           CMD_MSFT(cmd)->gen.nd_family != NVDIMM_FAMILY_MSFT ||
> -         CMD_MSFT(cmd)->gen.nd_command != NDN_MSFT_CMD_SMART ||
> +         CMD_MSFT(cmd)->gen.nd_command != NDN_MSFT_CMD_NHEALTH ||
>           cmd->status != 0)
>               return cmd->status < 0 ? cmd->status : -EINVAL;
>       return 0;
> @@ -80,28 +113,33 @@ static unsigned int msft_cmd_smart_get_flags(struct 
> ndctl_cmd *cmd)
>       }
>  
>       /* below health data can be retrieved via MSFT _DSM function 11 */
> -     return NDN_MSFT_SMART_HEALTH_VALID |
> -             NDN_MSFT_SMART_TEMP_VALID |
> -             NDN_MSFT_SMART_USED_VALID;
> +     return ND_SMART_HEALTH_VALID | ND_SMART_TEMP_VALID |
> +         ND_SMART_USED_VALID | ND_SMART_ALARM_VALID;
>  }
>  
> -static unsigned int num_set_bit_health(__u16 num)
> +static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd)
>  {
> -     int i;
> -     __u16 n = num & 0x7FFF;
> -     unsigned int count = 0;
> +     unsigned int health = 0;
> +     int rc;
>  
> -     for (i = 0; i < 15; i++)
> -             if (!!(n & (1 << i)))
> -                     count++;
> +     rc = msft_smart_valid(cmd);
> +     if (rc < 0) {
> +             errno = -rc;
> +             return UINT_MAX;
> +     }
>  
> -     return count;
> +     if (CMD_MSFT_SMART(cmd)->nvm_lifetime == 0)
> +             health |= ND_SMART_FATAL_HEALTH;
> +     if (CMD_MSFT_SMART(cmd)->health != 0 ||
> +         CMD_MSFT_SMART(cmd)->err_thresh_stat != 0)
> +             health |= ND_SMART_CRITICAL_HEALTH;
> +     if (CMD_MSFT_SMART(cmd)->warn_thresh_stat != 0)
> +             health |= ND_SMART_NON_CRITICAL_HEALTH;
> +     return health;
>  }
>  
> -static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd)
> +static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd 
> *cmd)
>  {
> -     unsigned int health;
> -     unsigned int num;
>       int rc;
>  
>       rc = msft_smart_valid(cmd);
> @@ -110,21 +148,13 @@ static unsigned int msft_cmd_smart_get_health(struct 
> ndctl_cmd *cmd)
>               return UINT_MAX;
>       }
>  
> -     num = num_set_bit_health(CMD_MSFT_SMART(cmd)->health);
> -     if (num == 0)
> -             health = 0;
> -     else if (num < 2)
> -             health = ND_SMART_NON_CRITICAL_HEALTH;
> -     else if (num < 3)
> -             health = ND_SMART_CRITICAL_HEALTH;
> -     else
> -             health = ND_SMART_FATAL_HEALTH;
> -
> -     return health;
> +     return CMD_MSFT_SMART(cmd)->temp * 16;
>  }
>  
> -static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd 
> *cmd)
> +static unsigned int msft_cmd_smart_get_alarm_flags(struct ndctl_cmd *cmd)
>  {
> +     __u8 stat;
> +     unsigned int flags = 0;
>       int rc;
>  
>       rc = msft_smart_valid(cmd);
> @@ -133,7 +163,13 @@ static unsigned int 
> msft_cmd_smart_get_media_temperature(struct ndctl_cmd *cmd)
>               return UINT_MAX;
>       }
>  
> -     return CMD_MSFT_SMART(cmd)->temp * 16;
> +     stat = CMD_MSFT_SMART(cmd)->err_thresh_stat |
> +         CMD_MSFT_SMART(cmd)->warn_thresh_stat;
> +     if (stat & 3) /* NVM_LIFETIME/ES_LIFETIME */
> +             flags |= ND_SMART_SPARE_TRIP;
> +     if (stat & 4) /* ES_TEMP */
> +             flags |= ND_SMART_CTEMP_TRIP;
> +     return flags;
>  }
>  
>  static unsigned int msft_cmd_smart_get_life_used(struct ndctl_cmd *cmd)
> @@ -171,10 +207,13 @@ static int msft_cmd_xlat_firmware_status(struct 
> ndctl_cmd *cmd)
>  }
>  
>  struct ndctl_dimm_ops * const msft_dimm_ops = &(struct ndctl_dimm_ops) {
> +     .cmd_desc = msft_cmd_desc,
> +     .cmd_is_supported = msft_cmd_is_supported,
>       .new_smart = msft_dimm_cmd_new_smart,
>       .smart_get_flags = msft_cmd_smart_get_flags,
>       .smart_get_health = msft_cmd_smart_get_health,
>       .smart_get_media_temperature = msft_cmd_smart_get_media_temperature,
> +     .smart_get_alarm_flags = msft_cmd_smart_get_alarm_flags,
>       .smart_get_life_used = msft_cmd_smart_get_life_used,
>       .xlat_firmware_status = msft_cmd_xlat_firmware_status,
>  };
> diff --git a/ndctl/lib/msft.h b/ndctl/lib/msft.h
> index 978cc11..8d246a5 100644
> --- a/ndctl/lib/msft.h
> +++ b/ndctl/lib/msft.h
> @@ -2,21 +2,16 @@
>  /* Copyright (C) 2016-2017 Dell, Inc. */
>  /* Copyright (C) 2016 Hewlett Packard Enterprise Development LP */
>  /* Copyright (C) 2014-2020, Intel Corporation. */
> +/* Copyright (C) 2022 iXsystems, Inc. */
>  #ifndef __NDCTL_MSFT_H__
>  #define __NDCTL_MSFT_H__
>  
>  enum {
> -     NDN_MSFT_CMD_QUERY = 0,
> -
> -     /* non-root commands */
> -     NDN_MSFT_CMD_SMART = 11,
> +     NDN_MSFT_CMD_CHEALTH = 10,
> +     NDN_MSFT_CMD_NHEALTH = 11,
> +     NDN_MSFT_CMD_EHEALTH = 12,
>  };
>  
> -/* NDN_MSFT_CMD_SMART */
> -#define NDN_MSFT_SMART_HEALTH_VALID  ND_SMART_HEALTH_VALID
> -#define NDN_MSFT_SMART_TEMP_VALID    ND_SMART_TEMP_VALID
> -#define NDN_MSFT_SMART_USED_VALID    ND_SMART_USED_VALID
> -
>  /*
>   * This is actually function 11 data,
>   * This is the closest I can find to match smart
> -- 
> 2.30.2
> 
> 

Reply via email to