[Dropping Lijun from CC due to a bounce] On Fri, 2023-01-13 at 12:27 -0500, Alexander Motin wrote: > Previous code reported "smart" state based on number of bits > set in the module health field. But actually any single bit > set there already means critical failure. Rework the logic > according to specification, properly reporting non-critical > state in case of warning threshold reached, critical in case > of any module health bit set or error threshold reached and > fatal if NVDIMM exhausted its life time. In attempt to > report the cause of failure in absence of better methods, > report reached thresholds as more or less matching alarms.
This looks like unnecessarily aggressive wrapping. Commit messages are typically wrapped at 72 chars. Other than that this looks fine. > > Signed-off-by: Alexander Motin <[email protected]> > --- > ndctl/lib/msft.c | 55 ++++++++++++++++++++++++++---------------------- > 1 file changed, 30 insertions(+), 25 deletions(-) > > diff --git a/ndctl/lib/msft.c b/ndctl/lib/msft.c > index efc7fe1..8f66c97 100644 > --- a/ndctl/lib/msft.c > +++ b/ndctl/lib/msft.c > @@ -114,26 +114,32 @@ static unsigned int msft_cmd_smart_get_flags(struct > ndctl_cmd *cmd) > > /* below health data can be retrieved via MSFT _DSM function 11 */ > return ND_SMART_HEALTH_VALID | ND_SMART_TEMP_VALID | > - ND_SMART_USED_VALID; > + ND_SMART_USED_VALID | ND_SMART_ALARM_VALID; > } > > -static unsigned int num_set_bit_health(__u16 num) > +static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd) > { > - int i; > - __u16 n = num & 0x7FFF; > - unsigned int count = 0; > + unsigned int health = 0; > + int rc; > > - for (i = 0; i < 15; i++) > - if (!!(n & (1 << i))) > - count++; > + rc = msft_smart_valid(cmd); > + if (rc < 0) { > + errno = -rc; > + return UINT_MAX; > + } > > - return count; > + if (CMD_MSFT_SMART(cmd)->nvm_lifetime == 0) > + health |= ND_SMART_FATAL_HEALTH; > + if (CMD_MSFT_SMART(cmd)->health != 0 || > + CMD_MSFT_SMART(cmd)->err_thresh_stat != 0) > + health |= ND_SMART_CRITICAL_HEALTH; > + if (CMD_MSFT_SMART(cmd)->warn_thresh_stat != 0) > + health |= ND_SMART_NON_CRITICAL_HEALTH; > + return health; > } > > -static unsigned int msft_cmd_smart_get_health(struct ndctl_cmd *cmd) > +static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd > *cmd) > { > - unsigned int health; > - unsigned int num; > int rc; > > rc = msft_smart_valid(cmd); > @@ -142,21 +148,13 @@ static unsigned int msft_cmd_smart_get_health(struct > ndctl_cmd *cmd) > return UINT_MAX; > } > > - num = num_set_bit_health(CMD_MSFT_SMART(cmd)->health); > - if (num == 0) > - health = 0; > - else if (num < 2) > - health = ND_SMART_NON_CRITICAL_HEALTH; > - else if (num < 3) > - health = ND_SMART_CRITICAL_HEALTH; > - else > - health = ND_SMART_FATAL_HEALTH; > - > - return health; > + return CMD_MSFT_SMART(cmd)->temp * 16; > } > > -static unsigned int msft_cmd_smart_get_media_temperature(struct ndctl_cmd > *cmd) > +static unsigned int msft_cmd_smart_get_alarm_flags(struct ndctl_cmd *cmd) > { > + __u8 stat; > + unsigned int flags = 0; > int rc; > > rc = msft_smart_valid(cmd); > @@ -165,7 +163,13 @@ static unsigned int > msft_cmd_smart_get_media_temperature(struct ndctl_cmd *cmd) > return UINT_MAX; > } > > - return CMD_MSFT_SMART(cmd)->temp * 16; > + stat = CMD_MSFT_SMART(cmd)->err_thresh_stat | > + CMD_MSFT_SMART(cmd)->warn_thresh_stat; > + if (stat & 3) /* NVM_LIFETIME/ES_LIFETIME */ > + flags |= ND_SMART_SPARE_TRIP; > + if (stat & 4) /* ES_TEMP */ > + flags |= ND_SMART_CTEMP_TRIP; > + return flags; > } > > static unsigned int msft_cmd_smart_get_life_used(struct ndctl_cmd *cmd) > @@ -209,6 +213,7 @@ struct ndctl_dimm_ops * const msft_dimm_ops = &(struct > ndctl_dimm_ops) { > .smart_get_flags = msft_cmd_smart_get_flags, > .smart_get_health = msft_cmd_smart_get_health, > .smart_get_media_temperature = msft_cmd_smart_get_media_temperature, > + .smart_get_alarm_flags = msft_cmd_smart_get_alarm_flags, > .smart_get_life_used = msft_cmd_smart_get_life_used, > .xlat_firmware_status = msft_cmd_xlat_firmware_status, > };
