Vaibhav Jain <vaib...@linux.ibm.com> writes: > Presently PAPR doesn't support injecting smart errors on an > NVDIMM. This makes testing the NVDIMM health reporting functionality > difficult as simulating NVDIMM health related events need a hacked up > qemu version. > > To solve this problem this patch proposes simulating certain set of > NVDIMM health related events in papr_scm. Specifically 'fatal' health > state and 'dirty' shutdown state. These error can be injected via the > user-space 'ndctl-inject-smart(1)' command. With the proposed patch and > corresponding ndctl patches following command flow is expected: > > $ sudo ndctl list -DH -d nmem0 > ... > "health_state":"ok", > "shutdown_state":"clean", > ... > # inject unsafe shutdown and fatal health error > $ sudo ndctl inject-smart nmem0 -Uf > ... > "health_state":"fatal", > "shutdown_state":"dirty", > ... > # uninject all errors > $ sudo ndctl inject-smart nmem0 -N > ... > "health_state":"ok", > "shutdown_state":"clean", > ... > > The patch adds two members 'health_bitmap_mask' and > 'health_bitmap_override' inside struct papr_scm_priv which are then > bit blt'ed to the health bitmaps fetched from the hypervisor. In case > we are not able to fetch health information from the hypervisor we > service the health bitmap from these two members. These members are > accessible from sysfs at nmemX/papr/health_bitmap_override > > A new PDSM named 'SMART_INJECT' is proposed that accepts newly > introduced 'struct nd_papr_pdsm_smart_inject' as payload thats > exchanged between libndctl and papr_scm to indicate the requested > smart-error states. > > When the processing the PDSM 'SMART_INJECT', papr_pdsm_smart_inject() > constructs a pair or 'mask' and 'override' bitmaps from the payload > and bit-blt it to the 'health_bitmap_{mask, override}' members. This > ensures the after being fetched from the hypervisor, the health_bitmap > reflects requested smart-error states. > > Signed-off-by: Vaibhav Jain <vaib...@linux.ibm.com> > Signed-off-by: Shivaprasad G Bhat <sb...@linux.ibm.com> > --- > Changelog: > > Since v2: > * Rebased the patch to ppc-next > * Added documentation for newly introduced sysfs attribute > 'health_bitmap_override' > > Since v1: > * Updated the patch description. > * Removed dependency of a header movement patch. > * Removed '__packed' attribute for 'struct nd_papr_pdsm_smart_inject' [Aneesh] > --- > Documentation/ABI/testing/sysfs-bus-papr-pmem | 13 +++ > arch/powerpc/include/uapi/asm/papr_pdsm.h | 18 ++++ > arch/powerpc/platforms/pseries/papr_scm.c | 94 ++++++++++++++++++- > 3 files changed, 122 insertions(+), 3 deletions(-) > > diff --git a/Documentation/ABI/testing/sysfs-bus-papr-pmem > b/Documentation/ABI/testing/sysfs-bus-papr-pmem > index 95254cec92bf..8a0b2a7f7671 100644 > --- a/Documentation/ABI/testing/sysfs-bus-papr-pmem > +++ b/Documentation/ABI/testing/sysfs-bus-papr-pmem > @@ -61,3 +61,16 @@ Description: > * "CchRHCnt" : Cache Read Hit Count > * "CchWHCnt" : Cache Write Hit Count > * "FastWCnt" : Fast Write Count > + > +What: /sys/bus/nd/devices/nmemX/papr/health_bitmap_override > +Date: Jan, 2022 > +KernelVersion: v5.17 > +Contact: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>, > nvd...@lists.linux.dev, > +Description: > + (RO) Reports the health bitmap override/mask bitmaps that are > + applied to top bitmap received from PowerVM via the H_SCM_HEALTH > + Hcall. Together these can be used to forcibly set/reset specific > + bits returned from Hcall. These bitmaps are presently used to > + simulate various health or shutdown states for an nvdimm and are > + set by user-space tools like ndctl by issuing a PAPR DSM. > + > diff --git a/arch/powerpc/include/uapi/asm/papr_pdsm.h > b/arch/powerpc/include/uapi/asm/papr_pdsm.h > index 82488b1e7276..17439925045c 100644 > --- a/arch/powerpc/include/uapi/asm/papr_pdsm.h > +++ b/arch/powerpc/include/uapi/asm/papr_pdsm.h > @@ -116,6 +116,22 @@ struct nd_papr_pdsm_health { > }; > }; > > +/* Flags for injecting specific smart errors */ > +#define PDSM_SMART_INJECT_HEALTH_FATAL (1 << 0) > +#define PDSM_SMART_INJECT_BAD_SHUTDOWN (1 << 1) > + > +struct nd_papr_pdsm_smart_inject { > + union { > + struct { > + /* One or more of PDSM_SMART_INJECT_ */ > + __u32 flags; > + __u8 fatal_enable; > + __u8 unsafe_shutdown_enable; > + }; > + __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE]; > + }; > +}; > + > /* > * Methods to be embedded in ND_CMD_CALL request. These are sent to the > kernel > * via 'nd_cmd_pkg.nd_command' member of the ioctl struct > @@ -123,12 +139,14 @@ struct nd_papr_pdsm_health { > enum papr_pdsm { > PAPR_PDSM_MIN = 0x0, > PAPR_PDSM_HEALTH, > + PAPR_PDSM_SMART_INJECT, > PAPR_PDSM_MAX, > }; > > /* Maximal union that can hold all possible payload types */ > union nd_pdsm_payload { > struct nd_papr_pdsm_health health; > + struct nd_papr_pdsm_smart_inject smart_inject; > __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE]; > } __packed; > > diff --git a/arch/powerpc/platforms/pseries/papr_scm.c > b/arch/powerpc/platforms/pseries/papr_scm.c > index f48e87ac89c9..de4cf329cfb3 100644 > --- a/arch/powerpc/platforms/pseries/papr_scm.c > +++ b/arch/powerpc/platforms/pseries/papr_scm.c > @@ -68,6 +68,10 @@ > #define PAPR_SCM_PERF_STATS_EYECATCHER __stringify(SCMSTATS) > #define PAPR_SCM_PERF_STATS_VERSION 0x1 > > +/* Use bitblt method to override specific bits in the '_bitmap_' */ > +#define BITBLT_BITMAP(_bitmap_, _mask_, _override_) \ > + (((_bitmap_) & ~(_mask_)) | ((_mask_) & (_override_))) > + > /* Struct holding a single performance metric */ > struct papr_scm_perf_stat { > u8 stat_id[8]; > @@ -120,6 +124,12 @@ struct papr_scm_priv { > > /* length of the stat buffer as expected by phyp */ > size_t stat_buffer_len; > + > + /* The bits which needs to be overridden */ > + u64 health_bitmap_mask; > + > + /* The overridden values for the bits having the masks set */ > + u64 health_bitmap_override; > }; > > static int papr_scm_pmem_flush(struct nd_region *nd_region, > @@ -347,19 +357,28 @@ static ssize_t drc_pmem_query_stats(struct > papr_scm_priv *p, > static int __drc_pmem_query_health(struct papr_scm_priv *p) > { > unsigned long ret[PLPAR_HCALL_BUFSIZE]; > + u64 bitmap = 0; > long rc; > > /* issue the hcall */ > rc = plpar_hcall(H_SCM_HEALTH, ret, p->drc_index); > - if (rc != H_SUCCESS) { > + if (rc == H_SUCCESS) > + bitmap = ret[0] & ret[1]; > + else if (rc == H_FUNCTION) > + dev_info_once(&p->pdev->dev, > + "Hcall H_SCM_HEALTH not implemented, assuming > empty health bitmap"); > + else { > + > dev_err(&p->pdev->dev, > "Failed to query health information, Err:%ld\n", rc); > return -ENXIO; > } > > p->lasthealth_jiffies = jiffies; > - p->health_bitmap = ret[0] & ret[1]; > - > + /* Allow overriding specific health bits via bit blt. */ > + bitmap = BITBLT_BITMAP(bitmap, p->health_bitmap_mask, > + p->health_bitmap_override); > + WRITE_ONCE(p->health_bitmap, bitmap);
Why WRITE_ONCE ? > dev_dbg(&p->pdev->dev, > "Queried dimm health info. Bitmap:0x%016lx Mask:0x%016lx\n", > ret[0], ret[1]); > @@ -669,6 +688,54 @@ static int papr_pdsm_health(struct papr_scm_priv *p, > return rc; > } > > +/* Inject a smart error Add the dirty-shutdown-counter value to the pdsm */ > +static int papr_pdsm_smart_inject(struct papr_scm_priv *p, > + union nd_pdsm_payload *payload) > +{ > + int rc; > + u32 supported_flags = 0; > + u64 mask = 0, override = 0; > + > + /* Check for individual smart error flags and update mask and override > */ > + if (payload->smart_inject.flags & PDSM_SMART_INJECT_HEALTH_FATAL) { > + supported_flags |= PDSM_SMART_INJECT_HEALTH_FATAL; > + mask |= PAPR_PMEM_HEALTH_FATAL; > + override |= payload->smart_inject.fatal_enable ? > + PAPR_PMEM_HEALTH_FATAL : 0; > + } > + > + if (payload->smart_inject.flags & PDSM_SMART_INJECT_BAD_SHUTDOWN) { > + supported_flags |= PDSM_SMART_INJECT_BAD_SHUTDOWN; > + mask |= PAPR_PMEM_SHUTDOWN_DIRTY; > + override |= payload->smart_inject.unsafe_shutdown_enable ? > + PAPR_PMEM_SHUTDOWN_DIRTY : 0; > + } > + > + dev_dbg(&p->pdev->dev, "[Smart-inject] Mask=%#llx override=%#llx\n", > + mask, override); > + > + /* Prevent concurrent access to dimm health bitmap related members */ > + rc = mutex_lock_interruptible(&p->health_mutex); > + if (rc) > + return rc; > + > + /* Bitblt mask/override to corrosponding health_bitmap couterparts */ > + p->health_bitmap_mask = BITBLT_BITMAP(p->health_bitmap_mask, > + mask, override); is that correct? Should we do that mask & override ? Shouldn't this be p->health_bitmap_mask = BITBLT_BITMAP(p->health_bitmap_mask, mask, ~0x0UL); > + p->health_bitmap_override = BITBLT_BITMAP(p->health_bitmap_override, > + mask, override); > + > + /* Invalidate cached health bitmap */ > + p->lasthealth_jiffies = 0; > + > + mutex_unlock(&p->health_mutex); > + > + /* Return the supported flags back to userspace */ > + payload->smart_inject.flags = supported_flags; > + > + return sizeof(struct nd_papr_pdsm_health); > +} > + > /* > * 'struct pdsm_cmd_desc' > * Identifies supported PDSMs' expected length of in/out payloads > @@ -702,6 +769,12 @@ static const struct pdsm_cmd_desc > __pdsm_cmd_descriptors[] = { > .size_out = sizeof(struct nd_papr_pdsm_health), > .service = papr_pdsm_health, > }, > + > + [PAPR_PDSM_SMART_INJECT] = { > + .size_in = sizeof(struct nd_papr_pdsm_smart_inject), > + .size_out = sizeof(struct nd_papr_pdsm_smart_inject), > + .service = papr_pdsm_smart_inject, > + }, > /* Empty */ > [PAPR_PDSM_MAX] = { > .size_in = 0, > @@ -838,6 +911,20 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor > *nd_desc, > return 0; > } > > +static ssize_t health_bitmap_override_show(struct device *dev, > + struct device_attribute *attr, > + char *buf) > +{ > + struct nvdimm *dimm = to_nvdimm(dev); > + struct papr_scm_priv *p = nvdimm_provider_data(dimm); > + > + return sprintf(buf, "mask=%#llx override=%#llx\n", > + READ_ONCE(p->health_bitmap_mask), > + READ_ONCE(p->health_bitmap_override)); > +} > + > +static DEVICE_ATTR_ADMIN_RO(health_bitmap_override); > + > static ssize_t perf_stats_show(struct device *dev, > struct device_attribute *attr, char *buf) > { > @@ -952,6 +1039,7 @@ static struct attribute *papr_nd_attributes[] = { > &dev_attr_flags.attr, > &dev_attr_perf_stats.attr, > &dev_attr_dirty_shutdown.attr, > + &dev_attr_health_bitmap_override.attr, > NULL, > }; > > -- > 2.34.1