[PATCH v4 4/4] nfit: do an ARS scrub on hitting a latent media error

2016-07-23 Thread Dan Williams
From: Vishal Verma 

When a latent (unknown to 'badblocks') error is encountered, it will
trigger a machine check exception. On a system with machine check
recovery, this will only SIGBUS the process(es) which had the bad page
mapped (as opposed to a kernel panic on platforms without machine
check recovery features). In the former case, we want to trigger a full
rescan of that nvdimm bus. This will allow any additional, new errors
to be captured in the block devices' badblocks lists, and offending
operations on them can be trapped early, avoiding machine checks.

This is done by registering a callback function with the
x86_mce_decoder_chain and calling the new ars_rescan functionality with
the address in the mce notification.

Cc: Rafael J. Wysocki 
Cc: Tony Luck 
Signed-off-by: Vishal Verma 
Signed-off-by: Dan Williams 
---
 drivers/acpi/nfit/Makefile  |1 
 drivers/acpi/nfit/core.c|   26 +++--
 drivers/acpi/nfit/mce.c |   89 +++
 drivers/acpi/nfit/nfit.h|   20 ++
 tools/testing/nvdimm/Kbuild |1 
 5 files changed, 133 insertions(+), 4 deletions(-)
 create mode 100644 drivers/acpi/nfit/mce.c

diff --git a/drivers/acpi/nfit/Makefile b/drivers/acpi/nfit/Makefile
index eb95c5aff83b..a407e769f103 100644
--- a/drivers/acpi/nfit/Makefile
+++ b/drivers/acpi/nfit/Makefile
@@ -1,2 +1,3 @@
 obj-$(CONFIG_ACPI_NFIT) := nfit.o
 nfit-y := core.o
+nfit-$(CONFIG_X86_MCE) += mce.o
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 2eaa03dc185a..686837e8358f 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -51,6 +51,9 @@ module_param(disable_vendor_specific, bool, S_IRUGO);
 MODULE_PARM_DESC(disable_vendor_specific,
"Limit commands to the publicly specified set\n");
 
+LIST_HEAD(acpi_descs);
+DEFINE_MUTEX(acpi_desc_lock);
+
 static struct workqueue_struct *nfit_wq;
 
 struct nfit_table_prev {
@@ -361,7 +364,7 @@ static const char *spa_type_name(u16 type)
return to_name[type];
 }
 
-static int nfit_spa_type(struct acpi_nfit_system_address *spa)
+int nfit_spa_type(struct acpi_nfit_system_address *spa)
 {
int i;
 
@@ -898,8 +901,6 @@ static ssize_t scrub_show(struct device *dev,
return rc;
 }
 
-static int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc);
-
 static ssize_t scrub_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t size)
 {
@@ -2400,6 +2401,11 @@ static void acpi_nfit_destruct(void *data)
struct acpi_nfit_desc *acpi_desc = data;
struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus);
 
+   /*
+* Destruct under acpi_desc_lock so that nfit_handle_mce does not
+* race teardown
+*/
+   mutex_lock(&acpi_desc_lock);
acpi_desc->cancel = 1;
/*
 * Bounce the nvdimm bus lock to make sure any in-flight
@@ -2414,6 +2420,8 @@ static void acpi_nfit_destruct(void *data)
sysfs_put(acpi_desc->scrub_count_state);
nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
acpi_desc->nvdimm_bus = NULL;
+   list_del(&acpi_desc->list);
+   mutex_unlock(&acpi_desc_lock);
 }
 
 int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz)
@@ -2441,6 +2449,11 @@ int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, 
void *data, acpi_size sz)
rc = acpi_nfit_desc_init_scrub_attr(acpi_desc);
if (rc)
return rc;
+
+   /* register this acpi_desc for mce notifications */
+   mutex_lock(&acpi_desc_lock);
+   list_add_tail(&acpi_desc->list, &acpi_descs);
+   mutex_unlock(&acpi_desc_lock);
}
 
rc = acpi_nfit_desc_init_scrub_attr(acpi_desc);
@@ -2555,7 +2568,7 @@ static int acpi_nfit_clear_to_send(struct 
nvdimm_bus_descriptor *nd_desc,
return 0;
 }
 
-static int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc)
+int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc)
 {
struct device *dev = acpi_desc->dev;
struct nfit_spa *nfit_spa;
@@ -2604,6 +2617,7 @@ void acpi_nfit_desc_init(struct acpi_nfit_desc 
*acpi_desc, struct device *dev)
INIT_LIST_HEAD(&acpi_desc->flushes);
INIT_LIST_HEAD(&acpi_desc->memdevs);
INIT_LIST_HEAD(&acpi_desc->dimms);
+   INIT_LIST_HEAD(&acpi_desc->list);
mutex_init(&acpi_desc->init_mutex);
INIT_WORK(&acpi_desc->work, acpi_nfit_scrub);
 }
@@ -2756,13 +2770,17 @@ static __init int nfit_init(void)
if (!nfit_wq)
return -ENOMEM;
 
+   nfit_mce_register();
+
return acpi_bus_register_driver(&acpi_nfit_driver);
 }
 
 static __exit void nfit_exit(void)
 {
+   nfit_mce_unregister();
acpi_bus_unregister_driver(&acpi_nfit_driver);
destroy_workqueue(nfit_wq);
+   WARN_ON(!list_empty(&acpi_descs));
 }
 
 

[PATCH v4 2/4] nfit, libnvdimm: allow an ARS scrub to be triggered on demand

2016-07-23 Thread Dan Williams
From: Vishal Verma 

Normally, an ARS (Address Range Scrub) only happens at
boot/initialization time. There can however arise situations where a
bus-wide rescan is needed - notably, in the case of discovering a latent
media error, we should do a full rescan to figure out what other sectors
are bad, and thus potentially avoid triggering an mce on them in the
future. Also provide a sysfs trigger to start a bus-wide scrub.

Cc: Rafael J. Wysocki 
Signed-off-by: Vishal Verma 
Signed-off-by: Dan Williams 
---
 drivers/acpi/nfit.c   |  167 +++--
 drivers/acpi/nfit.h   |4 +
 drivers/nvdimm/core.c |7 ++
 include/linux/libnvdimm.h |1 
 4 files changed, 171 insertions(+), 8 deletions(-)

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index be7c2fde16e7..2eaa03dc185a 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -874,14 +875,87 @@ static ssize_t revision_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(revision);
 
+/*
+ * This shows the number of full Address Range Scrubs that have been
+ * completed since driver load time. Userspace can wait on this using
+ * select/poll etc. A '+' at the end indicates an ARS is in progress
+ */
+static ssize_t scrub_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct nvdimm_bus_descriptor *nd_desc;
+   ssize_t rc = -ENXIO;
+
+   device_lock(dev);
+   nd_desc = dev_get_drvdata(dev);
+   if (nd_desc) {
+   struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
+
+   rc = sprintf(buf, "%d%s", acpi_desc->scrub_count,
+   (work_busy(&acpi_desc->work)) ? "+\n" : "\n");
+   }
+   device_unlock(dev);
+   return rc;
+}
+
+static int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc);
+
+static ssize_t scrub_store(struct device *dev,
+   struct device_attribute *attr, const char *buf, size_t size)
+{
+   struct nvdimm_bus_descriptor *nd_desc;
+   ssize_t rc;
+   long val;
+
+   rc = kstrtol(buf, 0, &val);
+   if (rc)
+   return rc;
+   if (val != 1)
+   return -EINVAL;
+
+   device_lock(dev);
+   nd_desc = dev_get_drvdata(dev);
+   if (nd_desc) {
+   struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
+
+   rc = acpi_nfit_ars_rescan(acpi_desc);
+   }
+   device_unlock(dev);
+   if (rc)
+   return rc;
+   return size;
+}
+static DEVICE_ATTR_RW(scrub);
+
+static bool ars_supported(struct nvdimm_bus *nvdimm_bus)
+{
+   struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
+   const unsigned long mask = 1 << ND_CMD_ARS_CAP | 1 << ND_CMD_ARS_START
+   | 1 << ND_CMD_ARS_STATUS;
+
+   return (nd_desc->cmd_mask & mask) == mask;
+}
+
+static umode_t nfit_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+   struct device *dev = container_of(kobj, struct device, kobj);
+   struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+
+   if (a == &dev_attr_scrub.attr && !ars_supported(nvdimm_bus))
+   return 0;
+   return a->mode;
+}
+
 static struct attribute *acpi_nfit_attributes[] = {
&dev_attr_revision.attr,
+   &dev_attr_scrub.attr,
NULL,
 };
 
 static struct attribute_group acpi_nfit_attribute_group = {
.name = "nfit",
.attrs = acpi_nfit_attributes,
+   .is_visible = nfit_visible,
 };
 
 static const struct attribute_group *acpi_nfit_attribute_groups[] = {
@@ -2054,7 +2128,7 @@ static void acpi_nfit_async_scrub(struct acpi_nfit_desc 
*acpi_desc,
unsigned int tmo = scrub_timeout;
int rc;
 
-   if (nfit_spa->ars_done || !nfit_spa->nd_region)
+   if (!nfit_spa->ars_required || !nfit_spa->nd_region)
return;
 
rc = ars_start(acpi_desc, nfit_spa);
@@ -2143,7 +2217,9 @@ static void acpi_nfit_scrub(struct work_struct *work)
 * firmware initiated scrubs to complete and then we go search for the
 * affected spa regions to mark them scanned.  In the second phase we
 * initiate a directed scrub for every range that was not scrubbed in
-* phase 1.
+* phase 1. If we're called for a 'rescan', we harmlessly pass through
+* the first phase, but really only care about running phase 2, where
+* regions can be notified of new poison.
 */
 
/* process platform firmware initiated scrubs */
@@ -2246,14 +2322,17 @@ static void acpi_nfit_scrub(struct work_struct *work)
 * Flag all the ranges that still need scrubbing, but
 * register them now to make data available.
 */
-   if (nfit_spa->nd_region)
-  

Re: [PATCH v2 16/17] x86/insn: remove pcommit

2016-07-23 Thread Ingo Molnar

* Dan Williams  wrote:

> On Fri, Jul 22, 2016 at 9:52 AM, Ingo Molnar  wrote:
> >
> > * Dan Williams  wrote:
> >
> >> On Tue, Jul 12, 2016 at 3:12 PM, Dan Williams  
> >> wrote:
> >> > On Tue, Jul 12, 2016 at 7:57 AM, Peter Zijlstra  
> >> > wrote:
> >> >> On Sat, Jul 09, 2016 at 08:25:54PM -0700, Dan Williams wrote:
> >> >>> The pcommit instruction is being deprecated in favor of either ADR
> >> >>> (asynchronous DRAM refresh: flush-on-power-fail) at the platform 
> >> >>> level, or
> >> >>> posted-write-queue flush addresses as defined by the ACPI 6.x NFIT 
> >> >>> (NVDIMM
> >> >>> Firmware Interface Table).
> >> >>
> >> >>>  arch/x86/include/asm/cpufeatures.h |1
> >> >>>  arch/x86/include/asm/special_insns.h   |   46 
> >> >>> 
> >> >>>  arch/x86/lib/x86-opcode-map.txt|2 -
> >> >>>  tools/objtool/arch/x86/insn/x86-opcode-map.txt |2 -
> >> >>>  tools/perf/arch/x86/tests/insn-x86-dat-32.c|2 -
> >> >>>  tools/perf/arch/x86/tests/insn-x86-dat-64.c|2 -
> >> >>>  tools/perf/arch/x86/tests/insn-x86-dat-src.c   |4 --
> >> >>
> >> >> Just deprecated, or is it completely eradicated, removed from history,
> >> >> will never ever happen and we'll reissue the opcode for something else?
> >> >>
> >> >> Because if its only deprecated then removing it from the instruction
> >> >> decoders seems wrong, old binaries might still contain the opcode.
> >> >
> >> > Eradicated.
> >> >
> >> > "The new instructions like CLWB and CLFLUSHOPT will be rolled into the
> >> > SDM but PCOMMIT will be removed from the Extensions doc and not rolled
> >> > into the SDM." [1]
> >> >
> >> > Existing binaries are already gating their usage on the presence of
> >> > the cpu id flag, that flag and the instruction opcode are reserved
> >> > going forward.
> >> >
> >> > [1]: https://lists.01.org/pipermail/linux-nvdimm/2016-June/005923.html
> >>
> >> x86 maintainers, I have the other patches in this series queued in -next. 
> >> Please
> >> ack this one and I'll add it for v4.8-rc1, or otherwise let me know how 
> >> you want
> >> to handle this patch.
> >
> > Since it's just a removal AFAICS that the rest of your series should not 
> > depend
> > on, can you submit it to the x86 tree?
> 
> This patch depends on the previous patches in the series removing
> calls to pcommit_sfence().

Ok, and the patch looks harmless:

Acked-by: Ingo Molnar 

Thanks,

Ingo
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm