Re: [PATCH v2 3/3] nfit: do an ARS scrub on hitting a latent media error

2016-07-21 Thread Linda Knippers


On 7/20/2016 9:50 PM, Vishal Verma wrote:
> When a latent (unknown to 'badblocks') error is encountered, it will
> trigger a machine check exception. On a system with machine check
> recovery, this will only SIGBUS the process(es) which had the bad page
> mapped (as opposed to a kernel panic on platforms without machine
> check recovery features). In the former case, we want to trigger a full
> rescan of that nvdimm bus. This will allow any additional, new errors
> to be captured in the block devices' badblocks lists, and offending
> operations on them can be trapped early, avoiding machine checks.

Do we really need to rescan all SPA ranges?  If the problem is with
an NVDIMM, wouldn't the blast radius be the device or its interleave set,
which would be part of the same SPA range?

I don't know what the overhead associated with a scan is which is why I'm 
asking.

-- ljk

> 
> This is done by registering a callback function with the
> x86_mce_decoder_chain and calling the new ars_rescan functionality with
> the address in the mce notification.
> 
> Cc: Dan Williams 
> Cc: Rafael J. Wysocki 
> Cc: Tony Luck 
> Cc: 
> Cc: 
> Signed-off-by: Vishal Verma 
> ---
>  drivers/acpi/nfit.c | 89 
> +
>  drivers/acpi/nfit.h |  1 +
>  2 files changed, 90 insertions(+)
> 
> diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
> index 4e65255..c9f1ee4 100644
> --- a/drivers/acpi/nfit.c
> +++ b/drivers/acpi/nfit.c
> @@ -12,6 +12,7 @@
>   */
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -24,6 +25,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include "nfit.h"
>  
>  /*
> @@ -51,6 +53,9 @@ module_param(disable_vendor_specific, bool, S_IRUGO);
>  MODULE_PARM_DESC(disable_vendor_specific,
>   "Limit commands to the publicly specified set\n");
>  
> +static LIST_HEAD(acpi_descs);
> +static DEFINE_MUTEX(acpi_desc_lock);
> +
>  static struct workqueue_struct *nfit_wq;
>  
>  struct nfit_table_prev {
> @@ -2416,9 +2421,11 @@ static int acpi_nfit_check_deletions(struct 
> acpi_nfit_desc *acpi_desc,
>  
>  int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
>  {
> + struct acpi_nfit_desc *acpi_desc_entry;
>   struct device *dev = acpi_desc->dev;
>   struct nfit_table_prev prev;
>   const void *end;
> + int found = 0;
>   u8 *data;
>   int rc;
>  
> @@ -2473,6 +2480,19 @@ int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, 
> acpi_size sz)
>  
>   rc = acpi_nfit_register_regions(acpi_desc);
>  
> + /*
> +  * We may get here due to an update of the nfit via _FIT.
> +  * Check if the acpi_desc we're (re)initializing is already
> +  * present in the list, and if so, don't re-add it
> +  */
> + mutex_lock(&acpi_desc_lock);
> + list_for_each_entry(acpi_desc_entry, &acpi_descs, list)
> + if (acpi_desc_entry == acpi_desc)
> + found = 1;
> + if (found == 0)
> + list_add_tail(&acpi_desc->list, &acpi_descs);
> + mutex_unlock(&acpi_desc_lock);
> +
>   out_unlock:
>   mutex_unlock(&acpi_desc->init_mutex);
>   return rc;
> @@ -2555,6 +2575,65 @@ static int acpi_nfit_ars_rescan(struct acpi_nfit_desc 
> *acpi_desc)
>   return 0;
>  }
>  
> +static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
> + void *data)
> +{
> + struct mce *mce = (struct mce *)data;
> + struct acpi_nfit_desc *acpi_desc;
> + struct nfit_spa *nfit_spa;
> +
> + /* We only care about memory errors */
> + if (!(mce->status & MCACOD))
> + return NOTIFY_DONE;
> +
> + /*
> +  * mce->addr contains the physical addr accessed that caused the
> +  * machine check. We need to walk through the list of NFITs, and see
> +  * if any of them matches that address, and only then start a scrub.
> +  */
> + mutex_lock(&acpi_desc_lock);
> + list_for_each_entry(acpi_desc, &acpi_descs, list) {
> + struct device *dev = acpi_desc->dev;
> + int found_match = 0;
> +
> + list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
> + struct acpi_nfit_system_address *spa = nfit_spa->spa;
> +
> + if (nfit_spa_type(spa) != NFIT_SPA_PM)
> + continue;
> + /* find the spa that covers the mce addr */
> + if (spa->address > mce->addr)
> + continue;
> + if ((spa->address + spa->length - 1) < mce->addr)
> + continue;
> + found_match = 1;
> + dev_dbg(dev, "%s: addr in SPA %d (0x%llx, 0x%llx)\n",
> + __func__, spa->range_index, spa->address,
> + spa->length);
> + /*
> +  * We can break at the first match because we're

Re: [PATCH v2 3/3] nfit: do an ARS scrub on hitting a latent media error

2016-07-21 Thread Linda Knippers


On 7/21/2016 5:10 PM, Vishal Verma wrote:
> On 07/21, Linda Knippers wrote:
>>
>>
>> On 7/20/2016 9:50 PM, Vishal Verma wrote:
>>> When a latent (unknown to 'badblocks') error is encountered, it will
>>> trigger a machine check exception. On a system with machine check
>>> recovery, this will only SIGBUS the process(es) which had the bad page
>>> mapped (as opposed to a kernel panic on platforms without machine
>>> check recovery features). In the former case, we want to trigger a full
>>> rescan of that nvdimm bus. This will allow any additional, new errors
>>> to be captured in the block devices' badblocks lists, and offending
>>> operations on them can be trapped early, avoiding machine checks.
>>
>> Do we really need to rescan all SPA ranges?  If the problem is with
>> an NVDIMM, wouldn't the blast radius be the device or its interleave set,
>> which would be part of the same SPA range?
>>
>> I don't know what the overhead associated with a scan is which is why I'm 
>> asking.
> 
> You're right that we don't _need_ to scan all ranges, and that the scrub
> can be long-running, but we just take this 'event' as an opportunity to
> basically refresh everything. Since it is asynchronous, we're not
> holding anything up.

We're not holding up anything in the kernel but I assume it's
not a zero-overhead operation.  The memory controller may be doing something
or the platform firmware could be doing something which could introduce
latency spikes.  It's the kind of thing that really annoys some customers
but maybe following an MCE no one cares about that.

-- ljk
> 
>>
>> -- ljk
>>
>>>
>>> This is done by registering a callback function with the
>>> x86_mce_decoder_chain and calling the new ars_rescan functionality with
>>> the address in the mce notification.
>>>
>>> Cc: Dan Williams 
>>> Cc: Rafael J. Wysocki 
>>> Cc: Tony Luck 
>>> Cc: 
>>> Cc: 
>>> Signed-off-by: Vishal Verma 
>>> ---
>>>  drivers/acpi/nfit.c | 89 
>>> +
>>>  drivers/acpi/nfit.h |  1 +
>>>  2 files changed, 90 insertions(+)
>>>
>>> diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
>>> index 4e65255..c9f1ee4 100644
>>> --- a/drivers/acpi/nfit.c
>>> +++ b/drivers/acpi/nfit.c
>>> @@ -12,6 +12,7 @@
>>>   */
>>>  #include 
>>>  #include 
>>> +#include 
>>>  #include 
>>>  #include 
>>>  #include 
>>> @@ -24,6 +25,7 @@
>>>  #include 
>>>  #include 
>>>  #include 
>>> +#include 
>>>  #include "nfit.h"
>>>  
>>>  /*
>>> @@ -51,6 +53,9 @@ module_param(disable_vendor_specific, bool, S_IRUGO);
>>>  MODULE_PARM_DESC(disable_vendor_specific,
>>> "Limit commands to the publicly specified set\n");
>>>  
>>> +static LIST_HEAD(acpi_descs);
>>> +static DEFINE_MUTEX(acpi_desc_lock);
>>> +
>>>  static struct workqueue_struct *nfit_wq;
>>>  
>>>  struct nfit_table_prev {
>>> @@ -2416,9 +2421,11 @@ static int acpi_nfit_check_deletions(struct 
>>> acpi_nfit_desc *acpi_desc,
>>>  
>>>  int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
>>>  {
>>> +   struct acpi_nfit_desc *acpi_desc_entry;
>>> struct device *dev = acpi_desc->dev;
>>> struct nfit_table_prev prev;
>>> const void *end;
>>> +   int found = 0;
>>> u8 *data;
>>> int rc;
>>>  
>>> @@ -2473,6 +2480,19 @@ int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, 
>>> acpi_size sz)
>>>  
>>> rc = acpi_nfit_register_regions(acpi_desc);
>>>  
>>> +   /*
>>> +* We may get here due to an update of the nfit via _FIT.
>>> +* Check if the acpi_desc we're (re)initializing is already
>>> +* present in the list, and if so, don't re-add it
>>> +*/
>>> +   mutex_lock(&acpi_desc_lock);
>>> +   list_for_each_entry(acpi_desc_entry, &acpi_descs, list)
>>> +   if (acpi_desc_entry == acpi_desc)
>>> +   found = 1;
>>> +   if (found == 0)
>>> +   list_add_tail(&acpi_desc->list, &acpi_descs);
>>> +   mutex_unlock(&acpi_desc_lock);
>>> +
>>>   out_unlock:
>>> mutex_unlock(&acpi_desc->init_mutex);
>>> return rc;
>>> @@ -2555,6 +2575,65 @@ static int acpi_nfit_ars_rescan(struct 
>>> acpi_nfit_desc *acpi_desc)
>>> return 0;
>>>  }
>>>  
>>> +static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
>>> +   void *data)
>>> +{
>>> +   struct mce *mce = (struct mce *)data;
>>> +   struct acpi_nfit_desc *acpi_desc;
>>> +   struct nfit_spa *nfit_spa;
>>> +
>>> +   /* We only care about memory errors */
>>> +   if (!(mce->status & MCACOD))
>>> +   return NOTIFY_DONE;
>>> +
>>> +   /*
>>> +* mce->addr contains the physical addr accessed that caused the
>>> +* machine check. We need to walk through the list of NFITs, and see
>>> +* if any of them matches that address, and only then start a scrub.
>>> +*/
>>> +   mutex_lock(&acpi_desc_lock);
>>> +   list_for_each_entry(acpi_desc, &acpi_descs, list) {
>>> +   struct device *dev = acpi_desc->dev;
>>> +   int found_match = 0;
>>> +
>>> +   list_

Re: [PATCH v2 3/3] nfit: do an ARS scrub on hitting a latent media error

2016-07-21 Thread Vishal Verma
On 07/21, Linda Knippers wrote:
> 
> 
> On 7/20/2016 9:50 PM, Vishal Verma wrote:
> > When a latent (unknown to 'badblocks') error is encountered, it will
> > trigger a machine check exception. On a system with machine check
> > recovery, this will only SIGBUS the process(es) which had the bad page
> > mapped (as opposed to a kernel panic on platforms without machine
> > check recovery features). In the former case, we want to trigger a full
> > rescan of that nvdimm bus. This will allow any additional, new errors
> > to be captured in the block devices' badblocks lists, and offending
> > operations on them can be trapped early, avoiding machine checks.
> 
> Do we really need to rescan all SPA ranges?  If the problem is with
> an NVDIMM, wouldn't the blast radius be the device or it's interleave set,
> which would be part of the same SPA range?
> 
> I don't know what the overhead associated with a scan is which is why I'm 
> asking.

You're right that we don't _need_ to scan all ranges, and that the scrub
can be long-running, but we just take this 'event' as an opportunity to
basically refresh everything. Since it is asynchronous, we're not
holding anything up.

> 
> -- ljk
> 
> > 
> > This is done by registering a callback function with the
> > x86_mce_decoder_chain and calling the new ars_rescan functionality with
> > the address in the mce notification.
> > 
> > Cc: Dan Williams 
> > Cc: Rafael J. Wysocki 
> > Cc: Tony Luck 
> > Cc: 
> > Cc: 
> > Signed-off-by: Vishal Verma 
> > ---
> >  drivers/acpi/nfit.c | 89 
> > +
> >  drivers/acpi/nfit.h |  1 +
> >  2 files changed, 90 insertions(+)
> > 
> > diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
> > index 4e65255..c9f1ee4 100644
> > --- a/drivers/acpi/nfit.c
> > +++ b/drivers/acpi/nfit.c
> > @@ -12,6 +12,7 @@
> >   */
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >  #include 
> >  #include 
> > @@ -24,6 +25,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  #include "nfit.h"
> >  
> >  /*
> > @@ -51,6 +53,9 @@ module_param(disable_vendor_specific, bool, S_IRUGO);
> >  MODULE_PARM_DESC(disable_vendor_specific,
> > "Limit commands to the publicly specified set\n");
> >  
> > +static LIST_HEAD(acpi_descs);
> > +static DEFINE_MUTEX(acpi_desc_lock);
> > +
> >  static struct workqueue_struct *nfit_wq;
> >  
> >  struct nfit_table_prev {
> > @@ -2416,9 +2421,11 @@ static int acpi_nfit_check_deletions(struct 
> > acpi_nfit_desc *acpi_desc,
> >  
> >  int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
> >  {
> > +   struct acpi_nfit_desc *acpi_desc_entry;
> > struct device *dev = acpi_desc->dev;
> > struct nfit_table_prev prev;
> > const void *end;
> > +   int found = 0;
> > u8 *data;
> > int rc;
> >  
> > @@ -2473,6 +2480,19 @@ int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, 
> > acpi_size sz)
> >  
> > rc = acpi_nfit_register_regions(acpi_desc);
> >  
> > +   /*
> > +* We may get here due to an update of the nfit via _FIT.
> > +* Check if the acpi_desc we're (re)initializing is already
> > +* present in the list, and if so, don't re-add it
> > +*/
> > +   mutex_lock(&acpi_desc_lock);
> > +   list_for_each_entry(acpi_desc_entry, &acpi_descs, list)
> > +   if (acpi_desc_entry == acpi_desc)
> > +   found = 1;
> > +   if (found == 0)
> > +   list_add_tail(&acpi_desc->list, &acpi_descs);
> > +   mutex_unlock(&acpi_desc_lock);
> > +
> >   out_unlock:
> > mutex_unlock(&acpi_desc->init_mutex);
> > return rc;
> > @@ -2555,6 +2575,65 @@ static int acpi_nfit_ars_rescan(struct 
> > acpi_nfit_desc *acpi_desc)
> > return 0;
> >  }
> >  
> > +static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
> > +   void *data)
> > +{
> > +   struct mce *mce = (struct mce *)data;
> > +   struct acpi_nfit_desc *acpi_desc;
> > +   struct nfit_spa *nfit_spa;
> > +
> > +   /* We only care about memory errors */
> > +   if (!(mce->status & MCACOD))
> > +   return NOTIFY_DONE;
> > +
> > +   /*
> > +* mce->addr contains the physical addr accessed that caused the
> > +* machine check. We need to walk through the list of NFITs, and see
> > +* if any of them matches that address, and only then start a scrub.
> > +*/
> > +   mutex_lock(&acpi_desc_lock);
> > +   list_for_each_entry(acpi_desc, &acpi_descs, list) {
> > +   struct device *dev = acpi_desc->dev;
> > +   int found_match = 0;
> > +
> > +   list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
> > +   struct acpi_nfit_system_address *spa = nfit_spa->spa;
> > +
> > +   if (nfit_spa_type(spa) != NFIT_SPA_PM)
> > +   continue;
> > +   /* find the spa that covers the mce addr */
> > +   if (spa->address > mce->addr)
> > +   continue;
> > +