Re: [PATCH v3] dax: enable dax fault handler to report VM_FAULT_HWPOISON

2023-05-08 Thread Jane Chu

On 5/4/2023 7:32 PM, Dan Williams wrote:

Jane Chu wrote:

When multiple processes mmap() a dax file, then at some point,
a process issues a 'load' and consumes a hwpoison, the process
receives a SIGBUS with si_code = BUS_MCEERR_AR and with si_lsb
set for the poison scope. Soon after, any other process issues
a 'load' to the poisoned page (that is unmapped from the kernel
side by memory_failure), it receives a SIGBUS with
si_code = BUS_ADRERR and without valid si_lsb.

This is confusing to user, and is different from page fault due
to poison in RAM memory, also some helpful information is lost.

Channel dax backend driver's poison detection to the filesystem
such that instead of reporting VM_FAULT_SIGBUS, it could report
VM_FAULT_HWPOISON.


I do think it is interesting that this will start returning SIGBUS with
BUS_MCEERR_AR for stores whereas it is only signalled for loads in the
direct consumption path, but I can't think of a scenario where that
would confuse existing software.


Yes indeed, nice catch, thank you!




Change from v2:
   Convert -EHWPOISON to -EIO to prevent EHWPOISON errno from leaking
out to block read(2) - suggested by Matthew.


For next time these kind of changelog notes belong...


Signed-off-by: Jane Chu 
---


...here after the "---".


I'll move the change log to a cover letter.




  drivers/nvdimm/pmem.c | 2 +-
  fs/dax.c  | 4 ++--
  include/linux/mm.h| 2 ++
  3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index ceea55f621cc..46e094e56159 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -260,7 +260,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, 
pgoff_t pgoff,
long actual_nr;
  
  		if (mode != DAX_RECOVERY_WRITE)

-   return -EIO;
+   return -EHWPOISON;
  
  		/*

 * Set the recovery stride is set to kernel page size because
diff --git a/fs/dax.c b/fs/dax.c
index 2ababb89918d..18f9598951f1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1498,7 +1498,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter 
*iomi,
  
  		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),

DAX_ACCESS, &kaddr, NULL);
-   if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
+   if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
map_len = dax_direct_access(dax_dev, pgoff,
PHYS_PFN(size), DAX_RECOVERY_WRITE,
&kaddr, NULL);
@@ -1506,7 +1506,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter 
*iomi,
recovery = true;
}
if (map_len < 0) {
-   ret = map_len;
+   ret = (map_len == -EHWPOISON) ? -EIO : map_len;


This fixup leaves out several other places where EHWPOISON could leak as
the errno for read(2)/write(2). I think I want to see something like
this:

diff --git a/fs/dax.c b/fs/dax.c
index 2ababb89918d..ec17f9977bcb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1077,14 +1077,13 @@ int dax_writeback_mapping_range(struct address_space 
*mapping,
  }
  EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
  
-static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,

-   size_t size, void **kaddr, pfn_t *pfnp)
+static int __dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
+size_t size, void **kaddr, pfn_t *pfnp)
  {
 pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
-   int id, rc = 0;
 long length;
+   int rc = 0;
  
-   id = dax_read_lock();

 length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
DAX_ACCESS, kaddr, pfnp);
 if (length < 0) {
@@ -1113,6 +1112,36 @@ static int dax_iomap_direct_access(const struct iomap 
*iomap, loff_t pos,
 return rc;
  }
  
+static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,

+  size_t size, void **kaddr, pfn_t *pfnp)
+{
+
+   int id;
+
+   id = dax_read_lock();
+   rc = __dax_iomap_direct_access(iomap, pos, size, kaddr, pfnp);
+   dax_read_unlock(id);
+
+   /* don't leak a memory access error code to I/O syscalls */
+   if (rc == -EHWPOISON)
+   return -EIO;
+   return rc;
+}
+
+static int dax_fault_direct_access(const struct iomap *iomap, loff_t pos,
+  size_t size, void **kaddr, pfn_t *pfnp)
+{
+
+   int id;
+
+   id = dax_read_lock();
+   rc = __dax_iomap_direct_access(iomap, pos, size, kaddr, pfnp);
+   dax_read_unlock(id);
+
+   /* -EHWPOISON return ok */
+   return rc;
+}
+
  /**
   * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
   * by copying the data before and 

RE: [PATCH v3] dax: enable dax fault handler to report VM_FAULT_HWPOISON

2023-05-04 Thread Dan Williams
Jane Chu wrote:
> When multiple processes mmap() a dax file, then at some point,
> a process issues a 'load' and consumes a hwpoison, the process
> receives a SIGBUS with si_code = BUS_MCEERR_AR and with si_lsb
> set for the poison scope. Soon after, any other process issues
> a 'load' to the poisoned page (that is unmapped from the kernel
> side by memory_failure), it receives a SIGBUS with
> si_code = BUS_ADRERR and without valid si_lsb.
> 
> This is confusing to user, and is different from page fault due
> to poison in RAM memory, also some helpful information is lost.
> 
> Channel dax backend driver's poison detection to the filesystem
> such that instead of reporting VM_FAULT_SIGBUS, it could report
> VM_FAULT_HWPOISON.

I do think it is interesting that this will start returning SIGBUS with
BUS_MCEERR_AR for stores whereas it is only signalled for loads in the
direct consumption path, but I can't think of a scenario where that
would confuse existing software.

> Change from v2:
>   Convert -EHWPOISON to -EIO to prevent EHWPOISON errno from leaking
> out to block read(2) - suggested by Matthew.

For next time these kind of changelog notes belong...

> Signed-off-by: Jane Chu 
> ---

...here after the "---".

>  drivers/nvdimm/pmem.c | 2 +-
>  fs/dax.c  | 4 ++--
>  include/linux/mm.h| 2 ++
>  3 files changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index ceea55f621cc..46e094e56159 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -260,7 +260,7 @@ __weak long __pmem_direct_access(struct pmem_device 
> *pmem, pgoff_t pgoff,
>   long actual_nr;
>  
>   if (mode != DAX_RECOVERY_WRITE)
> - return -EIO;
> + return -EHWPOISON;
>  
>   /*
>* Set the recovery stride is set to kernel page size because
> diff --git a/fs/dax.c b/fs/dax.c
> index 2ababb89918d..18f9598951f1 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1498,7 +1498,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter 
> *iomi,
>  
>   map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
>   DAX_ACCESS, &kaddr, NULL);
> - if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
> + if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
>   map_len = dax_direct_access(dax_dev, pgoff,
>   PHYS_PFN(size), DAX_RECOVERY_WRITE,
>   &kaddr, NULL);
> @@ -1506,7 +1506,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter 
> *iomi,
>   recovery = true;
>   }
>   if (map_len < 0) {
> - ret = map_len;
> + ret = (map_len == -EHWPOISON) ? -EIO : map_len;

This fixup leaves out several other places where EHWPOISON could leak as
the errno for read(2)/write(2). I think I want to see something like
this:

diff --git a/fs/dax.c b/fs/dax.c
index 2ababb89918d..ec17f9977bcb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1077,14 +1077,13 @@ int dax_writeback_mapping_range(struct address_space 
*mapping,
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
-static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
-   size_t size, void **kaddr, pfn_t *pfnp)
+static int __dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
+size_t size, void **kaddr, pfn_t *pfnp)
 {
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
-   int id, rc = 0;
long length;
+   int rc = 0;
 
-   id = dax_read_lock();
length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
   DAX_ACCESS, kaddr, pfnp);
if (length < 0) {
@@ -1113,6 +1112,36 @@ static int dax_iomap_direct_access(const struct iomap 
*iomap, loff_t pos,
return rc;
 }
 
+static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
+  size_t size, void **kaddr, pfn_t *pfnp)
+{
+
+   int id;
+
+   id = dax_read_lock();
+   rc = __dax_iomap_direct_access(iomap, pos, size, kaddr, pfnp);
+   dax_read_unlock(id);
+
+   /* don't leak a memory access error code to I/O syscalls */
+   if (rc == -EHWPOISON)
+   return -EIO;
+   return rc;
+}
+
+static int dax_fault_direct_access(const struct iomap *iomap, loff_t pos,
+  size_t size, void **kaddr, pfn_t *pfnp)
+{
+
+   int id;
+
+   id = dax_read_lock();
+   rc = __dax_iomap_direct_access(iomap, pos, size, kaddr, pfnp);
+   dax_read_unlock(id);
+
+   /* -EHWPOISON return ok */
+   return rc;
+}
+
 /**
  * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
  * by copying the data before and after the range to be written.
@@ -1682,7 +1711,7 

[PATCH v3] dax: enable dax fault handler to report VM_FAULT_HWPOISON

2023-05-04 Thread Jane Chu
When multiple processes mmap() a dax file, then at some point,
a process issues a 'load' and consumes a hwpoison, the process
receives a SIGBUS with si_code = BUS_MCEERR_AR and with si_lsb
set for the poison scope. Soon after, any other process issues
a 'load' to the poisoned page (that is unmapped from the kernel
side by memory_failure), it receives a SIGBUS with
si_code = BUS_ADRERR and without valid si_lsb.

This is confusing to user, and is different from page fault due
to poison in RAM memory, also some helpful information is lost.

Channel dax backend driver's poison detection to the filesystem
such that instead of reporting VM_FAULT_SIGBUS, it could report
VM_FAULT_HWPOISON.

Change from v2:
  Convert -EHWPOISON to -EIO to prevent EHWPOISON errno from leaking
out to block read(2) - suggested by Matthew.

Signed-off-by: Jane Chu 
---
 drivers/nvdimm/pmem.c | 2 +-
 fs/dax.c  | 4 ++--
 include/linux/mm.h| 2 ++
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index ceea55f621cc..46e094e56159 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -260,7 +260,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, 
pgoff_t pgoff,
long actual_nr;
 
if (mode != DAX_RECOVERY_WRITE)
-   return -EIO;
+   return -EHWPOISON;
 
/*
 * Set the recovery stride is set to kernel page size because
diff --git a/fs/dax.c b/fs/dax.c
index 2ababb89918d..18f9598951f1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1498,7 +1498,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter 
*iomi,
 
map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
DAX_ACCESS, &kaddr, NULL);
-   if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
+   if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
map_len = dax_direct_access(dax_dev, pgoff,
PHYS_PFN(size), DAX_RECOVERY_WRITE,
&kaddr, NULL);
@@ -1506,7 +1506,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter 
*iomi,
recovery = true;
}
if (map_len < 0) {
-   ret = map_len;
+   ret = (map_len == -EHWPOISON) ? -EIO : map_len;
break;
}
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1f79667824eb..e4c974587659 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3217,6 +3217,8 @@ static inline vm_fault_t vmf_error(int err)
 {
if (err == -ENOMEM)
return VM_FAULT_OOM;
+   else if (err == -EHWPOISON)
+   return VM_FAULT_HWPOISON;
return VM_FAULT_SIGBUS;
 }
 
-- 
2.18.4