[PATCH v4 1/1] dax: enable dax fault handler to report VM_FAULT_HWPOISON

2023-05-08 Thread Jane Chu
When multiple processes mmap() a dax file, then at some point,
a process issues a 'load' and consumes a hwpoison, the process
receives a SIGBUS with si_code = BUS_MCEERR_AR and with si_lsb
set for the poison scope. Soon after, any other process issues
a 'load' to the poisoned page (that is unmapped from the kernel
side by memory_failure), it receives a SIGBUS with
si_code = BUS_ADRERR and without valid si_lsb.

This is confusing to user, and is different from page fault due
to poison in RAM memory, also some helpful information is lost.

Channel dax backend driver's poison detection to the filesystem
such that instead of reporting VM_FAULT_SIGBUS, it could report
VM_FAULT_HWPOISON.

If user level block IO syscalls fail due to poison, the errno will
be converted to EIO to maintain block API consistency.

Signed-off-by: Jane Chu 
---
 drivers/dax/super.c  |  5 -
 drivers/nvdimm/pmem.c|  2 +-
 drivers/s390/block/dcssblk.c |  3 ++-
 fs/dax.c | 11 ++-
 fs/fuse/virtio_fs.c  |  3 ++-
 include/linux/dax.h  |  5 +
 include/linux/mm.h   |  2 ++
 7 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index c4c4728a36e4..0da9232ea175 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -203,6 +203,8 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t 
pgoff, void *addr,
 int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
size_t nr_pages)
 {
+   int ret;
+
if (!dax_alive(dax_dev))
return -ENXIO;
/*
@@ -213,7 +215,8 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t 
pgoff,
if (nr_pages != 1)
return -EIO;
 
-   return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages);
+   ret = dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages);
+   return dax_mem2blk_err(ret);
 }
 EXPORT_SYMBOL_GPL(dax_zero_page_range);
 
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index ceea55f621cc..46e094e56159 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -260,7 +260,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, 
pgoff_t pgoff,
long actual_nr;
 
if (mode != DAX_RECOVERY_WRITE)
-   return -EIO;
+   return -EHWPOISON;
 
/*
 * Set the recovery stride is set to kernel page size because
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index c09f2e053bf8..ee47ac520cd4 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -54,7 +54,8 @@ static int dcssblk_dax_zero_page_range(struct dax_device 
*dax_dev,
rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS,
, NULL);
if (rc < 0)
-   return rc;
+   return dax_mem2blk_err(rc);
+
memset(kaddr, 0, nr_pages << PAGE_SHIFT);
dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT);
return 0;
diff --git a/fs/dax.c b/fs/dax.c
index 2ababb89918d..a26eb5abfdc0 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1148,7 +1148,7 @@ static int dax_iomap_copy_around(loff_t pos, uint64_t 
length, size_t align_size,
if (!zero_edge) {
ret = dax_iomap_direct_access(srcmap, pos, size, , NULL);
if (ret)
-   return ret;
+   return dax_mem2blk_err(ret);
}
 
if (copy_all) {
@@ -1310,7 +1310,7 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
 
 out_unlock:
dax_read_unlock(id);
-   return ret;
+   return dax_mem2blk_err(ret);
 }
 
 int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
@@ -1342,7 +1342,8 @@ static int dax_memzero(struct iomap_iter *iter, loff_t 
pos, size_t size)
ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, ,
NULL);
if (ret < 0)
-   return ret;
+   return dax_mem2blk_err(ret);
+
memset(kaddr + offset, 0, size);
if (iomap->flags & IOMAP_F_SHARED)
ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
@@ -1498,7 +1499,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter 
*iomi,
 
map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
DAX_ACCESS, , NULL);
-   if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
+   if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
map_len = dax_direct_access(dax_dev, pgoff,
PHYS_PFN(size), DAX_RECOVERY_WRITE,
, NULL);
@@ -1506,7 +1507,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter 
*iomi,
recovery = true;
}
if 

[PATCH v4 0/1] dax: enable dax fault handler to report VM_FAULT_HWPOISON

2023-05-08 Thread Jane Chu
Change from v3:
Prevent leaking EHWPOISON to user level block IO calls such as
zero_range_range, and truncate.  Suggested by Dan.

Change from v2:
Convert EHWPOISON to EIO to prevent EHWPOISON errno from leaking
out to block read(2). Suggested by Matthew.

Jane Chu (1):
  dax: enable dax fault handler to report VM_FAULT_HWPOISON

 drivers/dax/super.c  |  5 -
 drivers/nvdimm/pmem.c|  2 +-
 drivers/s390/block/dcssblk.c |  3 ++-
 fs/dax.c | 11 ++-
 fs/fuse/virtio_fs.c  |  3 ++-
 include/linux/dax.h  |  5 +
 include/linux/mm.h   |  2 ++
 7 files changed, 22 insertions(+), 9 deletions(-)

-- 
2.18.4




Re: [PATCH v3] dax: enable dax fault handler to report VM_FAULT_HWPOISON

2023-05-08 Thread Jane Chu

On 5/4/2023 7:32 PM, Dan Williams wrote:

Jane Chu wrote:

When multiple processes mmap() a dax file, then at some point,
a process issues a 'load' and consumes a hwpoison, the process
receives a SIGBUS with si_code = BUS_MCEERR_AR and with si_lsb
set for the poison scope. Soon after, any other process issues
a 'load' to the poisoned page (that is unmapped from the kernel
side by memory_failure), it receives a SIGBUS with
si_code = BUS_ADRERR and without valid si_lsb.

This is confusing to user, and is different from page fault due
to poison in RAM memory, also some helpful information is lost.

Channel dax backend driver's poison detection to the filesystem
such that instead of reporting VM_FAULT_SIGBUS, it could report
VM_FAULT_HWPOISON.


I do think it is interesting that this will start returning SIGBUS with
BUS_MCEERR_AR for stores whereas it is only signalled for loads in the
direct consumption path, but I can't think of a scenario where that
would confuse existing software.


Yes indeed, nice catch, thank you!




Change from v2:
   Convert -EHWPOISON to -EIO to prevent EHWPOISON errno from leaking
out to block read(2) - suggested by Matthew.


For next time these kind of changelog notes belong...


Signed-off-by: Jane Chu 
---


...here after the "---".


I'll move the change log to a cover letter.




  drivers/nvdimm/pmem.c | 2 +-
  fs/dax.c  | 4 ++--
  include/linux/mm.h| 2 ++
  3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index ceea55f621cc..46e094e56159 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -260,7 +260,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, 
pgoff_t pgoff,
long actual_nr;
  
  		if (mode != DAX_RECOVERY_WRITE)

-   return -EIO;
+   return -EHWPOISON;
  
  		/*

 * Set the recovery stride is set to kernel page size because
diff --git a/fs/dax.c b/fs/dax.c
index 2ababb89918d..18f9598951f1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1498,7 +1498,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter 
*iomi,
  
  		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),

DAX_ACCESS, , NULL);
-   if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
+   if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
map_len = dax_direct_access(dax_dev, pgoff,
PHYS_PFN(size), DAX_RECOVERY_WRITE,
, NULL);
@@ -1506,7 +1506,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter 
*iomi,
recovery = true;
}
if (map_len < 0) {
-   ret = map_len;
+   ret = (map_len == -EHWPOISON) ? -EIO : map_len;


This fixup leaves out several other places where EHWPOISON could leak as
the errno for read(2)/write(2). I think I want to see something like
this:

diff --git a/fs/dax.c b/fs/dax.c
index 2ababb89918d..ec17f9977bcb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1077,14 +1077,13 @@ int dax_writeback_mapping_range(struct address_space 
*mapping,
  }
  EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
  
-static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,

-   size_t size, void **kaddr, pfn_t *pfnp)
+static int __dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
+size_t size, void **kaddr, pfn_t *pfnp)
  {
 pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
-   int id, rc = 0;
 long length;
+   int rc = 0;
  
-   id = dax_read_lock();

 length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
DAX_ACCESS, kaddr, pfnp);
 if (length < 0) {
@@ -1113,6 +1112,36 @@ static int dax_iomap_direct_access(const struct iomap 
*iomap, loff_t pos,
 return rc;
  }
  
+static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,

+  size_t size, void **kaddr, pfn_t *pfnp)
+{
+
+   int id;
+
+   id = dax_read_lock();
+   rc = __dax_iomap_direct_access(iomap, pos, size, kaddr, pfnp);
+   dax_read_unlock(id);
+
+   /* don't leak a memory access error code to I/O syscalls */
+   if (rc == -EHWPOISON)
+   return -EIO;
+   return rc;
+}
+
+static int dax_fault_direct_access(const struct iomap *iomap, loff_t pos,
+  size_t size, void **kaddr, pfn_t *pfnp)
+{
+
+   int id;
+
+   id = dax_read_lock();
+   rc = __dax_iomap_direct_access(iomap, pos, size, kaddr, pfnp);
+   dax_read_unlock(id);
+
+   /* -EHWPOISON return ok */
+   return rc;
+}
+
  /**
   * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
   * by copying the data before and after the