[PATCH v5 0/1] dax: enable dax fault handler to report VM_FAULT_HWPOISON

2023-06-15 Thread Jane Chu
Change from v4:
Add comments describing when and why dax_mem2blk_err() is used.
Suggested by Dan.

Change from v3:
Prevent leaking EHWPOISON to user level block IO calls such as
zero_page_range and truncate.  Suggested by Dan.

Change from v2:
Convert EHWPOISON to EIO to prevent EHWPOISON errno from leaking
out to block read(2). Suggested by Matthew.

Jane Chu (1):
  dax: enable dax fault handler to report VM_FAULT_HWPOISON

 drivers/dax/super.c  |  5 -
 drivers/nvdimm/pmem.c|  2 +-
 drivers/s390/block/dcssblk.c |  3 ++-
 fs/dax.c | 11 ++-
 fs/fuse/virtio_fs.c  |  3 ++-
 include/linux/dax.h  | 13 +
 include/linux/mm.h   |  2 ++
 7 files changed, 30 insertions(+), 9 deletions(-)

-- 
2.18.4




[PATCH v5 1/1] dax: enable dax fault handler to report VM_FAULT_HWPOISON

2023-06-15 Thread Jane Chu
When multiple processes mmap() a dax file, then at some point,
a process issues a 'load' and consumes a hwpoison, the process
receives a SIGBUS with si_code = BUS_MCEERR_AR and with si_lsb
set for the poison scope. Soon after, any other process issues
a 'load' to the poisoned page (that is unmapped from the kernel
side by memory_failure), it receives a SIGBUS with
si_code = BUS_ADRERR and without valid si_lsb.

This is confusing to the user, and is different from a page fault due
to poison in RAM memory; some helpful information is also lost.

Channel dax backend driver's poison detection to the filesystem
such that instead of reporting VM_FAULT_SIGBUS, it could report
VM_FAULT_HWPOISON.

If user level block IO syscalls fail due to poison, the errno will
be converted to EIO to maintain block API consistency.

Signed-off-by: Jane Chu 
---
 drivers/dax/super.c  |  5 -
 drivers/nvdimm/pmem.c|  2 +-
 drivers/s390/block/dcssblk.c |  3 ++-
 fs/dax.c | 11 ++-
 fs/fuse/virtio_fs.c  |  3 ++-
 include/linux/dax.h  | 13 +
 include/linux/mm.h   |  2 ++
 7 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index c4c4728a36e4..0da9232ea175 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -203,6 +203,8 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t 
pgoff, void *addr,
 int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
size_t nr_pages)
 {
+   int ret;
+
if (!dax_alive(dax_dev))
return -ENXIO;
/*
@@ -213,7 +215,8 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t 
pgoff,
if (nr_pages != 1)
return -EIO;
 
-   return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages);
+   ret = dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages);
+   return dax_mem2blk_err(ret);
 }
 EXPORT_SYMBOL_GPL(dax_zero_page_range);
 
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index ceea55f621cc..46e094e56159 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -260,7 +260,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, 
pgoff_t pgoff,
long actual_nr;
 
if (mode != DAX_RECOVERY_WRITE)
-   return -EIO;
+   return -EHWPOISON;
 
/*
 * Set the recovery stride is set to kernel page size because
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index c09f2e053bf8..ee47ac520cd4 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -54,7 +54,8 @@ static int dcssblk_dax_zero_page_range(struct dax_device 
*dax_dev,
rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS,
&kaddr, NULL);
if (rc < 0)
-   return rc;
+   return dax_mem2blk_err(rc);
+
memset(kaddr, 0, nr_pages << PAGE_SHIFT);
dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT);
return 0;
diff --git a/fs/dax.c b/fs/dax.c
index 2ababb89918d..a26eb5abfdc0 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1148,7 +1148,7 @@ static int dax_iomap_copy_around(loff_t pos, uint64_t 
length, size_t align_size,
if (!zero_edge) {
ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
if (ret)
-   return ret;
+   return dax_mem2blk_err(ret);
}
 
if (copy_all) {
@@ -1310,7 +1310,7 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
 
 out_unlock:
dax_read_unlock(id);
-   return ret;
+   return dax_mem2blk_err(ret);
 }
 
 int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
@@ -1342,7 +1342,8 @@ static int dax_memzero(struct iomap_iter *iter, loff_t 
pos, size_t size)
ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
NULL);
if (ret < 0)
-   return ret;
+   return dax_mem2blk_err(ret);
+
memset(kaddr + offset, 0, size);
if (iomap->flags & IOMAP_F_SHARED)
ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
@@ -1498,7 +1499,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter 
*iomi,
 
map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
DAX_ACCESS, &kaddr, NULL);
-   if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
+   if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
map_len = dax_direct_access(dax_dev, pgoff,
PHYS_PFN(size), DAX_RECOVERY_WRITE,
&kaddr, NULL);
@@ -1506,7 +1507,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter 
*iomi,
recovery = true;
  

[PATCH 0/3] mm: use memmap_on_memory semantics for dax/kmem

2023-06-15 Thread Vishal Verma
The dax/kmem driver can potentially hot-add large amounts of memory
originating from CXL memory expanders, or NVDIMMs, or other 'device
memories'. There is a chance there isn't enough regular system memory
available to fit the memmap for this new memory. It's therefore
desirable, if all other conditions are met, for the kmem managed memory
to place its memmap on the newly added memory itself.

Arrange for this by first allowing for a module parameter override for
the mhp_supports_memmap_on_memory() test using a flag, adjusting the
only other caller of this interface in drivers/acpi/acpi_memhotplug.c,
exporting the symbol so it can be called by kmem.c, and finally changing
the kmem driver to add_memory() in chunks of memory_block_size_bytes().

Signed-off-by: Vishal Verma 
---
Vishal Verma (3):
  mm/memory_hotplug: Allow an override for the memmap_on_memory param
  mm/memory_hotplug: Export symbol mhp_supports_memmap_on_memory()
  dax/kmem: Always enroll hotplugged memory for memmap_on_memory

 include/linux/memory_hotplug.h |  2 +-
 drivers/acpi/acpi_memhotplug.c |  2 +-
 drivers/dax/kmem.c | 49 +++---
 mm/memory_hotplug.c| 25 ++---
 4 files changed, 55 insertions(+), 23 deletions(-)
---
base-commit: f1fcbaa18b28dec10281551dfe6ed3a3ed80e3d6
change-id: 20230613-vv-kmem_memmap-5483c8d04279

Best regards,
-- 
Vishal Verma 




[PATCH 1/3] mm/memory_hotplug: Allow an override for the memmap_on_memory param

2023-06-15 Thread Vishal Verma
For memory hotplug to consider MHP_MEMMAP_ON_MEMORY behavior, the
'memmap_on_memory' module parameter was a hard requirement.

In preparation for the dax/kmem driver to use memmap_on_memory
semantics, arrange for the module parameter check to be bypassed via the
appropriate mhp_flag.

Recall that the kmem driver could contribute huge amounts of hotplugged
memory originating from special-purpose devices such as CXL memory
expanders. In some cases memmap_on_memory may be the /only/ way this new
memory can be hotplugged. Hence it makes sense for kmem to have a way to
force memmap_on_memory without depending on a module param, if all the
other conditions for it are met.

The only other user of this interface is acpi/acpi_memhotplug.c,
which only enables the mhp_flag if an initial
mhp_supports_memmap_on_memory() test passes. Maintain the existing
behavior and semantics for this by performing the initial check from
acpi without the MHP_MEMMAP_ON_MEMORY flag, so its decision falls back
to the module parameter.

Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Cc: Andrew Morton 
Cc: David Hildenbrand 
Cc: Oscar Salvador 
Cc: Dan Williams 
Cc: Dave Jiang 
Cc: Dave Hansen 
Cc: Huang Ying 
Signed-off-by: Vishal Verma 
---
 include/linux/memory_hotplug.h |  2 +-
 drivers/acpi/acpi_memhotplug.c |  2 +-
 mm/memory_hotplug.c| 24 
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 9fcbf5706595..c9ddcd3cad70 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -358,7 +358,7 @@ extern struct zone *zone_for_pfn_range(int online_type, int 
nid,
 extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
  struct mhp_params *params);
 void arch_remove_linear_mapping(u64 start, u64 size);
-extern bool mhp_supports_memmap_on_memory(unsigned long size);
+extern bool mhp_supports_memmap_on_memory(unsigned long size, mhp_t mhp_flags);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 24f662d8bd39..119d3bb49753 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -211,7 +211,7 @@ static int acpi_memory_enable_device(struct 
acpi_memory_device *mem_device)
if (!info->length)
continue;
 
-   if (mhp_supports_memmap_on_memory(info->length))
+   if (mhp_supports_memmap_on_memory(info->length, 0))
mhp_flags |= MHP_MEMMAP_ON_MEMORY;
result = __add_memory(mgid, info->start_addr, info->length,
  mhp_flags);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8e0fa209d533..bb3845830922 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1283,15 +1283,21 @@ static int online_memory_block(struct memory_block 
*mem, void *arg)
return device_online(&mem->dev);
 }
 
-bool mhp_supports_memmap_on_memory(unsigned long size)
+bool mhp_supports_memmap_on_memory(unsigned long size, mhp_t mhp_flags)
 {
unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
unsigned long remaining_size = size - vmemmap_size;
 
/*
-* Besides having arch support and the feature enabled at runtime, we
-* need a few more assumptions to hold true:
+* The MHP_MEMMAP_ON_MEMORY flag indicates a caller that wants to force
+* memmap_on_memory (if other conditions are met), regardless of the
+* module parameter. drivers/dax/kmem.c is an example, where large
+* amounts of hotplug memory may come from, and the only option to
+* successfully online all of it is to place the memmap on this memory.
+*
+* Besides having arch support and the feature enabled at runtime or
+* via the mhp_flag, we need a few more assumptions to hold true:
 *
 * a) We span a single memory block: memory onlining/offlinin;g happens
 *in memory block granularity. We don't want the vmemmap of online
@@ -1315,10 +1321,12 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
 *   altmap as an alternative source of memory, and we do not 
exactly
 *   populate a single PMD.
 */
-   return mhp_memmap_on_memory() &&
-  size == memory_block_size_bytes() &&
-  IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
-  IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+
+   if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) || mhp_memmap_on_memory())
+   return size == memory_block_size_bytes() &&
+  IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
+  IS_ALIGNED(remaining_size, (pageblock_nr_pages << 
PAGE_SHIFT));
+   ret

[PATCH 2/3] mm/memory_hotplug: Export symbol mhp_supports_memmap_on_memory()

2023-06-15 Thread Vishal Verma
In preparation for the dax/kmem driver, which can be built as a module,
to use this interface, export it with EXPORT_SYMBOL_GPL().

Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Cc: Andrew Morton 
Cc: David Hildenbrand 
Cc: Oscar Salvador 
Cc: Dan Williams 
Cc: Dave Jiang 
Cc: Dave Hansen 
Cc: Huang Ying 
Signed-off-by: Vishal Verma 
---
 mm/memory_hotplug.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index bb3845830922..92922080d3fa 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1328,6 +1328,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size, 
mhp_t mhp_flags)
   IS_ALIGNED(remaining_size, (pageblock_nr_pages << 
PAGE_SHIFT));
return false;
 }
+EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory);
 
 /*
  * NOTE: The caller must call lock_device_hotplug() to serialize hotplug

-- 
2.40.1




[PATCH 3/3] dax/kmem: Always enroll hotplugged memory for memmap_on_memory

2023-06-15 Thread Vishal Verma
With DAX memory regions originating from CXL memory expanders or
NVDIMMs, the kmem driver may be hot-adding huge amounts of system memory
on a system without enough 'regular' main memory to support the memmap
for it. To avoid this, ensure that all kmem managed hotplugged memory is
added with the MHP_MEMMAP_ON_MEMORY flag to place the memmap on the
new memory region being hot added.

To do this, call add_memory() in chunks of memory_block_size_bytes() as
that is a requirement for memmap_on_memory. Additionally, use the
mhp_flag to force the memmap_on_memory checks regardless of the
respective module parameter setting.

Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Cc: Andrew Morton 
Cc: David Hildenbrand 
Cc: Oscar Salvador 
Cc: Dan Williams 
Cc: Dave Jiang 
Cc: Dave Hansen 
Cc: Huang Ying 
Signed-off-by: Vishal Verma 
---
 drivers/dax/kmem.c | 49 -
 1 file changed, 36 insertions(+), 13 deletions(-)

diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 7b36db6f1cbd..0751346193ef 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "dax-private.h"
 #include "bus.h"
 
@@ -105,6 +106,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
data->mgid = rc;
 
for (i = 0; i < dev_dax->nr_range; i++) {
+   u64 cur_start, cur_len, remaining;
struct resource *res;
struct range range;
 
@@ -137,21 +139,42 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
res->flags = IORESOURCE_SYSTEM_RAM;
 
/*
-* Ensure that future kexec'd kernels will not treat
-* this as RAM automatically.
+* Add memory in chunks of memory_block_size_bytes() so that
+* it is considered for MHP_MEMMAP_ON_MEMORY
+* @range has already been aligned to memory_block_size_bytes(),
+* so the following loop will always break it down cleanly.
 */
-   rc = add_memory_driver_managed(data->mgid, range.start,
-   range_len(&range), kmem_name, MHP_NID_IS_MGID);
+   cur_start = range.start;
+   cur_len = memory_block_size_bytes();
+   remaining = range_len(&range);
+   while (remaining) {
+   mhp_t mhp_flags = MHP_NID_IS_MGID;
 
-   if (rc) {
-   dev_warn(dev, "mapping%d: %#llx-%#llx memory add 
failed\n",
-   i, range.start, range.end);
-   remove_resource(res);
-   kfree(res);
-   data->res[i] = NULL;
-   if (mapped)
-   continue;
-   goto err_request_mem;
+   if (mhp_supports_memmap_on_memory(cur_len,
+ MHP_MEMMAP_ON_MEMORY))
+   mhp_flags |= MHP_MEMMAP_ON_MEMORY;
+   /*
+* Ensure that future kexec'd kernels will not treat
+* this as RAM automatically.
+*/
+   rc = add_memory_driver_managed(data->mgid, cur_start,
+  cur_len, kmem_name,
+  mhp_flags);
+
+   if (rc) {
+   dev_warn(dev,
+"mapping%d: %#llx-%#llx memory add 
failed\n",
+i, cur_start, cur_start + cur_len - 1);
+   remove_resource(res);
+   kfree(res);
+   data->res[i] = NULL;
+   if (mapped)
+   continue;
+   goto err_request_mem;
+   }
+
+   cur_start += cur_len;
+   remaining -= cur_len;
}
mapped++;
}

-- 
2.40.1




Re: [PATCH 1/3] mm/memory_hotplug: Allow an override for the memmap_on_memory param

2023-06-15 Thread Huang, Ying
Hi, Vishal,

Thanks for your patch!

Vishal Verma  writes:

> For memory hotplug to consider MHP_MEMMAP_ON_MEMORY behavior, the
> 'memmap_on_memory' module parameter was a hard requirement.
>
> In preparation for the dax/kmem driver to use memmap_on_memory
> semantics, arrange for the module parameter check to be bypassed via the
> appropriate mhp_flag.
>
> Recall that the kmem driver could contribute huge amounts of hotplugged
> memory originating from special purposes devices such as CXL memory
> expanders. In some cases memmap_on_memory may be the /only/ way this new
> memory can be hotplugged. Hence it makes sense for kmem to have a way to
> force memmap_on_memory without depending on a module param, if all the
> other conditions for it are met.
>
> The only other user of this interface is acpi/acpi_memoryhotplug.c,
> which only enables the mhp_flag if an initial
> mhp_supports_memmap_on_memory() test passes. Maintain the existing
> behavior and semantics for this by performing the initial check from
> acpi without the MHP_MEMMAP_ON_MEMORY flag, so its decision falls back
> to the module parameter.
>
> Cc: "Rafael J. Wysocki" 
> Cc: Len Brown 
> Cc: Andrew Morton 
> Cc: David Hildenbrand 
> Cc: Oscar Salvador 
> Cc: Dan Williams 
> Cc: Dave Jiang 
> Cc: Dave Hansen 
> Cc: Huang Ying 
> Signed-off-by: Vishal Verma 
> ---
>  include/linux/memory_hotplug.h |  2 +-
>  drivers/acpi/acpi_memhotplug.c |  2 +-
>  mm/memory_hotplug.c| 24 
>  3 files changed, 18 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index 9fcbf5706595..c9ddcd3cad70 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -358,7 +358,7 @@ extern struct zone *zone_for_pfn_range(int online_type, 
> int nid,
>  extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
> struct mhp_params *params);
>  void arch_remove_linear_mapping(u64 start, u64 size);
> -extern bool mhp_supports_memmap_on_memory(unsigned long size);
> +extern bool mhp_supports_memmap_on_memory(unsigned long size, mhp_t 
> mhp_flags);
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  
>  #endif /* __LINUX_MEMORY_HOTPLUG_H */
> diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
> index 24f662d8bd39..119d3bb49753 100644
> --- a/drivers/acpi/acpi_memhotplug.c
> +++ b/drivers/acpi/acpi_memhotplug.c
> @@ -211,7 +211,7 @@ static int acpi_memory_enable_device(struct 
> acpi_memory_device *mem_device)
>   if (!info->length)
>   continue;
>  
> - if (mhp_supports_memmap_on_memory(info->length))
> + if (mhp_supports_memmap_on_memory(info->length, 0))
>   mhp_flags |= MHP_MEMMAP_ON_MEMORY;
>   result = __add_memory(mgid, info->start_addr, info->length,
> mhp_flags);
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 8e0fa209d533..bb3845830922 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1283,15 +1283,21 @@ static int online_memory_block(struct memory_block 
> *mem, void *arg)
>   return device_online(&mem->dev);
>  }
>  
> -bool mhp_supports_memmap_on_memory(unsigned long size)
> +bool mhp_supports_memmap_on_memory(unsigned long size, mhp_t mhp_flags)
>  {
>   unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
>   unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
>   unsigned long remaining_size = size - vmemmap_size;
>  
>   /*
> -  * Besides having arch support and the feature enabled at runtime, we
> -  * need a few more assumptions to hold true:
> +  * The MHP_MEMMAP_ON_MEMORY flag indicates a caller that wants to force
> +  * memmap_on_memory (if other conditions are met), regardless of the
> +  * module parameter. drivers/dax/kmem.c is an example, where large
> +  * amounts of hotplug memory may come from, and the only option to
> +  * successfully online all of it is to place the memmap on this memory.
> +  *
> +  * Besides having arch support and the feature enabled at runtime or
> +  * via the mhp_flag, we need a few more assumptions to hold true:
>*
>* a) We span a single memory block: memory onlining/offlinin;g happens
>*in memory block granularity. We don't want the vmemmap of online
> @@ -1315,10 +1321,12 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
>*   altmap as an alternative source of memory, and we do not 
> exactly
>*   populate a single PMD.
>*/
> - return mhp_memmap_on_memory() &&
> -size == memory_block_size_bytes() &&
> -IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
> -IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
> +
> + if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) || mhp_memmap_on_memory())
> + re

Re: [PATCH 3/3] dax/kmem: Always enroll hotplugged memory for memmap_on_memory

2023-06-15 Thread Huang, Ying
Vishal Verma  writes:

> With DAX memory regions originating from CXL memory expanders or
> NVDIMMs, the kmem driver may be hot-adding huge amounts of system memory
> on a system without enough 'regular' main memory to support the memmap
> for it. To avoid this, ensure that all kmem managed hotplugged memory is
> added with the MHP_MEMMAP_ON_MEMORY flag to place the memmap on the
> new memory region being hot added.
>
> To do this, call add_memory() in chunks of memory_block_size_bytes() as
> that is a requirement for memmap_on_memory. Additionally, Use the
> mhp_flag to force the memmap_on_memory checks regardless of the
> respective module parameter setting.
>
> Cc: "Rafael J. Wysocki" 
> Cc: Len Brown 
> Cc: Andrew Morton 
> Cc: David Hildenbrand 
> Cc: Oscar Salvador 
> Cc: Dan Williams 
> Cc: Dave Jiang 
> Cc: Dave Hansen 
> Cc: Huang Ying 
> Signed-off-by: Vishal Verma 
> ---
>  drivers/dax/kmem.c | 49 -
>  1 file changed, 36 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
> index 7b36db6f1cbd..0751346193ef 100644
> --- a/drivers/dax/kmem.c
> +++ b/drivers/dax/kmem.c
> @@ -12,6 +12,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include "dax-private.h"
>  #include "bus.h"
>  
> @@ -105,6 +106,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>   data->mgid = rc;
>  
>   for (i = 0; i < dev_dax->nr_range; i++) {
> + u64 cur_start, cur_len, remaining;
>   struct resource *res;
>   struct range range;
>  
> @@ -137,21 +139,42 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>   res->flags = IORESOURCE_SYSTEM_RAM;
>  
>   /*
> -  * Ensure that future kexec'd kernels will not treat
> -  * this as RAM automatically.
> +  * Add memory in chunks of memory_block_size_bytes() so that
> +  * it is considered for MHP_MEMMAP_ON_MEMORY
> +  * @range has already been aligned to memory_block_size_bytes(),
> +  * so the following loop will always break it down cleanly.
>*/
> - rc = add_memory_driver_managed(data->mgid, range.start,
> - range_len(&range), kmem_name, MHP_NID_IS_MGID);
> + cur_start = range.start;
> + cur_len = memory_block_size_bytes();
> + remaining = range_len(&range);
> + while (remaining) {
> + mhp_t mhp_flags = MHP_NID_IS_MGID;
>  
> - if (rc) {
> - dev_warn(dev, "mapping%d: %#llx-%#llx memory add 
> failed\n",
> - i, range.start, range.end);
> - remove_resource(res);
> - kfree(res);
> - data->res[i] = NULL;
> - if (mapped)
> - continue;
> - goto err_request_mem;
> + if (mhp_supports_memmap_on_memory(cur_len,
> +   MHP_MEMMAP_ON_MEMORY))
> + mhp_flags |= MHP_MEMMAP_ON_MEMORY;
> + /*
> +  * Ensure that future kexec'd kernels will not treat
> +  * this as RAM automatically.
> +  */
> + rc = add_memory_driver_managed(data->mgid, cur_start,
> +cur_len, kmem_name,
> +mhp_flags);
> +
> + if (rc) {
> + dev_warn(dev,
> +  "mapping%d: %#llx-%#llx memory add 
> failed\n",
> +  i, cur_start, cur_start + cur_len - 1);
> + remove_resource(res);
> + kfree(res);
> + data->res[i] = NULL;
> + if (mapped)
> + continue;
> + goto err_request_mem;
> + }
> +
> + cur_start += cur_len;
> + remaining -= cur_len;
>   }
>   mapped++;
>   }

It appears that we need to hot-remove memory in the granularity of
memory_block_size_bytes() too, according to try_remove_memory().  If so,
it seems better to allocate one dax_kmem_data.res[] element for each
memory block instead of dax region?

Best Regards,
Huang, Ying