Add a sample CXL type-3 driver that registers device memory as
private-node NUMA memory reachable only via explicit mempolicy
(set_mempolicy / mbind).
Probe flow:
1. Call cxl_pci_type3_probe_init() for standard CXL device setup
2. Look for pre-committed RAM regions; if none exist, create one
using cxl_get_hpa_freespace() + cxl_request_dpa() +
cxl_create_region()
3. Convert the region to sysram via devm_cxl_add_sysram() with
private=true and MMOP_ONLINE_MOVABLE
4. Register node_private_ops with NP_OPS_MIGRATION | NP_OPS_MEMPOLICY
so the node is excluded from default allocations
The migrate_to callback uses alloc_migration_target() with
__GFP_THISNODE | __GFP_PRIVATE to keep pages on the target node.
Move struct migration_target_control from mm/internal.h to
include/linux/migrate.h so the driver can use alloc_migration_target()
without depending on mm-internal headers.
Usage:
echo $PCI_DEV > /sys/bus/pci/drivers/cxl_pci/unbind
echo $PCI_DEV > /sys/bus/pci/drivers/cxl_mempolicy/bind
Signed-off-by: Gregory Price <[email protected]>
---
drivers/cxl/Kconfig | 2 +
drivers/cxl/Makefile | 2 +
drivers/cxl/type3_drivers/Kconfig | 2 +
drivers/cxl/type3_drivers/Makefile | 2 +
.../cxl/type3_drivers/cxl_mempolicy/Kconfig | 16 +
.../cxl/type3_drivers/cxl_mempolicy/Makefile | 4 +
.../type3_drivers/cxl_mempolicy/mempolicy.c | 297 ++++++++++++++++++
include/linux/migrate.h | 7 +-
mm/internal.h | 7 -
9 files changed, 331 insertions(+), 8 deletions(-)
create mode 100644 drivers/cxl/type3_drivers/Kconfig
create mode 100644 drivers/cxl/type3_drivers/Makefile
create mode 100644 drivers/cxl/type3_drivers/cxl_mempolicy/Kconfig
create mode 100644 drivers/cxl/type3_drivers/cxl_mempolicy/Makefile
create mode 100644 drivers/cxl/type3_drivers/cxl_mempolicy/mempolicy.c
diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
index f99aa7274d12..1648cdeaa0c9 100644
--- a/drivers/cxl/Kconfig
+++ b/drivers/cxl/Kconfig
@@ -278,4 +278,6 @@ config CXL_ATL
depends on CXL_REGION
depends on ACPI_PRMT && AMD_NB
+source "drivers/cxl/type3_drivers/Kconfig"
+
endif
diff --git a/drivers/cxl/Makefile b/drivers/cxl/Makefile
index 2caa90fa4bf2..94d2b2233bf8 100644
--- a/drivers/cxl/Makefile
+++ b/drivers/cxl/Makefile
@@ -19,3 +19,5 @@ cxl_acpi-y := acpi.o
cxl_pmem-y := pmem.o security.o
cxl_mem-y := mem.o
cxl_pci-y := pci.o
+
+obj-y += type3_drivers/
diff --git a/drivers/cxl/type3_drivers/Kconfig b/drivers/cxl/type3_drivers/Kconfig
new file mode 100644
index 000000000000..369b21763856
--- /dev/null
+++ b/drivers/cxl/type3_drivers/Kconfig
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+source "drivers/cxl/type3_drivers/cxl_mempolicy/Kconfig"
diff --git a/drivers/cxl/type3_drivers/Makefile b/drivers/cxl/type3_drivers/Makefile
new file mode 100644
index 000000000000..2b82265ff118
--- /dev/null
+++ b/drivers/cxl/type3_drivers/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_CXL_MEMPOLICY) += cxl_mempolicy/
diff --git a/drivers/cxl/type3_drivers/cxl_mempolicy/Kconfig b/drivers/cxl/type3_drivers/cxl_mempolicy/Kconfig
new file mode 100644
index 000000000000..3c45da237b9f
--- /dev/null
+++ b/drivers/cxl/type3_drivers/cxl_mempolicy/Kconfig
@@ -0,0 +1,16 @@
+config CXL_MEMPOLICY
+ tristate "CXL Private Memory with Mempolicy Support"
+ depends on CXL_PCI
+ depends on CXL_REGION
+ depends on NUMA
+ depends on MIGRATION
+ help
+ Minimal driver for CXL memory devices that registers memory as
+ N_MEMORY_PRIVATE with mempolicy support. The memory is isolated
+ from default allocations and can only be reached via explicit
+ mempolicy (set_mempolicy or mbind).
+
+ No compression, no PTE controls, the memory behaves like normal
+ DRAM but is excluded from fallback allocations.
+
+ If unsure say 'n'.
diff --git a/drivers/cxl/type3_drivers/cxl_mempolicy/Makefile b/drivers/cxl/type3_drivers/cxl_mempolicy/Makefile
new file mode 100644
index 000000000000..dfb58fc88ad9
--- /dev/null
+++ b/drivers/cxl/type3_drivers/cxl_mempolicy/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_CXL_MEMPOLICY) += cxl_mempolicy.o
+cxl_mempolicy-y := mempolicy.o
+ccflags-y += -I$(srctree)/drivers/cxl
diff --git a/drivers/cxl/type3_drivers/cxl_mempolicy/mempolicy.c b/drivers/cxl/type3_drivers/cxl_mempolicy/mempolicy.c
new file mode 100644
index 000000000000..1c19818eb268
--- /dev/null
+++ b/drivers/cxl/type3_drivers/cxl_mempolicy/mempolicy.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2026 Meta Platforms, Inc. All rights reserved. */
+/*
+ * CXL Mempolicy Driver
+ *
+ * Minimal driver for CXL memory devices that registers memory as
+ * N_MEMORY_PRIVATE with mempolicy support but no PTE controls. The
+ * memory behaves like normal DRAM but is isolated from default allocations,
+ * it can only be reached via explicit mempolicy (set_mempolicy/mbind).
+ *
+ * Usage:
+ * 1. Unbind device from cxl_pci:
+ * echo $PCI_DEV > /sys/bus/pci/drivers/cxl_pci/unbind
+ * 2. Bind to cxl_mempolicy:
+ * echo $PCI_DEV > /sys/bus/pci/drivers/cxl_mempolicy/bind
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/xarray.h>
+#include <linux/node_private.h>
+#include <linux/migrate.h>
+#include <cxl/mailbox.h>
+#include "cxlmem.h"
+#include "cxl.h"
+
+/* Per-device driver state, looked up via ctx_xa keyed by the pci_dev. */
+struct cxl_mempolicy_ctx {
+ struct cxl_region *cxlr; /* region we created (Phase 2 only), else NULL */
+ struct cxl_endpoint_decoder *cxled; /* DPA allocation backing cxlr, else NULL */
+ int nid; /* private NUMA node id, or NUMA_NO_NODE */
+};
+
+/* Maps (unsigned long)pci_dev -> struct cxl_mempolicy_ctx */
+static DEFINE_XARRAY(ctx_xa);
+
+/* Look up the driver context for the PCI device backing @cxlmd. */
+static struct cxl_mempolicy_ctx *memdev_to_ctx(struct cxl_memdev *cxlmd)
+{
+ struct device *parent = cxlmd->dev.parent;
+
+ return xa_load(&ctx_xa, (unsigned long)to_pci_dev(parent));
+}
+
+/*
+ * node_private_ops .migrate_to callback: migrate @folios onto node @nid.
+ * __GFP_THISNODE pins allocations to the target node (no fallback) and
+ * __GFP_PRIVATE allows allocating from the private node, which default
+ * allocations would otherwise skip.
+ */
+static int cxl_mempolicy_migrate_to(struct list_head *folios, int nid,
+ enum migrate_mode mode,
+ enum migrate_reason reason,
+ unsigned int *nr_succeeded)
+{
+ struct migration_target_control mtc = {
+ .nid = nid,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE |
+ __GFP_PRIVATE,
+ .reason = reason,
+ };
+
+ return migrate_pages(folios, alloc_migration_target, NULL,
+ (unsigned long)&mtc, mode, reason, nr_succeeded);
+}
+
+/*
+ * No per-folio bookkeeping is needed when a folio moves on or off this
+ * node -- the memory behaves like plain DRAM.  NOTE(review): presumably
+ * the node_private_ops contract requires a non-NULL .folio_migrate;
+ * confirm a NULL callback is not accepted before keeping this stub.
+ */
+static void cxl_mempolicy_folio_migrate(struct folio *src, struct folio *dst)
+{
+}
+
+/* Mempolicy-reachable, migration-capable private node; no PTE controls. */
+static const struct node_private_ops cxl_mempolicy_ops = {
+ .migrate_to = cxl_mempolicy_migrate_to,
+ .folio_migrate = cxl_mempolicy_folio_migrate,
+ .flags = NP_OPS_MIGRATION | NP_OPS_MEMPOLICY,
+};
+
+/*
+ * Create a single RAM region spanning all usable RAM capacity of @cxlmd
+ * (rounded down to 256M granularity).  On success the endpoint decoder is
+ * recorded in the driver context so cxl_mempolicy_remove() can release the
+ * DPA allocation; the caller owns the returned region reference.
+ *
+ * Returns the new region, or ERR_PTR(-ENODEV) when the device has no RAM
+ * capacity, ERR_PTR(-ENOSPC) when the capacity is below 256M, or the
+ * error from the failing CXL core call.
+ */
+static struct cxl_region *create_ram_region(struct cxl_memdev *cxlmd)
+{
+ struct cxl_mempolicy_ctx *ctx = memdev_to_ctx(cxlmd);
+ struct cxl_root_decoder *cxlrd;
+ struct cxl_endpoint_decoder *cxled;
+ struct cxl_region *cxlr;
+ resource_size_t ram_size, avail;
+
+ ram_size = cxl_ram_size(cxlmd->cxlds);
+ if (ram_size == 0) {
+ dev_info(&cxlmd->dev, "no RAM capacity available\n");
+ return ERR_PTR(-ENODEV);
+ }
+
+ /* Work in 256M granularity; drop any unaligned tail. */
+ ram_size = ALIGN_DOWN(ram_size, SZ_256M);
+ if (ram_size == 0) {
+ dev_info(&cxlmd->dev,
+ "RAM capacity too small (< 256M)\n");
+ return ERR_PTR(-ENOSPC);
+ }
+
+ /*
+ * resource_size_t is not guaranteed to be 64-bit on every config;
+ * cast explicitly so the format specifier always matches.
+ */
+ dev_info(&cxlmd->dev, "creating RAM region for %llu MB\n",
+ (unsigned long long)(ram_size >> 20));
+
+ cxlrd = cxl_get_hpa_freespace(cxlmd, ram_size, &avail);
+ if (IS_ERR(cxlrd)) {
+ dev_err(&cxlmd->dev, "no HPA freespace: %ld\n",
+ PTR_ERR(cxlrd));
+ return ERR_CAST(cxlrd);
+ }
+
+ cxled = cxl_request_dpa(cxlmd, CXL_PARTMODE_RAM, ram_size);
+ if (IS_ERR(cxled)) {
+ dev_err(&cxlmd->dev, "failed to request DPA: %ld\n",
+ PTR_ERR(cxled));
+ cxl_put_root_decoder(cxlrd);
+ return ERR_CAST(cxled);
+ }
+
+ /* The root decoder reference is only needed for region creation. */
+ cxlr = cxl_create_region(cxlrd, &cxled, 1);
+ cxl_put_root_decoder(cxlrd);
+ if (IS_ERR(cxlr)) {
+ dev_err(&cxlmd->dev, "failed to create region: %ld\n",
+ PTR_ERR(cxlr));
+ cxl_dpa_free(cxled);
+ return cxlr;
+ }
+
+ /* Record the decoder so remove() can free the DPA we allocated. */
+ ctx->cxled = cxled;
+ dev_info(&cxlmd->dev, "created region %s\n",
+ dev_name(cxl_region_dev(cxlr)));
+ return cxlr;
+}
+
+/*
+ * Convert @cxlr into private system RAM and register mempolicy-only
+ * node_private_ops on its NUMA node.  On success ctx->nid records the
+ * node so remove() can clear the ops; on failure ctx->nid stays/returns
+ * to NUMA_NO_NODE.
+ */
+static int setup_private_node(struct cxl_memdev *cxlmd,
+ struct cxl_region *cxlr)
+{
+ struct cxl_mempolicy_ctx *ctx = memdev_to_ctx(cxlmd);
+ struct range hpa_range;
+ int rc;
+
+ /* Detach the current region driver before repurposing it as sysram. */
+ device_release_driver(cxl_region_dev(cxlr));
+
+ rc = devm_cxl_add_sysram(cxlr, true, MMOP_ONLINE_MOVABLE);
+ if (rc) {
+ dev_err(cxl_region_dev(cxlr),
+ "failed to add sysram: %d\n", rc);
+ /* Restore the original region driver on failure. */
+ if (device_attach(cxl_region_dev(cxlr)) < 0)
+ dev_warn(cxl_region_dev(cxlr),
+ "failed to re-attach driver\n");
+ return rc;
+ }
+
+ /*
+ * NOTE(review): from here on the sysram stays registered on error
+ * paths -- presumably devm teardown handles it; verify.
+ */
+ rc = cxl_get_region_range(cxlr, &hpa_range);
+ if (rc) {
+ dev_err(cxl_region_dev(cxlr),
+ "failed to get region range: %d\n", rc);
+ return rc;
+ }
+
+ /* Resolve the NUMA node backing the region's HPA base. */
+ ctx->nid = phys_to_target_node(hpa_range.start);
+ if (ctx->nid == NUMA_NO_NODE)
+ ctx->nid = memory_add_physaddr_to_nid(hpa_range.start);
+
+ rc = node_private_set_ops(ctx->nid, &cxl_mempolicy_ops);
+ if (rc) {
+ dev_err(cxl_region_dev(cxlr),
+ "failed to set ops on node %d: %d\n", ctx->nid, rc);
+ /* Reset so remove() does not try to clear ops we never set. */
+ ctx->nid = NUMA_NO_NODE;
+ return rc;
+ }
+
+ dev_info(&cxlmd->dev,
+ "node %d registered as private mempolicy memory\n", ctx->nid);
+ return 0;
+}
+
+/*
+ * Memdev attach callback: prefer an existing committed RAM region; if none
+ * can be set up, create a new region spanning the device's RAM capacity.
+ * Returns 0 when a private node was registered or the device simply has no
+ * RAM capacity; otherwise a negative errno.
+ */
+static int cxl_mempolicy_attach_probe(struct cxl_memdev *cxlmd)
+{
+ struct cxl_region *regions[8];
+ struct cxl_region *cxlr;
+ int nr, i;
+ int rc;
+
+ dev_info(&cxlmd->dev,
+ "cxl_mempolicy attach: looking for regions\n");
+
+ /* Phase 1: look for pre-committed RAM regions */
+ nr = cxl_get_committed_regions(cxlmd, regions, ARRAY_SIZE(regions));
+ for (i = 0; i < nr; i++) {
+ if (cxl_region_mode(regions[i]) != CXL_PARTMODE_RAM) {
+ put_device(cxl_region_dev(regions[i]));
+ continue;
+ }
+
+ cxlr = regions[i];
+ rc = setup_private_node(cxlmd, cxlr);
+ put_device(cxl_region_dev(cxlr));
+ if (rc == 0) {
+ /* Release remaining region references */
+ /* (reuses i to walk the untouched tail of regions[]) */
+ for (i++; i < nr; i++)
+ put_device(cxl_region_dev(regions[i]));
+ return 0;
+ }
+ /*
+ * NOTE(review): if every committed region fails setup we fall
+ * through and create an additional region below -- confirm
+ * that is intended rather than returning the error.
+ */
+ }
+
+ /* Phase 2: no committed regions, create one */
+ dev_info(&cxlmd->dev,
+ "no existing regions, creating RAM region\n");
+
+ cxlr = create_ram_region(cxlmd);
+ if (IS_ERR(cxlr)) {
+ rc = PTR_ERR(cxlr);
+ if (rc == -ENODEV) {
+ /* No RAM capacity is not a probe failure. */
+ dev_info(&cxlmd->dev,
+ "no RAM capacity: %d\n", rc);
+ return 0;
+ }
+ return rc;
+ }
+
+ rc = setup_private_node(cxlmd, cxlr);
+ if (rc) {
+ dev_err(&cxlmd->dev,
+ "failed to setup private node: %d\n", rc);
+ return rc;
+ }
+
+ /* Only take ownership of regions we created (Phase 2) */
+ memdev_to_ctx(cxlmd)->cxlr = cxlr;
+
+ return 0;
+}
+
+/* Attach ops handed to cxl_pci_type3_probe_init() during PCI probe. */
+static const struct cxl_memdev_attach cxl_mempolicy_attach = {
+ .probe = cxl_mempolicy_attach_probe,
+};
+
+/*
+ * PCI probe: allocate per-device context, then run the standard CXL
+ * type-3 setup, which invokes cxl_mempolicy_attach_probe() above.
+ */
+static int cxl_mempolicy_probe(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ struct cxl_mempolicy_ctx *ctx;
+ struct cxl_memdev *cxlmd;
+ int rc;
+
+ dev_info(&pdev->dev, "cxl_mempolicy: probing device\n");
+
+ ctx = devm_kzalloc(&pdev->dev, sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->nid = NUMA_NO_NODE;
+
+ /*
+ * The ctx must be in ctx_xa before probe_init runs: the attach
+ * callback looks it up via memdev_to_ctx().
+ */
+ rc = xa_insert(&ctx_xa, (unsigned long)pdev, ctx, GFP_KERNEL);
+ if (rc)
+ return rc;
+
+ cxlmd = cxl_pci_type3_probe_init(pdev, &cxl_mempolicy_attach);
+ if (IS_ERR(cxlmd)) {
+ xa_erase(&ctx_xa, (unsigned long)pdev);
+ return PTR_ERR(cxlmd);
+ }
+
+ dev_info(&pdev->dev, "cxl_mempolicy: probe complete\n");
+ return 0;
+}
+
+/*
+ * PCI remove: unwind in reverse order of setup -- clear the private-node
+ * ops first, then tear down only the resources this driver created in
+ * Phase 2 (ctx->cxlr/cxled are NULL for pre-committed regions).
+ */
+static void cxl_mempolicy_remove(struct pci_dev *pdev)
+{
+ struct cxl_mempolicy_ctx *ctx = xa_erase(&ctx_xa, (unsigned long)pdev);
+
+ dev_info(&pdev->dev, "cxl_mempolicy: removing device\n");
+
+ if (!ctx)
+ return;
+
+ /* WARN rather than fail: remove cannot be aborted. */
+ if (ctx->nid != NUMA_NO_NODE)
+ WARN_ON(node_private_clear_ops(ctx->nid, &cxl_mempolicy_ops));
+
+ if (ctx->cxlr) {
+ cxl_destroy_region(ctx->cxlr);
+ ctx->cxlr = NULL;
+ }
+
+ /* Free the DPA allocation made in create_ram_region(). */
+ if (ctx->cxled) {
+ cxl_dpa_free(ctx->cxled);
+ ctx->cxled = NULL;
+ }
+}
+
+static const struct pci_device_id cxl_mempolicy_pci_tbl[] = {
+ /* 8086:0d93 -- presumably the QEMU-emulated CXL type-3 device; confirm */
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0d93) },
+ { },
+};
+MODULE_DEVICE_TABLE(pci, cxl_mempolicy_pci_tbl);
+
+static struct pci_driver cxl_mempolicy_driver = {
+ .name = KBUILD_MODNAME,
+ .id_table = cxl_mempolicy_pci_tbl,
+ .probe = cxl_mempolicy_probe,
+ .remove = cxl_mempolicy_remove,
+ .driver = {
+ .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+ },
+};
+
+module_pci_driver(cxl_mempolicy_driver);
+
+MODULE_DESCRIPTION("CXL: Private Memory with Mempolicy Support");
+MODULE_LICENSE("GPL v2");
+MODULE_IMPORT_NS("CXL");
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 7b2da3875ff2..1f9fb61f3932 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -10,7 +10,12 @@
typedef struct folio *new_folio_t(struct folio *folio, unsigned long private);
typedef void free_folio_t(struct folio *folio, unsigned long private);
-struct migration_target_control;
+struct migration_target_control {
+ int nid; /* preferred node id */
+ nodemask_t *nmask; /* optional nodemask restricting target nodes */
+ gfp_t gfp_mask; /* gfp flags for the target allocation */
+ enum migrate_reason reason; /* why the migration is happening */
+};
/**
* struct movable_operations - Driver page migration
diff --git a/mm/internal.h b/mm/internal.h
index 64467ca774f1..85cd11189854 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1352,13 +1352,6 @@ extern const struct trace_print_flags gfpflag_names[];
void setup_zone_pageset(struct zone *zone);
-struct migration_target_control {
- int nid; /* preferred node id */
- nodemask_t *nmask;
- gfp_t gfp_mask;
- enum migrate_reason reason;
-};
-
/*
* mm/filemap.c
*/
--
2.53.0