Re: [PATCH v12 00/12] Adding security support for nvdimm
On Mon, 2018-10-08 at 16:55 -0700, Dave Jiang wrote: > The following series implements security support for nvdimm. Mostly > adding > new security DSM support from the Intel NVDIMM DSM spec v1.7, but > also > adding generic support libnvdimm for other vendors. The most > important > security features are unlocking locked nvdimms, and updating/setting > security > passphrase to nvdimms. > > v12: > - Add a mutex for the cached key and remove key_get/key_put messiness > (Dan) > - Move security code to its own C file and wrap under > CONFIG_NVDIMM_SECURITY > in order to fix issue reported by 0-day build without CONFIG_KEYS. Going over this a bit more closely, here are some proposed cleanups / fixes: * remove nvdimm_get_key() and just rely on nvdimm->key being not NULL under the key_mutex * open code nvdimm_check_key_len() checks * make all the security op function signatures take an nvdimm rather than a dev * move the release of nvdimm->key to nvdimm_remove() rather than nvdimm_release(), i.e. driver unload vs nvdimm_bus de-registration. * rename nvdimm_replace_key to make_kernel_key * clean up nvdimm_security_change_key() to be a bit more readable and fix a missing key_put() Let me know if these look acceptable and I can fold them into the patch set.
diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c index b6381ddbd6c1..429c544e7ac5 100644 --- a/drivers/nvdimm/dimm.c +++ b/drivers/nvdimm/dimm.c @@ -23,6 +23,7 @@ static int nvdimm_probe(struct device *dev) { + struct nvdimm *nvdimm = to_nvdimm(dev); struct nvdimm_drvdata *ndd; int rc; @@ -51,10 +52,10 @@ static int nvdimm_probe(struct device *dev) get_device(dev); kref_init(>kref); - nvdimm_security_get_state(dev); + nvdimm_security_get_state(nvdimm); /* unlock DIMM here before touch label */ - rc = nvdimm_security_unlock_dimm(dev); + rc = nvdimm_security_unlock_dimm(nvdimm); if (rc < 0) dev_warn(dev, "failed to unlock dimm %s\n", dev_name(dev)); @@ -115,6 +116,9 @@ static int nvdimm_probe(struct device *dev) static int nvdimm_remove(struct device *dev) { struct nvdimm_drvdata *ndd = dev_get_drvdata(dev); + struct nvdimm *nvdimm = to_nvdimm(dev); + + nvdimm_security_release(nvdimm); if (!ndd) return 0; diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index 2d9915378bbc..84bec3bb025e 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -213,13 +213,7 @@ void nvdimm_clear_locked(struct device *dev) static void nvdimm_release(struct device *dev) { struct nvdimm *nvdimm = to_nvdimm(dev); - struct key *key; - mutex_lock(>key_mutex); - key = nvdimm_get_key(dev); - if (key) - key_put(key); - mutex_unlock(>key_mutex); ida_simple_remove(_ida, nvdimm->id); kfree(nvdimm); } @@ -436,20 +430,20 @@ static ssize_t security_store(struct device *dev, if (rc != 3) return -EINVAL; dev_dbg(dev, "update %#x %#x\n", old_key, new_key); - rc = nvdimm_security_change_key(dev, old_key, new_key); + rc = nvdimm_security_change_key(nvdimm, old_key, new_key); } else if (sysfs_streq(cmd, "disable")) { if (rc != 2) return -EINVAL; dev_dbg(dev, "disable %#x\n", old_key); - rc = nvdimm_security_disable(dev, old_key); + rc = nvdimm_security_disable(nvdimm, old_key); } else if (sysfs_streq(buf, "freeze")) { dev_dbg(dev, "freeze\n"); - rc = 
nvdimm_security_freeze_lock(dev); + rc = nvdimm_security_freeze_lock(nvdimm); } else if (sysfs_streq(cmd, "erase")) { if (rc != 2) return -EINVAL; dev_dbg(dev, "erase %#x\n", old_key); - rc = nvdimm_security_erase(dev, old_key); + rc = nvdimm_security_erase(nvdimm, old_key); } else return -EINVAL; diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 08e442632a2d..45fcaf45ba2c 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -425,48 +425,47 @@ const u8 *nd_dev_to_uuid(struct device *dev); bool pmem_should_map_pages(struct device *dev); #ifdef CONFIG_NVDIMM_SECURITY -struct key *nvdimm_get_key(struct device *dev); -int nvdimm_security_unlock_dimm(struct device *dev); -int nvdimm_security_get_state(struct device *dev); -int nvdimm_security_change_key(struct device *dev, unsigned int old_keyid, +int nvdimm_security_unlock_dimm(struct nvdimm *nvdimm); +void nvdimm_security_release(struct nvdimm *nvdimm); +int nvdimm_security_get_state(struct nvdimm *nvdimm); +int nvdimm_security_change_key(struct nvdimm *nvdimm, unsigned int old_keyid, unsigned int new_keyid); -int nvdimm_security_disable(struct
Re: [PATCH v5 4/4] mm: Defer ZONE_DEVICE page initialization to the point where we init pgmap
On Tue, Oct 9, 2018 at 1:34 PM Alexander Duyck wrote: > > On 10/9/2018 11:04 AM, Dan Williams wrote: > > On Tue, Oct 9, 2018 at 3:21 AM Yi Zhang wrote: [..] > > That comment is incorrect, device-pages are never onlined. So I think > > we can just skip that call to __SetPageReserved() unless the memory > > range is MEMORY_DEVICE_{PRIVATE,PUBLIC}. > > > > When pages are "onlined" via __free_pages_boot_core they clear the > reserved bit, that is the reason for the comment. The reserved bit is > meant to indicate that the page cannot be swapped out or moved based on > the description of the bit. ...but ZONE_DEVICE pages are never onlined so I would expect memmap_init_zone_device() to know that detail. > I would think with that being the case we still probably need the call > to __SetPageReserved to set the bit with the expectation that it will > not be cleared for device-pages since the pages are not onlined. > Removing the call to __SetPageReserved would probably introduce a number > of regressions as there are multiple spots that use the reserved bit to > determine if a page can be swapped out to disk, mapped as system memory, > or migrated. Right, this is what Yi is working on... the PageReserved flag is problematic for KVM. Auditing those locations it seems as long as we teach hibernation to avoid ZONE_DEVICE ranges we can safely not set the reserved flag for DAX pages. What I'm trying to avoid is a local KVM hack to check for DAX pages when the Reserved flag is not otherwise needed. ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: [PATCH v5 4/4] mm: Defer ZONE_DEVICE page initialization to the point where we init pgmap
On 10/9/2018 11:04 AM, Dan Williams wrote: On Tue, Oct 9, 2018 at 3:21 AM Yi Zhang wrote: On 2018-09-25 at 13:21:24 -0700, Alexander Duyck wrote: The ZONE_DEVICE pages were being initialized in two locations. One was with the memory_hotplug lock held and another was outside of that lock. The problem with this is that it was nearly doubling the memory initialization time. Instead of doing this twice, once while holding a global lock and once without, I am opting to defer the initialization to the one outside of the lock. This allows us to avoid serializing the overhead for memory init and we can instead focus on per-node init times. One issue I encountered is that devm_memremap_pages and hmm_devmmem_pages_create were initializing only the pgmap field the same way. One wasn't initializing hmm_data, and the other was initializing it to a poison value. Since this is something that is exposed to the driver in the case of hmm I am opting for a third option and just initializing hmm_data to 0 since this is going to be exposed to unknown third party drivers. Reviewed-by: Pavel Tatashin Signed-off-by: Alexander Duyck --- v4: Moved moved memmap_init_zone_device to below memmmap_init_zone to avoid merge conflicts with other changes in the kernel. 
v5: No change include/linux/mm.h |2 + kernel/memremap.c | 24 +- mm/hmm.c | 12 --- mm/page_alloc.c| 92 ++-- 4 files changed, 107 insertions(+), 23 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 06d7d7576f8d..7312fb78ef31 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -848,6 +848,8 @@ static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } +extern void memmap_init_zone_device(struct zone *, unsigned long, + unsigned long, struct dev_pagemap *); #else static inline bool is_zone_device_page(const struct page *page) { diff --git a/kernel/memremap.c b/kernel/memremap.c index 5b8600d39931..d0c32e473f82 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -175,10 +175,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) struct vmem_altmap *altmap = pgmap->altmap_valid ? >altmap : NULL; struct resource *res = >res; - unsigned long pfn, pgoff, order; + struct dev_pagemap *conflict_pgmap; pgprot_t pgprot = PAGE_KERNEL; + unsigned long pgoff, order; int error, nid, is_ram; - struct dev_pagemap *conflict_pgmap; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -256,19 +256,13 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) if (error) goto err_add_memory; - for_each_device_pfn(pfn, pgmap) { - struct page *page = pfn_to_page(pfn); - - /* - * ZONE_DEVICE pages union ->lru with a ->pgmap back - * pointer. It is a bug if a ZONE_DEVICE page is ever - * freed or placed on a driver-private list. Seed the - * storage with LIST_POISON* values. - */ - list_del(>lru); - page->pgmap = pgmap; - percpu_ref_get(pgmap->ref); - } + /* + * Initialization of the pages has been deferred until now in order + * to allow us to do the work while not holding the hotplug lock. 
+ */ + memmap_init_zone_device(_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, pgmap); devm_add_action(dev, devm_memremap_pages_release, pgmap); diff --git a/mm/hmm.c b/mm/hmm.c index c968e49f7a0c..774d684fa2b4 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1024,7 +1024,6 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) resource_size_t key, align_start, align_size, align_end; struct device *device = devmem->device; int ret, nid, is_ram; - unsigned long pfn; align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); align_size = ALIGN(devmem->resource->start + @@ -1109,11 +1108,14 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) align_size >> PAGE_SHIFT, NULL); mem_hotplug_done(); - for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { - struct page *page = pfn_to_page(pfn); + /* + * Initialization of the pages has been deferred until now in order + * to allow us to do the work while not holding the hotplug lock. + */ + memmap_init_zone_device(_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, >pagemap); - page->pgmap =
Re: [PATCH v12 04/12] nfit/libnvdimm: add unlock of nvdimm support for Intel DIMMs
On 10/09/2018 12:07 PM, Dan Williams wrote: > On Mon, Oct 8, 2018 at 5:01 PM Dave Jiang wrote: >> >> Add support to allow query the security status of the Intel nvdimms and >> also unlock the dimm via the kernel key management APIs. The passphrase is >> expected to be pulled from userspace through keyutils. Moving the Intel >> related bits to its own source file as well. >> >> Signed-off-by: Dave Jiang >> Reviewed-by: Dan Williams >> Signed-off-by: Dan Williams >> --- >> drivers/acpi/nfit/Makefile |1 >> drivers/acpi/nfit/core.c|3 - >> drivers/acpi/nfit/intel.c | 152 >> +++ >> drivers/acpi/nfit/intel.h | 15 +++ >> drivers/nvdimm/Kconfig |5 + >> drivers/nvdimm/Makefile |1 >> drivers/nvdimm/dimm.c |7 ++ >> drivers/nvdimm/dimm_devs.c | 12 +++ >> drivers/nvdimm/dimm_devs_security.c | 129 ++ >> drivers/nvdimm/nd-core.h|5 + >> drivers/nvdimm/nd.h | 21 + >> include/linux/libnvdimm.h | 27 ++ >> include/uapi/linux/ndctl.h |6 + >> tools/testing/nvdimm/Kbuild |2 >> 14 files changed, 383 insertions(+), 3 deletions(-) >> create mode 100644 drivers/acpi/nfit/intel.c >> create mode 100644 drivers/nvdimm/dimm_devs_security.c >> >> diff --git a/drivers/acpi/nfit/Makefile b/drivers/acpi/nfit/Makefile >> index a407e769f103..443c7ef4e6a6 100644 >> --- a/drivers/acpi/nfit/Makefile >> +++ b/drivers/acpi/nfit/Makefile >> @@ -1,3 +1,4 @@ >> obj-$(CONFIG_ACPI_NFIT) := nfit.o >> nfit-y := core.o >> +nfit-$(CONFIG_X86) += intel.o >> nfit-$(CONFIG_X86_MCE) += mce.o >> diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c >> index 94755a4b6327..e5f7c664a7c8 100644 >> --- a/drivers/acpi/nfit/core.c >> +++ b/drivers/acpi/nfit/core.c >> @@ -1904,7 +1904,8 @@ static int acpi_nfit_register_dimms(struct >> acpi_nfit_desc *acpi_desc) >> nvdimm = nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem, >> acpi_nfit_dimm_attribute_groups, >> flags, cmd_mask, flush ? 
flush->hint_count : >> 0, >> - nfit_mem->flush_wpq, _mem->id[0]); >> + nfit_mem->flush_wpq, _mem->id[0], >> + >> acpi_nfit_get_security_ops(nfit_mem->family)); >> if (!nvdimm) >> return -ENOMEM; >> >> diff --git a/drivers/acpi/nfit/intel.c b/drivers/acpi/nfit/intel.c >> new file mode 100644 >> index ..4bfc1c1da339 >> --- /dev/null >> +++ b/drivers/acpi/nfit/intel.c >> @@ -0,0 +1,152 @@ >> +// SPDX-License-Identifier: GPL-2.0 >> +/* Copyright(c) 2018 Intel Corporation. All rights reserved. */ >> +/* >> + * Intel specific NFIT ops >> + */ >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include "intel.h" >> +#include "nfit.h" >> + >> +static int intel_dimm_security_unlock(struct nvdimm_bus *nvdimm_bus, >> + struct nvdimm *nvdimm, const struct nvdimm_key_data *nkey) >> +{ >> + struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); >> + int cmd_rc, rc = 0; >> + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); >> + struct { >> + struct nd_cmd_pkg pkg; >> + struct nd_intel_unlock_unit cmd; >> + } nd_cmd = { >> + .pkg = { >> + .nd_command = NVDIMM_INTEL_UNLOCK_UNIT, >> + .nd_family = NVDIMM_FAMILY_INTEL, >> + .nd_size_in = ND_INTEL_PASSPHRASE_SIZE, >> + .nd_size_out = ND_INTEL_STATUS_SIZE, >> + .nd_fw_size = ND_INTEL_STATUS_SIZE, >> + }, >> + .cmd = { >> + .status = 0, >> + }, >> + }; >> + >> + if (!test_bit(NVDIMM_INTEL_UNLOCK_UNIT, _mem->dsm_mask)) >> + return -ENOTTY; >> + >> + memcpy(nd_cmd.cmd.passphrase, nkey->data, >> + sizeof(nd_cmd.cmd.passphrase)); >> + rc = nd_desc->ndctl(nd_desc, nvdimm, ND_CMD_CALL, _cmd, >> + sizeof(nd_cmd), _rc); >> + if (rc < 0) >> + goto out; >> + if (cmd_rc < 0) { >> + rc = cmd_rc; >> + goto out; >> + } >> + >> + switch (nd_cmd.cmd.status) { >> + case 0: >> + break; >> + case ND_INTEL_STATUS_INVALID_PASS: >> + rc = -EINVAL; >> + goto out; >> + case ND_INTEL_STATUS_INVALID_STATE: >> + default: >> + rc = -ENXIO; 
>> + goto out; >> + } >> + >> + /* >> +* TODO: define a cross arch wbinvd when/if
Re: Problems with VM_MIXEDMAP removal from /proc//smaps
Jan Kara writes: > Hello, > > commit e1fb4a086495 "dax: remove VM_MIXEDMAP for fsdax and device dax" has > removed VM_MIXEDMAP flag from DAX VMAs. Now our testing shows that in the > mean time certain customer of ours started poking into /proc/<pid>/smaps > and looks at VMA flags there and if VM_MIXEDMAP is missing among the VMA > flags, the application just fails to start complaining that DAX support is > missing in the kernel. The question now is how do we go about this? > > Strictly speaking, this is a userspace visible regression (as much as I > think that application poking into VMA flags at this level is just too > bold). Is there any precedent in handling similar issues with smaps which > really exposes a lot of information that is dependent on kernel > implementation details? > > I have attached a patch that is an obvious "fix" for the issue - just fake > VM_MIXEDMAP flag in smaps. But I'm open to other suggestions... Hi, Jan, I'm intrigued by the use case. Do I understand you correctly that the database in question does not intend to make data persistent from userspace? In other words, fsync/msync system calls are being issued by the database? I guess what I'm really after is a statement of requirements or expectations. It would be great if you could convince the database developer to engage in this discussion directly. Cheers, Jeff ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: [PATCH v12 04/12] nfit/libnvdimm: add unlock of nvdimm support for Intel DIMMs
On Mon, Oct 8, 2018 at 5:01 PM Dave Jiang wrote: > > Add support to allow query the security status of the Intel nvdimms and > also unlock the dimm via the kernel key management APIs. The passphrase is > expected to be pulled from userspace through keyutils. Moving the Intel > related bits to its own source file as well. > > Signed-off-by: Dave Jiang > Reviewed-by: Dan Williams > Signed-off-by: Dan Williams > --- > drivers/acpi/nfit/Makefile |1 > drivers/acpi/nfit/core.c|3 - > drivers/acpi/nfit/intel.c | 152 > +++ > drivers/acpi/nfit/intel.h | 15 +++ > drivers/nvdimm/Kconfig |5 + > drivers/nvdimm/Makefile |1 > drivers/nvdimm/dimm.c |7 ++ > drivers/nvdimm/dimm_devs.c | 12 +++ > drivers/nvdimm/dimm_devs_security.c | 129 ++ > drivers/nvdimm/nd-core.h|5 + > drivers/nvdimm/nd.h | 21 + > include/linux/libnvdimm.h | 27 ++ > include/uapi/linux/ndctl.h |6 + > tools/testing/nvdimm/Kbuild |2 > 14 files changed, 383 insertions(+), 3 deletions(-) > create mode 100644 drivers/acpi/nfit/intel.c > create mode 100644 drivers/nvdimm/dimm_devs_security.c > > diff --git a/drivers/acpi/nfit/Makefile b/drivers/acpi/nfit/Makefile > index a407e769f103..443c7ef4e6a6 100644 > --- a/drivers/acpi/nfit/Makefile > +++ b/drivers/acpi/nfit/Makefile > @@ -1,3 +1,4 @@ > obj-$(CONFIG_ACPI_NFIT) := nfit.o > nfit-y := core.o > +nfit-$(CONFIG_X86) += intel.o > nfit-$(CONFIG_X86_MCE) += mce.o > diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c > index 94755a4b6327..e5f7c664a7c8 100644 > --- a/drivers/acpi/nfit/core.c > +++ b/drivers/acpi/nfit/core.c > @@ -1904,7 +1904,8 @@ static int acpi_nfit_register_dimms(struct > acpi_nfit_desc *acpi_desc) > nvdimm = nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem, > acpi_nfit_dimm_attribute_groups, > flags, cmd_mask, flush ? 
flush->hint_count : > 0, > - nfit_mem->flush_wpq, _mem->id[0]); > + nfit_mem->flush_wpq, _mem->id[0], > + acpi_nfit_get_security_ops(nfit_mem->family)); > if (!nvdimm) > return -ENOMEM; > > diff --git a/drivers/acpi/nfit/intel.c b/drivers/acpi/nfit/intel.c > new file mode 100644 > index ..4bfc1c1da339 > --- /dev/null > +++ b/drivers/acpi/nfit/intel.c > @@ -0,0 +1,152 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* Copyright(c) 2018 Intel Corporation. All rights reserved. */ > +/* > + * Intel specific NFIT ops > + */ > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include "intel.h" > +#include "nfit.h" > + > +static int intel_dimm_security_unlock(struct nvdimm_bus *nvdimm_bus, > + struct nvdimm *nvdimm, const struct nvdimm_key_data *nkey) > +{ > + struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); > + int cmd_rc, rc = 0; > + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); > + struct { > + struct nd_cmd_pkg pkg; > + struct nd_intel_unlock_unit cmd; > + } nd_cmd = { > + .pkg = { > + .nd_command = NVDIMM_INTEL_UNLOCK_UNIT, > + .nd_family = NVDIMM_FAMILY_INTEL, > + .nd_size_in = ND_INTEL_PASSPHRASE_SIZE, > + .nd_size_out = ND_INTEL_STATUS_SIZE, > + .nd_fw_size = ND_INTEL_STATUS_SIZE, > + }, > + .cmd = { > + .status = 0, > + }, > + }; > + > + if (!test_bit(NVDIMM_INTEL_UNLOCK_UNIT, _mem->dsm_mask)) > + return -ENOTTY; > + > + memcpy(nd_cmd.cmd.passphrase, nkey->data, > + sizeof(nd_cmd.cmd.passphrase)); > + rc = nd_desc->ndctl(nd_desc, nvdimm, ND_CMD_CALL, _cmd, > + sizeof(nd_cmd), _rc); > + if (rc < 0) > + goto out; > + if (cmd_rc < 0) { > + rc = cmd_rc; > + goto out; > + } > + > + switch (nd_cmd.cmd.status) { > + case 0: > + break; > + case ND_INTEL_STATUS_INVALID_PASS: > + rc = -EINVAL; > + goto out; > + case ND_INTEL_STATUS_INVALID_STATE: > + default: > + rc = -ENXIO; > + goto out; > + } > + > + /* > +* TODO: define a cross arch wbinvd when/if 
NVDIMM_FAMILY_INTEL > +* support arrives on another arch. > +*/ > + /* DIMM unlocked, invalidate all CPU caches before we read it */ > + wbinvd_on_all_cpus(); > + > +
Re: [PATCH] mm: Preserve _PAGE_DEVMAP across mprotect() calls
Looks good, Reviewed-by: Johannes Thumshirn -- Johannes Thumshirn Storage jthumsh...@suse.de+49 911 74053 689 SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg GF: Felix Imendörffer, Jane Smithard, Graham Norton HRB 21284 (AG Nürnberg) Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: [PATCH v5 4/4] mm: Defer ZONE_DEVICE page initialization to the point where we init pgmap
On Tue, Oct 9, 2018 at 3:21 AM Yi Zhang wrote: > > On 2018-09-25 at 13:21:24 -0700, Alexander Duyck wrote: > > The ZONE_DEVICE pages were being initialized in two locations. One was with > > the memory_hotplug lock held and another was outside of that lock. The > > problem with this is that it was nearly doubling the memory initialization > > time. Instead of doing this twice, once while holding a global lock and > > once without, I am opting to defer the initialization to the one outside of > > the lock. This allows us to avoid serializing the overhead for memory init > > and we can instead focus on per-node init times. > > > > One issue I encountered is that devm_memremap_pages and > > hmm_devmmem_pages_create were initializing only the pgmap field the same > > way. One wasn't initializing hmm_data, and the other was initializing it to > > a poison value. Since this is something that is exposed to the driver in > > the case of hmm I am opting for a third option and just initializing > > hmm_data to 0 since this is going to be exposed to unknown third party > > drivers. > > > > Reviewed-by: Pavel Tatashin > > Signed-off-by: Alexander Duyck > > --- > > > > v4: Moved moved memmap_init_zone_device to below memmmap_init_zone to avoid > > merge conflicts with other changes in the kernel. 
> > v5: No change > > > > include/linux/mm.h |2 + > > kernel/memremap.c | 24 +- > > mm/hmm.c | 12 --- > > mm/page_alloc.c| 92 > > ++-- > > 4 files changed, 107 insertions(+), 23 deletions(-) > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > index 06d7d7576f8d..7312fb78ef31 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -848,6 +848,8 @@ static inline bool is_zone_device_page(const struct > > page *page) > > { > > return page_zonenum(page) == ZONE_DEVICE; > > } > > +extern void memmap_init_zone_device(struct zone *, unsigned long, > > + unsigned long, struct dev_pagemap *); > > #else > > static inline bool is_zone_device_page(const struct page *page) > > { > > diff --git a/kernel/memremap.c b/kernel/memremap.c > > index 5b8600d39931..d0c32e473f82 100644 > > --- a/kernel/memremap.c > > +++ b/kernel/memremap.c > > @@ -175,10 +175,10 @@ void *devm_memremap_pages(struct device *dev, struct > > dev_pagemap *pgmap) > > struct vmem_altmap *altmap = pgmap->altmap_valid ? > > >altmap : NULL; > > struct resource *res = >res; > > - unsigned long pfn, pgoff, order; > > + struct dev_pagemap *conflict_pgmap; > > pgprot_t pgprot = PAGE_KERNEL; > > + unsigned long pgoff, order; > > int error, nid, is_ram; > > - struct dev_pagemap *conflict_pgmap; > > > > align_start = res->start & ~(SECTION_SIZE - 1); > > align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) > > @@ -256,19 +256,13 @@ void *devm_memremap_pages(struct device *dev, struct > > dev_pagemap *pgmap) > > if (error) > > goto err_add_memory; > > > > - for_each_device_pfn(pfn, pgmap) { > > - struct page *page = pfn_to_page(pfn); > > - > > - /* > > - * ZONE_DEVICE pages union ->lru with a ->pgmap back > > - * pointer. It is a bug if a ZONE_DEVICE page is ever > > - * freed or placed on a driver-private list. Seed the > > - * storage with LIST_POISON* values. 
> > - */ > > - list_del(>lru); > > - page->pgmap = pgmap; > > - percpu_ref_get(pgmap->ref); > > - } > > + /* > > + * Initialization of the pages has been deferred until now in order > > + * to allow us to do the work while not holding the hotplug lock. > > + */ > > + memmap_init_zone_device(_DATA(nid)->node_zones[ZONE_DEVICE], > > + align_start >> PAGE_SHIFT, > > + align_size >> PAGE_SHIFT, pgmap); > > > > devm_add_action(dev, devm_memremap_pages_release, pgmap); > > > > diff --git a/mm/hmm.c b/mm/hmm.c > > index c968e49f7a0c..774d684fa2b4 100644 > > --- a/mm/hmm.c > > +++ b/mm/hmm.c > > @@ -1024,7 +1024,6 @@ static int hmm_devmem_pages_create(struct hmm_devmem > > *devmem) > > resource_size_t key, align_start, align_size, align_end; > > struct device *device = devmem->device; > > int ret, nid, is_ram; > > - unsigned long pfn; > > > > align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); > > align_size = ALIGN(devmem->resource->start + > > @@ -1109,11 +1108,14 @@ static int hmm_devmem_pages_create(struct > > hmm_devmem *devmem) > > align_size >> PAGE_SHIFT, NULL); > > mem_hotplug_done(); > > > > - for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { > > - struct page *page = pfn_to_page(pfn); > > + /* > > + *
Re: [PATCH] mm: Preserve _PAGE_DEVMAP across mprotect() calls
On Tue, Oct 9, 2018 at 3:19 AM Jan Kara wrote: > > Currently _PAGE_DEVMAP bit is not preserved in mprotect(2) calls. As a > result we will see warnings such as: > > BUG: Bad page map in process JobWrk0013 pte:81803875ea25 pmd:7624381067 > addr:7f093072 vm_flags:28f9 anon_vma: (null) > mapping:97f2384056f0 index:0 > file:457-00fe0030-0009-00ca-0001_2001.fileblock > fault:xfs_filemap_fault [xfs] mmap:xfs_file_mmap [xfs] readpage: > (null) > CPU: 3 PID: 15848 Comm: JobWrk0013 Tainted: GW > 4.12.14-2.g7573215-default #1 SLE12-SP4 (unreleased) > Hardware name: Intel Corporation S2600WFD/S2600WFD, BIOS > SE5C620.86B.01.00.0833.051120182255 05/11/2018 > Call Trace: > dump_stack+0x5a/0x75 > print_bad_pte+0x217/0x2c0 > ? enqueue_task_fair+0x76/0x9f0 > _vm_normal_page+0xe5/0x100 > zap_pte_range+0x148/0x740 > unmap_page_range+0x39a/0x4b0 > unmap_vmas+0x42/0x90 > unmap_region+0x99/0xf0 > ? vma_gap_callbacks_rotate+0x1a/0x20 > do_munmap+0x255/0x3a0 > vm_munmap+0x54/0x80 > SyS_munmap+0x1d/0x30 > do_syscall_64+0x74/0x150 > entry_SYSCALL_64_after_hwframe+0x3d/0xa2 > ... > > when mprotect(2) gets used on DAX mappings. Also there is a wide variety > of other failures that can result from the missing _PAGE_DEVMAP flag > when the area gets used by get_user_pages() later. > > Fix the problem by including _PAGE_DEVMAP in a set of flags that get > preserved by mprotect(2). > > Fixes: 69660fd797c3 ("x86, mm: introduce _PAGE_DEVMAP") > Fixes: ebd31197931d ("powerpc/mm: Add devmap support for ppc64") > CC: sta...@vger.kernel.org > Signed-off-by: Jan Kara Looks good, do you want me to take this upstream along with the livelock fix? ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: [PATCH 10/11] UAPI: ndctl: Remove use of PAGE_SIZE [ver #2]
Dan Williams wrote: > Let me know if you want me to pick this up for 4.20. If you could pick this and also 09? Thanks, David ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: [PATCH] mm: Preserve _PAGE_DEVMAP across mprotect() calls
On Tue 09-10-18 12:19:17, Jan Kara wrote: > Currently _PAGE_DEVMAP bit is not preserved in mprotect(2) calls. As a > result we will see warnings such as: > > BUG: Bad page map in process JobWrk0013 pte:81803875ea25 pmd:7624381067 > addr:7f093072 vm_flags:28f9 anon_vma: (null) > mapping:97f2384056f0 index:0 > file:457-00fe0030-0009-00ca-0001_2001.fileblock > fault:xfs_filemap_fault [xfs] mmap:xfs_file_mmap [xfs] readpage: > (null) > CPU: 3 PID: 15848 Comm: JobWrk0013 Tainted: GW > 4.12.14-2.g7573215-default #1 SLE12-SP4 (unreleased) > Hardware name: Intel Corporation S2600WFD/S2600WFD, BIOS > SE5C620.86B.01.00.0833.051120182255 05/11/2018 > Call Trace: > dump_stack+0x5a/0x75 > print_bad_pte+0x217/0x2c0 > ? enqueue_task_fair+0x76/0x9f0 > _vm_normal_page+0xe5/0x100 > zap_pte_range+0x148/0x740 > unmap_page_range+0x39a/0x4b0 > unmap_vmas+0x42/0x90 > unmap_region+0x99/0xf0 > ? vma_gap_callbacks_rotate+0x1a/0x20 > do_munmap+0x255/0x3a0 > vm_munmap+0x54/0x80 > SyS_munmap+0x1d/0x30 > do_syscall_64+0x74/0x150 > entry_SYSCALL_64_after_hwframe+0x3d/0xa2 > ... > > when mprotect(2) gets used on DAX mappings. Also there is a wide variety > of other failures that can result from the missing _PAGE_DEVMAP flag > when the area gets used by get_user_pages() later. > > Fix the problem by including _PAGE_DEVMAP in a set of flags that get > preserved by mprotect(2). 
> > Fixes: 69660fd797c3 ("x86, mm: introduce _PAGE_DEVMAP") > Fixes: ebd31197931d ("powerpc/mm: Add devmap support for ppc64") > CC: sta...@vger.kernel.org > Signed-off-by: Jan Kara Acked-by: Michal Hocko > --- > arch/powerpc/include/asm/book3s/64/pgtable.h | 4 ++-- > arch/x86/include/asm/pgtable_types.h | 2 +- > 2 files changed, 3 insertions(+), 3 deletions(-) > > diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h > b/arch/powerpc/include/asm/book3s/64/pgtable.h > index 2fdc865ca374..2a2486526d1f 100644 > --- a/arch/powerpc/include/asm/book3s/64/pgtable.h > +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h > @@ -114,7 +114,7 @@ > */ > #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ >_PAGE_ACCESSED | H_PAGE_THP_HUGE | _PAGE_PTE | \ > - _PAGE_SOFT_DIRTY) > + _PAGE_SOFT_DIRTY | _PAGE_DEVMAP) > /* > * user access blocked by key > */ > @@ -132,7 +132,7 @@ > */ > #define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | > \ >_PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \ > - _PAGE_SOFT_DIRTY) > + _PAGE_SOFT_DIRTY | _PAGE_DEVMAP) > > #define H_PTE_PKEY (H_PTE_PKEY_BIT0 | H_PTE_PKEY_BIT1 | H_PTE_PKEY_BIT2 | \ >H_PTE_PKEY_BIT3 | H_PTE_PKEY_BIT4) > diff --git a/arch/x86/include/asm/pgtable_types.h > b/arch/x86/include/asm/pgtable_types.h > index b64acb08a62b..106b7d0e2dae 100644 > --- a/arch/x86/include/asm/pgtable_types.h > +++ b/arch/x86/include/asm/pgtable_types.h > @@ -124,7 +124,7 @@ > */ > #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | > \ >_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ > - _PAGE_SOFT_DIRTY) > + _PAGE_SOFT_DIRTY | _PAGE_DEVMAP) > #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) > > /* > -- > 2.16.4 -- Michal Hocko SUSE Labs ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: [PATCH v5 4/4] mm: Defer ZONE_DEVICE page initialization to the point where we init pgmap
On 2018-09-25 at 13:21:24 -0700, Alexander Duyck wrote: > The ZONE_DEVICE pages were being initialized in two locations. One was with > the memory_hotplug lock held and another was outside of that lock. The > problem with this is that it was nearly doubling the memory initialization > time. Instead of doing this twice, once while holding a global lock and > once without, I am opting to defer the initialization to the one outside of > the lock. This allows us to avoid serializing the overhead for memory init > and we can instead focus on per-node init times. > > One issue I encountered is that devm_memremap_pages and > hmm_devmmem_pages_create were initializing only the pgmap field the same > way. One wasn't initializing hmm_data, and the other was initializing it to > a poison value. Since this is something that is exposed to the driver in > the case of hmm I am opting for a third option and just initializing > hmm_data to 0 since this is going to be exposed to unknown third party > drivers. > > Reviewed-by: Pavel Tatashin > Signed-off-by: Alexander Duyck > --- > > v4: Moved moved memmap_init_zone_device to below memmmap_init_zone to avoid > merge conflicts with other changes in the kernel. 
> v5: No change > > include/linux/mm.h |2 + > kernel/memremap.c | 24 +- > mm/hmm.c | 12 --- > mm/page_alloc.c| 92 > ++-- > 4 files changed, 107 insertions(+), 23 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 06d7d7576f8d..7312fb78ef31 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -848,6 +848,8 @@ static inline bool is_zone_device_page(const struct page > *page) > { > return page_zonenum(page) == ZONE_DEVICE; > } > +extern void memmap_init_zone_device(struct zone *, unsigned long, > + unsigned long, struct dev_pagemap *); > #else > static inline bool is_zone_device_page(const struct page *page) > { > diff --git a/kernel/memremap.c b/kernel/memremap.c > index 5b8600d39931..d0c32e473f82 100644 > --- a/kernel/memremap.c > +++ b/kernel/memremap.c > @@ -175,10 +175,10 @@ void *devm_memremap_pages(struct device *dev, struct > dev_pagemap *pgmap) > struct vmem_altmap *altmap = pgmap->altmap_valid ? > >altmap : NULL; > struct resource *res = >res; > - unsigned long pfn, pgoff, order; > + struct dev_pagemap *conflict_pgmap; > pgprot_t pgprot = PAGE_KERNEL; > + unsigned long pgoff, order; > int error, nid, is_ram; > - struct dev_pagemap *conflict_pgmap; > > align_start = res->start & ~(SECTION_SIZE - 1); > align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) > @@ -256,19 +256,13 @@ void *devm_memremap_pages(struct device *dev, struct > dev_pagemap *pgmap) > if (error) > goto err_add_memory; > > - for_each_device_pfn(pfn, pgmap) { > - struct page *page = pfn_to_page(pfn); > - > - /* > - * ZONE_DEVICE pages union ->lru with a ->pgmap back > - * pointer. It is a bug if a ZONE_DEVICE page is ever > - * freed or placed on a driver-private list. Seed the > - * storage with LIST_POISON* values. 
> - */ > - list_del(&page->lru); > - page->pgmap = pgmap; > - percpu_ref_get(pgmap->ref); > - } > + /* > + * Initialization of the pages has been deferred until now in order > + * to allow us to do the work while not holding the hotplug lock. > + */ > + memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], > + align_start >> PAGE_SHIFT, > + align_size >> PAGE_SHIFT, pgmap); > > devm_add_action(dev, devm_memremap_pages_release, pgmap); > > diff --git a/mm/hmm.c b/mm/hmm.c > index c968e49f7a0c..774d684fa2b4 100644 > --- a/mm/hmm.c > +++ b/mm/hmm.c > @@ -1024,7 +1024,6 @@ static int hmm_devmem_pages_create(struct hmm_devmem > *devmem) > resource_size_t key, align_start, align_size, align_end; > struct device *device = devmem->device; > int ret, nid, is_ram; > - unsigned long pfn; > > align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); > align_size = ALIGN(devmem->resource->start + > @@ -1109,11 +1108,14 @@ static int hmm_devmem_pages_create(struct hmm_devmem > *devmem) > align_size >> PAGE_SHIFT, NULL); > mem_hotplug_done(); > > - for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { > - struct page *page = pfn_to_page(pfn); > + /* > + * Initialization of the pages has been deferred until now in order > + * to allow us to do the work while not holding the hotplug lock. > + */ > + memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], > + align_start >>
[PATCH] mm: Preserve _PAGE_DEVMAP across mprotect() calls
Currently _PAGE_DEVMAP bit is not preserved in mprotect(2) calls. As a result we will see warnings such as: BUG: Bad page map in process JobWrk0013 pte:81803875ea25 pmd:7624381067 addr:7f093072 vm_flags:28f9 anon_vma: (null) mapping:97f2384056f0 index:0 file:457-00fe0030-0009-00ca-0001_2001.fileblock fault:xfs_filemap_fault [xfs] mmap:xfs_file_mmap [xfs] readpage: (null) CPU: 3 PID: 15848 Comm: JobWrk0013 Tainted: GW 4.12.14-2.g7573215-default #1 SLE12-SP4 (unreleased) Hardware name: Intel Corporation S2600WFD/S2600WFD, BIOS SE5C620.86B.01.00.0833.051120182255 05/11/2018 Call Trace: dump_stack+0x5a/0x75 print_bad_pte+0x217/0x2c0 ? enqueue_task_fair+0x76/0x9f0 _vm_normal_page+0xe5/0x100 zap_pte_range+0x148/0x740 unmap_page_range+0x39a/0x4b0 unmap_vmas+0x42/0x90 unmap_region+0x99/0xf0 ? vma_gap_callbacks_rotate+0x1a/0x20 do_munmap+0x255/0x3a0 vm_munmap+0x54/0x80 SyS_munmap+0x1d/0x30 do_syscall_64+0x74/0x150 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 ... when mprotect(2) gets used on DAX mappings. Also there is a wide variety of other failures that can result from the missing _PAGE_DEVMAP flag when the area gets used by get_user_pages() later. Fix the problem by including _PAGE_DEVMAP in a set of flags that get preserved by mprotect(2). 
Fixes: 69660fd797c3 ("x86, mm: introduce _PAGE_DEVMAP") Fixes: ebd31197931d ("powerpc/mm: Add devmap support for ppc64") CC: sta...@vger.kernel.org Signed-off-by: Jan Kara --- arch/powerpc/include/asm/book3s/64/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable_types.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 2fdc865ca374..2a2486526d1f 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -114,7 +114,7 @@ */ #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ _PAGE_ACCESSED | H_PAGE_THP_HUGE | _PAGE_PTE | \ -_PAGE_SOFT_DIRTY) +_PAGE_SOFT_DIRTY | _PAGE_DEVMAP) /* * user access blocked by key */ @@ -132,7 +132,7 @@ */ #define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \ -_PAGE_SOFT_DIRTY) +_PAGE_SOFT_DIRTY | _PAGE_DEVMAP) #define H_PTE_PKEY (H_PTE_PKEY_BIT0 | H_PTE_PKEY_BIT1 | H_PTE_PKEY_BIT2 | \ H_PTE_PKEY_BIT3 | H_PTE_PKEY_BIT4) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index b64acb08a62b..106b7d0e2dae 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -124,7 +124,7 @@ */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ -_PAGE_SOFT_DIRTY) +_PAGE_SOFT_DIRTY | _PAGE_DEVMAP) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) /* -- 2.16.4 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
回复:两天让您销*售不再难
83941420707 详 细 课 纲 请 阅 读 附 件 15:06:09 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm