Re: [Qemu-devel] [RFC 2/2] KVM: add virtio-pmem driver
On Wed, Oct 18, 2017 at 08:51:37AM -0700, Dan Williams wrote: > This use case is not "Persistent Memory". Persistent Memory is > something you can map and make persistent with CPU instructions. > Anything that requires a driver call is device driver managed "Shared > Memory". How is this any different than the existing nvdimm_flush()? If you really care about avoiding a driver call, it could easily be a write to a doorbell page or a hypercall, but in the end that's just semantics. ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: [Qemu-devel] [RFC 2/2] KVM: add virtio-pmem driver
On Wed, Oct 18, 2017 at 08:51:37AM -0700, Dan Williams wrote: > On Wed, Oct 18, 2017 at 6:03 AM, Stefan Hajnoczi wrote: > > On Tue, Oct 17, 2017 at 04:30:41AM -0400, Pankaj Gupta wrote: > >> > >> > > Are you saying do it as existing i.e ACPI pmem like interface? > >> > > The reason we have created this new driver is existing pmem driver > >> > > does not define proper semantics for guest flushing requests. > >> > > >> > At this point I'm caring about the Linux-internal interface, and > >> > for that it should be integrated into the nvdimm subsystem and not > >> > a block driver. How the host <-> guest interface looks is a different > >> > idea. > >> > > >> > > > >> > > Regarding block support of driver, we want to achieve DAX support > >> > > to bypass guest page cache. Also, we want to utilize existing DAX > >> > > capable file-system interfaces(e.g fsync) from userspace file API's > >> > > to trigger the host side flush request. > >> > > >> > Well, if you want to support XFS+DAX better don't make it a block > >> > device, because I'll post patches soon to stop using the block device > >> > entirely for the DAX case. > >> > >> o.k I will look at your patches once they are in mailing list. > >> Thanks for the heads up. > >> > >> If I am guessing it right, we don't need block device additional features > >> for pmem? We can bypass block device features like blk device cache flush > >> etc. > >> Also, still we would be supporting ext4 & XFS filesystem with pmem? > >> > >> If there is time to your patches can you please elaborate on this a bit. > > > > I think the idea is that the nvdimm subsystem already adds block device > > semantics on top of the struct nvdimms that it manages. See > > drivers/nvdimm/blk.c. > > > > So it would be cleaner to make virtio-pmem an nvdimm bus. This will > > eliminate the duplication between your driver and drivers/nvdimm/ code. > > Try "git grep nvdimm_bus_register" to find drivers that use the nvdimm > > subsystem. 
> > This use case is not "Persistent Memory". Persistent Memory is > something you can map and make persistent with CPU instructions. > Anything that requires a driver call is device driver managed "Shared > Memory". Dan, in that case do you have ideas regarding Christoph Hellwig's comment that this driver should be integrated into the nvdimm subsystem instead of a new block driver? Stefan ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH 08/17] dax: Inline dax_pmd_insert_mapping() into the callsite
dax_pmd_insert_mapping() has only one callsite and we will need to further fine tune what it does for synchronous faults. Just inline it into the callsite so that we don't have to pass awkward bools around. Reviewed-by: Christoph Hellwig Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara --- fs/dax.c | 47 +-- include/trace/events/fs_dax.h | 1 - 2 files changed, 19 insertions(+), 29 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 5b20c6456926..675fab8ec41f 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1235,33 +1235,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, } #ifdef CONFIG_FS_DAX_PMD -static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, - loff_t pos, void *entry) -{ - struct address_space *mapping = vmf->vma->vm_file->f_mapping; - const sector_t sector = dax_iomap_sector(iomap, pos); - struct inode *inode = mapping->host; - void *ret = NULL; - pfn_t pfn = {}; - int rc; - - rc = dax_iomap_pfn(iomap, pos, PMD_SIZE, &pfn); - if (rc < 0) - goto fallback; - - ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, - RADIX_DAX_PMD); - if (IS_ERR(ret)) - goto fallback; - - trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, ret); - return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, - pfn, vmf->flags & FAULT_FLAG_WRITE); - -fallback: - trace_dax_pmd_insert_mapping_fallback(inode, vmf, PMD_SIZE, pfn, ret); - return VM_FAULT_FALLBACK; -} +/* + * The 'colour' (ie low bits) within a PMD of a page offset. This comes up + * more often than one might expect in the below functions. + */ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, void *entry) @@ -1317,6 +1295,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, void *entry; loff_t pos; int error; + pfn_t pfn; /* * Check whether offset isn't beyond end of file now. 
Caller is @@ -1394,7 +1373,19 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry); + error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn); + if (error < 0) + goto finish_iomap; + + entry = dax_insert_mapping_entry(mapping, vmf, entry, + dax_iomap_sector(&iomap, pos), + RADIX_DAX_PMD); + if (IS_ERR(entry)) + goto finish_iomap; + + trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry); + result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, + write); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index fbc4a06f7310..88a9d19b8ff8 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -148,7 +148,6 @@ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \ TP_ARGS(inode, vmf, length, pfn, radix_entry)) DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping); -DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback); DECLARE_EVENT_CLASS(dax_pte_fault_class, TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), -- 2.12.3 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH 12/17] mm: Define MAP_SYNC and VM_SYNC flags
Define new MAP_SYNC flag and corresponding VMA VM_SYNC flag. As the MAP_SYNC flag is not part of LEGACY_MAP_MASK, currently it will be refused by all MAP_SHARED_VALIDATE map attempts and silently ignored for everything else. Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/proc/task_mmu.c | 1 + include/linux/mm.h | 1 + include/linux/mman.h| 8 ++-- include/uapi/asm-generic/mman.h | 1 + 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5589b4bd4b85..ea78b37deeaa 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -664,6 +664,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", + [ilog2(VM_SYNC)]= "sf", [ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)]= "dd", diff --git a/include/linux/mm.h b/include/linux/mm.h index ca72b67153d5..5411cb7442de 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -189,6 +189,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_ACCOUNT 0x0010 /* Is a VM accounted object */ #define VM_NORESERVE 0x0020 /* should the VM suppress accounting */ #define VM_HUGETLB 0x0040 /* Huge TLB Page VM */ +#define VM_SYNC0x0080 /* Synchronous page faults */ #define VM_ARCH_1 0x0100 /* Architecture-specific flag */ #define VM_WIPEONFORK 0x0200 /* Wipe VMA contents in child. */ #define VM_DONTDUMP0x0400 /* Do not include in the core dump */ diff --git a/include/linux/mman.h b/include/linux/mman.h index 94b63b4d71ff..8f7cc87828e6 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -9,7 +9,7 @@ /* * Arrange for legacy / undefined architecture specific flags to be - * ignored by default in LEGACY_MAP_MASK. + * ignored by mmap handling code. 
*/ #ifndef MAP_32BIT #define MAP_32BIT 0 @@ -23,6 +23,9 @@ #ifndef MAP_UNINITIALIZED #define MAP_UNINITIALIZED 0 #endif +#ifndef MAP_SYNC +#define MAP_SYNC 0 +#endif /* * The historical set of flags that all mmap implementations implicitly @@ -125,7 +128,8 @@ calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | - _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED); + _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED) | + _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ); } unsigned long vm_commit_limit(void); diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h index 7162cd4cca73..00e55627d2df 100644 --- a/include/uapi/asm-generic/mman.h +++ b/include/uapi/asm-generic/mman.h @@ -12,6 +12,7 @@ #define MAP_NONBLOCK 0x1 /* do not block on IO */ #define MAP_STACK 0x2 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB0x4 /* create a huge page mapping */ +#define MAP_SYNC 0x8 /* perform synchronous page faults for the mapping */ /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */ -- 2.12.3 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH 14/17] dax: Implement dax_finish_sync_fault()
Implement a function that filesystems can call to finish handling of synchronous page faults. It takes care of syncing the appropriate file range and insertion of the page table entry. Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/dax.c | 83 +++ include/linux/dax.h | 2 ++ include/trace/events/fs_dax.h | 2 ++ 3 files changed, 87 insertions(+) diff --git a/fs/dax.c b/fs/dax.c index bb9ff907738c..78233c716757 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1492,3 +1492,86 @@ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, } } EXPORT_SYMBOL_GPL(dax_iomap_fault); + +/** + * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables + * @vmf: The description of the fault + * @pe_size: Size of entry to be inserted + * @pfn: PFN to insert + * + * This function inserts writeable PTE or PMD entry into page tables for mmaped + * DAX file. It takes care of marking corresponding radix tree entry as dirty + * as well. + */ +static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, + enum page_entry_size pe_size, + pfn_t pfn) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + void *entry, **slot; + pgoff_t index = vmf->pgoff; + int vmf_ret, error; + + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, &slot); + /* Did we race with someone splitting entry or so? 
*/ + if (!entry || + (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || + (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { + put_unlocked_mapping_entry(mapping, index, entry); + spin_unlock_irq(&mapping->tree_lock); + trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, + VM_FAULT_NOPAGE); + return VM_FAULT_NOPAGE; + } + radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + entry = lock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + switch (pe_size) { + case PE_SIZE_PTE: + error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); + vmf_ret = dax_fault_return(error); + break; +#ifdef CONFIG_FS_DAX_PMD + case PE_SIZE_PMD: + vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, + pfn, true); + break; +#endif + default: + vmf_ret = VM_FAULT_FALLBACK; + } + put_locked_mapping_entry(mapping, index); + trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret); + return vmf_ret; +} + +/** + * dax_finish_sync_fault - finish synchronous page fault + * @vmf: The description of the fault + * @pe_size: Size of entry to be inserted + * @pfn: PFN to insert + * + * This function ensures that the file range touched by the page fault is + * stored persistently on the media and handles inserting of appropriate page + * table entry. 
+ */ +int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, + pfn_t pfn) +{ + int err; + loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; + size_t len = 0; + + if (pe_size == PE_SIZE_PTE) + len = PAGE_SIZE; + else if (pe_size == PE_SIZE_PMD) + len = PMD_SIZE; + else + WARN_ON_ONCE(1); + err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); + if (err) + return VM_FAULT_SIGBUS; + return dax_insert_pfn_mkwrite(vmf, pe_size, pfn); +} +EXPORT_SYMBOL_GPL(dax_finish_sync_fault); diff --git a/include/linux/dax.h b/include/linux/dax.h index e7fa4b8f45bc..d403f78b706c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -96,6 +96,8 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t *pfnp, const struct iomap_ops *ops); +int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, + pfn_t pfn); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index 88a9d19b8ff8..7725459fafef 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -190,6 +190,8 @@ DEFINE_EVENT(dax_pte_fault_class, name, \ DEFINE_PTE_FAULT_EVENT(dax_pte_fault); DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); DEFINE_PTE_FAULT_EVENT(dax_load_hole); +DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite_no_entry)
[PATCH 17/17] xfs: support for synchronous DAX faults
From: Christoph Hellwig Return IOMAP_F_DIRTY from xfs_file_iomap_begin() when asked to prepare blocks for writing and the inode is pinned, and has dirty fields other than the timestamps. In __xfs_filemap_fault() we then detect this case and call dax_finish_sync_fault() to make sure all metadata is committed, and to insert the page table entry. Note that this will also dirty corresponding radix tree entry which is what we want - fsync(2) will still provide data integrity guarantees for applications not using userspace flushing. And applications using userspace flushing can avoid calling fsync(2) and thus avoid the performance overhead. [JK: Added VM_SYNC flag handling] Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/xfs/xfs_file.c | 17 - fs/xfs/xfs_iomap.c | 5 + 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7c6b8def6eed..9b8058774af3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -44,6 +44,7 @@ #include #include #include +#include static const struct vm_operations_struct xfs_file_vm_ops; @@ -1040,7 +1041,11 @@ __xfs_filemap_fault( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vmf, pe_size, NULL, &xfs_iomap_ops); + pfn_t pfn; + + ret = dax_iomap_fault(vmf, pe_size, &pfn, &xfs_iomap_ops); + if (ret & VM_FAULT_NEEDDSYNC) + ret = dax_finish_sync_fault(vmf, pe_size, pfn); } else { if (write_fault) ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); @@ -1131,6 +1136,13 @@ xfs_file_mmap( struct file *filp, struct vm_area_struct *vma) { + /* +* We don't support synchronous mappings for non-DAX files. At least +* until someone comes with a sensible use case. 
+*/ + if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC)) + return -EOPNOTSUPP; + file_accessed(filp); vma->vm_ops = &xfs_file_vm_ops; if (IS_DAX(file_inode(filp))) @@ -1138,6 +1150,8 @@ xfs_file_mmap( return 0; } +#define XFS_MAP_SUPPORTED (LEGACY_MAP_MASK | MAP_SYNC) + const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, .read_iter = xfs_file_read_iter, @@ -1149,6 +1163,7 @@ const struct file_operations xfs_file_operations = { .compat_ioctl = xfs_file_compat_ioctl, #endif .mmap = xfs_file_mmap, + .mmap_supported_flags = XFS_MAP_SUPPORTED, .open = xfs_file_open, .release= xfs_file_release, .fsync = xfs_file_fsync, diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f179bdf1644d..b43be199fbdf 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -33,6 +33,7 @@ #include "xfs_error.h" #include "xfs_trans.h" #include "xfs_trans_space.h" +#include "xfs_inode_item.h" #include "xfs_iomap.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -1086,6 +1087,10 @@ xfs_file_iomap_begin( trace_xfs_iomap_found(ip, offset, length, 0, &imap); } + if ((flags & IOMAP_WRITE) && xfs_ipincount(ip) && + (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) + iomap->flags |= IOMAP_F_DIRTY; + xfs_bmbt_to_iomap(ip, iomap, &imap); if (shared) -- 2.12.3 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH 09/17] dax: Fix comment describing dax_iomap_fault()
Add missing argument description. Reviewed-by: Christoph Hellwig Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara --- fs/dax.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/dax.c b/fs/dax.c index 675fab8ec41f..5214ed9ba508 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1435,7 +1435,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, /** * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault - * @ops: iomap ops passed from the file system + * @pe_size: Size of the page to fault in + * @ops: Iomap ops passed from the file system * * When a page fault occurs, filesystems may call this helper in * their fault handler for DAX files. dax_iomap_fault() assumes the caller -- 2.12.3 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH 04/17] dax: Factor out getting of pfn out of iomap
Factor out code to get pfn out of iomap that is shared between PTE and PMD fault path. Reviewed-by: Christoph Hellwig Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara --- fs/dax.c | 83 +--- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 0bc42ac294ca..116eef8d6c69 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -825,30 +825,53 @@ static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); } -static int dax_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, - loff_t pos, void *entry) +static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size, +pfn_t *pfnp) { const sector_t sector = dax_iomap_sector(iomap, pos); - struct vm_area_struct *vma = vmf->vma; - struct address_space *mapping = vma->vm_file->f_mapping; - unsigned long vaddr = vmf->address; - void *ret, *kaddr; pgoff_t pgoff; + void *kaddr; int id, rc; - pfn_t pfn; + long length; - rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff); + rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff); if (rc) return rc; - id = dax_read_lock(); - rc = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), - &kaddr, &pfn); - if (rc < 0) { - dax_read_unlock(id); - return rc; + length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), + &kaddr, pfnp); + if (length < 0) { + rc = length; + goto out; } + rc = -EINVAL; + if (PFN_PHYS(length) < size) + goto out; + if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1)) + goto out; + /* For larger pages we need devmap */ + if (length > 1 && !pfn_t_devmap(*pfnp)) + goto out; + rc = 0; +out: dax_read_unlock(id); + return rc; +} + +static int dax_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, + loff_t pos, void *entry) +{ + const sector_t sector = dax_iomap_sector(iomap, pos); + struct vm_area_struct *vma = vmf->vma; + struct address_space *mapping = vma->vm_file->f_mapping; + unsigned long vaddr = vmf->address; + void *ret; + int rc; + 
pfn_t pfn; + + rc = dax_iomap_pfn(iomap, pos, PAGE_SIZE, &pfn); + if (rc < 0) + return rc; ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); if (IS_ERR(ret)) @@ -1223,46 +1246,26 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, { struct address_space *mapping = vmf->vma->vm_file->f_mapping; const sector_t sector = dax_iomap_sector(iomap, pos); - struct dax_device *dax_dev = iomap->dax_dev; - struct block_device *bdev = iomap->bdev; struct inode *inode = mapping->host; - const size_t size = PMD_SIZE; - void *ret = NULL, *kaddr; - long length = 0; - pgoff_t pgoff; + void *ret = NULL; pfn_t pfn = {}; - int id; + int rc; - if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) + rc = dax_iomap_pfn(iomap, pos, PMD_SIZE, &pfn); + if (rc < 0) goto fallback; - id = dax_read_lock(); - length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); - if (length < 0) - goto unlock_fallback; - length = PFN_PHYS(length); - - if (length < size) - goto unlock_fallback; - if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR) - goto unlock_fallback; - if (!pfn_t_devmap(pfn)) - goto unlock_fallback; - dax_read_unlock(id); - ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, RADIX_DAX_PMD); if (IS_ERR(ret)) goto fallback; - trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); + trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, ret); return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, vmf->flags & FAULT_FLAG_WRITE); -unlock_fallback: - dax_read_unlock(id); fallback: - trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret); + trace_dax_pmd_insert_mapping_fallback(inode, vmf, PMD_SIZE, pfn, ret); return VM_FAULT_FALLBACK; } -- 2.12.3 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH 10/17] dax: Allow dax_iomap_fault() to return pfn
For synchronous page fault dax_iomap_fault() will need to return PFN which will then need to be inserted into page tables after fsync() completes. Add necessary parameter to dax_iomap_fault(). Reviewed-by: Christoph Hellwig Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara --- fs/dax.c| 13 +++-- fs/ext2/file.c | 2 +- fs/ext4/file.c | 2 +- fs/xfs/xfs_file.c | 4 ++-- include/linux/dax.h | 2 +- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 5214ed9ba508..5ddf15161390 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1079,7 +1079,7 @@ static int dax_fault_return(int error) return VM_FAULT_SIGBUS; } -static int dax_iomap_pte_fault(struct vm_fault *vmf, +static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; @@ -1280,7 +1280,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, return VM_FAULT_FALLBACK; } -static int dax_iomap_pmd_fault(struct vm_fault *vmf, +static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; @@ -1425,7 +1425,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, return result; } #else -static int dax_iomap_pmd_fault(struct vm_fault *vmf, +static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; @@ -1436,6 +1436,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault * @pe_size: Size of the page to fault in + * @pfnp: PFN to insert for synchronous faults if fsync is required * @ops: Iomap ops passed from the file system * * When a page fault occurs, filesystems may call this helper in @@ -1444,13 +1445,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, * successfully. 
*/ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - const struct iomap_ops *ops) + pfn_t *pfnp, const struct iomap_ops *ops) { switch (pe_size) { case PE_SIZE_PTE: - return dax_iomap_pte_fault(vmf, ops); + return dax_iomap_pte_fault(vmf, pfnp, ops); case PE_SIZE_PMD: - return dax_iomap_pmd_fault(vmf, ops); + return dax_iomap_pmd_fault(vmf, pfnp, ops); default: return VM_FAULT_FALLBACK; } diff --git a/fs/ext2/file.c b/fs/ext2/file.c index ff3a3636a5ca..d2bb7c96307d 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -99,7 +99,7 @@ static int ext2_dax_fault(struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b1da660ac3bc..3cec0b95672f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -306,7 +306,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, down_read(&EXT4_I(inode)->i_mmap_sem); } if (!IS_ERR(handle)) - result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, pe_size, NULL, &ext4_iomap_ops); else result = VM_FAULT_SIGBUS; if (write) { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 309e26c9dddb..7c6b8def6eed 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1040,7 +1040,7 @@ __xfs_filemap_fault( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, pe_size, NULL, &xfs_iomap_ops); } else { if (write_fault) ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); @@ -,7 +,7 @@ xfs_filemap_pfn_mkwrite( if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else if (IS_DAX(inode)) - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &xfs_iomap_ops); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return 
ret; diff --git a/include/linux/dax.h b/include/linux/dax.h index 122197124b9d..e7fa4b8f45bc 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -95,7 +95,7 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
[PATCH 01/17] mm: introduce MAP_SHARED_VALIDATE, a mechanism to safely define new mmap flags
From: Dan Williams The mmap(2) syscall suffers from the ABI anti-pattern of not validating unknown flags. However, proposals like MAP_SYNC need a mechanism to define new behavior that is known to fail on older kernels without the support. Define a new MAP_SHARED_VALIDATE flag pattern that is guaranteed to fail on all legacy mmap implementations. It is worth noting that the original proposal was for a standalone MAP_VALIDATE flag. However, when that could not be supported by all archs Linus observed: I see why you *think* you want a bitmap. You think you want a bitmap because you want to make MAP_VALIDATE be part of MAP_SYNC etc, so that people can do ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_SYNC, fd, 0); and "know" that MAP_SYNC actually takes. And I'm saying that whole wish is bogus. You're fundamentally depending on special semantics, just make it explicit. It's already not portable, so don't try to make it so. Rename that MAP_VALIDATE as MAP_SHARED_VALIDATE, make it have a value of 0x3, and make people do ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0); and then the kernel side is easier too (none of that random garbage playing games with looking at the "MAP_VALIDATE bit", but just another case statement in that map type thing. Boom. Done. Similar to ->fallocate() we also want the ability to validate the support for new flags on a per ->mmap() 'struct file_operations' instance basis. Towards that end arrange for flags to be generically validated against a mmap_supported_flags exported by 'struct file_operations'. By default all existing flags are implicitly supported, but new flags require MAP_SHARED_VALIDATE and per-instance-opt-in. 
Cc: Jan Kara Cc: Arnd Bergmann Cc: Andy Lutomirski Cc: Andrew Morton Suggested-by: Christoph Hellwig Suggested-by: Linus Torvalds Signed-off-by: Dan Williams Signed-off-by: Jan Kara --- arch/alpha/include/uapi/asm/mman.h | 1 + arch/mips/include/uapi/asm/mman.h| 1 + arch/parisc/include/uapi/asm/mman.h | 1 + arch/xtensa/include/uapi/asm/mman.h | 1 + include/linux/fs.h | 1 + include/linux/mman.h | 39 include/uapi/asm-generic/mman-common.h | 1 + mm/mmap.c| 21 +++ tools/include/uapi/asm-generic/mman-common.h | 1 + 9 files changed, 67 insertions(+) diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 3b26cc62dadb..92823f24890b 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -14,6 +14,7 @@ #define MAP_TYPE 0x0f/* Mask for type of mapping (OSF/1 is _wrong_) */ #define MAP_FIXED 0x100 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x10/* don't use a file */ +#define MAP_SHARED_VALIDATE 0x3/* share + validate extension flags */ /* not used by linux, but here to make sure we don't clash with OSF/1 defines */ #define _MAP_HASSEMAPHORE 0x0200 diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index da3216007fe0..c77689076577 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -30,6 +30,7 @@ #define MAP_PRIVATE0x002 /* Changes are private */ #define MAP_TYPE 0x00f /* Mask for type of mapping */ #define MAP_FIXED 0x010 /* Interpret addr exactly */ +#define MAP_SHARED_VALIDATE 0x3/* share + validate extension flags */ /* not used by linux, but here to make sure we don't clash with ABI defines */ #define MAP_RENAME 0x020 /* Assign page to file */ diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 775b5d5e41a1..36b688d52de3 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -14,6 +14,7 @@ #define MAP_TYPE 0x03/* Mask for type of mapping */ 
#define MAP_FIXED 0x04/* Interpret addr exactly */ #define MAP_ANONYMOUS 0x10/* don't use a file */ +#define MAP_SHARED_VALIDATE 0x3/* share + validate extension flags */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ #define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index b15b278aa314..ec597900eec7 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -37,6 +37,7 @@ #define MAP_PRIVATE0x002 /* Changes are private */ #define MAP_TYPE 0x00f /* Mask for type of mapping */ #define MAP_FIXED 0x010 /* Interpret addr exactly */ +#defi
[PATCH 11/17] dax: Allow tuning whether dax_insert_mapping_entry() dirties entry
Currently we dirty radix tree entry whenever dax_insert_mapping_entry() gets called for a write fault. With synchronous page faults we would like to insert clean radix tree entry and dirty it only once we call fdatasync() and update page tables to save some unnecessary cache flushing. Add 'dirty' argument to dax_insert_mapping_entry() for that. Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/dax.c | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 5ddf15161390..efc210ff6665 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, static void *dax_insert_mapping_entry(struct address_space *mapping, struct vm_fault *vmf, void *entry, sector_t sector, - unsigned long flags) + unsigned long flags, bool dirty) { struct radix_tree_root *page_tree = &mapping->page_tree; void *new_entry; pgoff_t index = vmf->pgoff; - if (vmf->flags & FAULT_FLAG_WRITE) + if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { @@ -569,7 +569,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, entry = new_entry; } - if (vmf->flags & FAULT_FLAG_WRITE) + if (dirty) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); @@ -881,7 +881,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry, } entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, - RADIX_DAX_ZERO_PAGE); + RADIX_DAX_ZERO_PAGE, false); if (IS_ERR(entry2)) { ret = VM_FAULT_SIGBUS; goto out; @@ -1182,7 +1182,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_mapping_entry(mapping, vmf, entry, dax_iomap_sector(&iomap, pos), -0); +0, write); if (IS_ERR(entry)) { error = PTR_ERR(entry); goto error_finish_iomap; @@ -1258,7 +1258,7 @@ static int dax_pmd_load_hole(struct vm_fault 
*vmf, struct iomap *iomap, goto fallback; ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, - RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE); + RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); if (IS_ERR(ret)) goto fallback; @@ -1379,7 +1379,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_mapping_entry(mapping, vmf, entry, dax_iomap_sector(&iomap, pos), - RADIX_DAX_PMD); + RADIX_DAX_PMD, write); if (IS_ERR(entry)) goto finish_iomap; -- 2.12.3 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH 05/17] dax: Create local variable for VMA in dax_iomap_pte_fault()
There are already two users and more are coming. Reviewed-by: Christoph Hellwig Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara --- fs/dax.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 116eef8d6c69..c09465884bbe 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1108,7 +1108,8 @@ static int dax_fault_return(int error) static int dax_iomap_pte_fault(struct vm_fault *vmf, const struct iomap_ops *ops) { - struct address_space *mapping = vmf->vma->vm_file->f_mapping; + struct vm_area_struct *vma = vmf->vma; + struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; @@ -1196,7 +1197,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); - count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } error = dax_insert_mapping(vmf, &iomap, pos, entry); -- 2.12.3 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH 02/17] mm: Remove VM_FAULT_HWPOISON_LARGE_MASK
It is unused. Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 065d99deb847..ca72b67153d5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1182,8 +1182,6 @@ static inline void clear_page_pfmemalloc(struct page *page) #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ #define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */ -#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ - #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \ VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ VM_FAULT_FALLBACK) -- 2.12.3 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH 13/17] dax, iomap: Add support for synchronous faults
Add a flag to iomap interface informing the caller that inode needs fdatasync(2) for returned extent to become persistent and use it in DAX fault code so that we don't map such extents into page tables immediately. Instead we propagate the information that fdatasync(2) is necessary from dax_iomap_fault() with a new VM_FAULT_NEEDDSYNC flag. Filesystem fault handler is then responsible for calling fdatasync(2) and inserting pfn into page tables. Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/dax.c | 39 +-- include/linux/iomap.h | 1 + include/linux/mm.h| 6 +- 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index efc210ff6665..bb9ff907738c 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1091,6 +1091,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, unsigned flags = IOMAP_FAULT; int error, major = 0; bool write = vmf->flags & FAULT_FLAG_WRITE; + bool sync; int vmf_ret = 0; void *entry; pfn_t pfn; @@ -1169,6 +1170,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, goto finish_iomap; } + sync = (vma->vm_flags & VM_SYNC) && (iomap.flags & IOMAP_F_DIRTY); + switch (iomap.type) { case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { @@ -1182,12 +1185,27 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_mapping_entry(mapping, vmf, entry, dax_iomap_sector(&iomap, pos), -0, write); +0, write && !sync); if (IS_ERR(entry)) { error = PTR_ERR(entry); goto error_finish_iomap; } + /* +* If we are doing synchronous page fault and inode needs fsync, +* we can insert PTE into page tables only after that happens. +* Skip insertion for now and return the pfn so that caller can +* insert it after fsync is done. 
+*/ + if (sync) { + if (WARN_ON_ONCE(!pfnp)) { + error = -EIO; + goto error_finish_iomap; + } + *pfnp = pfn; + vmf_ret = VM_FAULT_NEEDDSYNC | major; + goto finish_iomap; + } trace_dax_insert_mapping(inode, vmf, entry); if (write) error = vm_insert_mixed_mkwrite(vma, vaddr, pfn); @@ -1287,6 +1305,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, struct address_space *mapping = vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; + bool sync; unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; int result = VM_FAULT_FALLBACK; @@ -1371,6 +1390,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, if (iomap.offset + iomap.length < pos + PMD_SIZE) goto finish_iomap; + sync = (vma->vm_flags & VM_SYNC) && (iomap.flags & IOMAP_F_DIRTY); + switch (iomap.type) { case IOMAP_MAPPED: error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn); @@ -1379,10 +1400,24 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_mapping_entry(mapping, vmf, entry, dax_iomap_sector(&iomap, pos), - RADIX_DAX_PMD, write); + RADIX_DAX_PMD, write && !sync); if (IS_ERR(entry)) goto finish_iomap; + /* +* If we are doing synchronous page fault and inode needs fsync, +* we can insert PMD into page tables only after that happens. +* Skip insertion for now and return the pfn so that caller can +* insert it after fsync is done. +*/ + if (sync) { + if (WARN_ON_ONCE(!pfnp)) + goto finish_iomap; + *pfnp = pfn; + result = VM_FAULT_NEEDDSYNC; + goto finish_iomap; + } + trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry); result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, write); diff --git a/include/linux/iomap.h b/include/linux/iom
[PATCH 0/17 v4] dax, ext4, xfs: Synchronous page faults
Hello, here is the fourth version of my patches to implement synchronous page faults for DAX mappings to make flushing of DAX mappings possible from userspace so that they can be flushed on finer than page granularity and also avoid the overhead of a syscall. We use a new mmap flag MAP_SYNC to indicate that page faults for the mapping should be synchronous. The guarantee provided by this flag is: While a block is writeably mapped into page tables of this mapping, it is guaranteed to be visible in the file at that offset also after a crash. How I implement this is that ->iomap_begin() indicates by a flag that inode block mapping metadata is unstable and may need flushing (use the same test as whether fdatasync() has metadata to write). If yes, DAX fault handler refrains from inserting / write-enabling the page table entry and returns special flag VM_FAULT_NEEDDSYNC together with a PFN to map to the filesystem fault handler. The handler then calls fdatasync() (vfs_fsync_range()) for the affected range and after that calls DAX code to update the page table entry appropriately. I did some basic performance testing on the patches over ramdisk - timed latency of page faults when faulting 512 pages. I did several tests: with file preallocated / with file empty, with background file copying going on / without it, with / without MAP_SYNC (so that we get comparison). 
The results are (numbers are in microseconds): File preallocated, no background load, no MAP_SYNC: min=9 avg=10 max=46 8 - 15 us: 508 16 - 31 us: 3 32 - 63 us: 1 File preallocated, no background load, MAP_SYNC: min=9 avg=10 max=47 8 - 15 us: 508 16 - 31 us: 2 32 - 63 us: 2 File empty, no background load, no MAP_SYNC: min=21 avg=22 max=70 16 - 31 us: 506 32 - 63 us: 5 64 - 127 us: 1 File empty, no background load, MAP_SYNC: min=40 avg=124 max=242 32 - 63 us: 1 64 - 127 us: 333 128 - 255 us: 178 File empty, background load, no MAP_SYNC: min=21 avg=23 max=67 16 - 31 us: 507 32 - 63 us: 4 64 - 127 us: 1 File empty, background load, MAP_SYNC: min=94 avg=112 max=181 64 - 127 us: 489 128 - 255 us: 23 So here we can see the difference between MAP_SYNC and non-MAP_SYNC is about 100-200 us when we need to wait for transaction commit in this setup. Anyway, here are the patches and AFAICT the series is pretty much complete. The only missing piece is the tests, which Ross is working on. Comments are welcome. Changes since v3: * updated some changelogs * folded fs support for VM_SYNC flag into patches implementing the functionality * removed ->mmap_validate, use ->mmap_supported_flags instead * added some Reviewed-by tags * added manpage patch Changes since v2: * avoid unnecessary flushing of faulted page (Ross) - I've realized it makes no sense to remeasure my benchmark results (after actually doing that and seeing no difference, sigh) since I use ramdisk and not real PMEM HW and so flushes are ignored. 
* handle nojournal mode of ext4 * other smaller cleanups & fixes (Ross) * factor larger part of finishing of synchronous fault into a helper (Christoph) * reorder pfnp argument of dax_iomap_fault() (Christoph) * add XFS support from Christoph * use proper MAP_SYNC support in mmap(2) * rebased on top of 4.14-rc4 Changes since v1: * switched to using mmap flag MAP_SYNC * cleaned up fault handlers to avoid passing pfn in vmf->orig_pte * switched to not touching page tables before we are ready to insert final entry as it was unnecessary and not really simplifying anything * renamed fault flag to VM_FAULT_NEEDDSYNC * other smaller fixes found by reviewers Honza ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH 16/17] ext4: Support for synchronous DAX faults
We return IOMAP_F_DIRTY flag from ext4_iomap_begin() when asked to prepare blocks for writing and the inode has some uncommitted metadata changes. In the fault handler ext4_dax_fault() we then detect this case (through VM_FAULT_NEEDDSYNC return value) and call helper dax_finish_sync_fault() to flush metadata changes and insert page table entry. Note that this will also dirty corresponding radix tree entry which is what we want - fsync(2) will still provide data integrity guarantees for applications not using userspace flushing. And applications using userspace flushing can avoid calling fsync(2) and thus avoid the performance overhead. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara --- fs/ext4/file.c | 17 - fs/ext4/inode.c | 15 +++ fs/jbd2/journal.c| 17 + include/linux/jbd2.h | 1 + 4 files changed, 49 insertions(+), 1 deletion(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 208adfc3e673..20ad883cb330 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -295,6 +296,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, */ bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); + pfn_t pfn; if (write) { sb_start_pagefault(sb); @@ -310,9 +312,12 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, } else { down_read(&EXT4_I(inode)->i_mmap_sem); } - result = dax_iomap_fault(vmf, pe_size, NULL, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops); if (write) { ext4_journal_stop(handle); + /* Handling synchronous page fault? 
*/ + if (result & VM_FAULT_NEEDDSYNC) + result = dax_finish_sync_fault(vmf, pe_size, pfn); up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); } else { @@ -350,6 +355,13 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb return -EIO; + /* +* We don't support synchronous mappings for non-DAX files. At least +* until someone comes with a sensible use case. +*/ + if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC)) + return -EOPNOTSUPP; + file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; @@ -710,6 +722,8 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) return -EINVAL; } +#define EXT4_SUPPORTED_MAP_FLAGS (LEGACY_MAP_MASK | MAP_SYNC) + const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, @@ -719,6 +733,7 @@ const struct file_operations ext4_file_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .mmap = ext4_file_mmap, + .mmap_supported_flags = EXT4_SUPPORTED_MAP_FLAGS, .open = ext4_file_open, .release= ext4_release_file, .fsync = ext4_sync_file, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 31db875bc7a1..13a198924a0f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3394,6 +3394,19 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } #ifdef CONFIG_FS_DAX +static bool ext4_inode_datasync_dirty(struct inode *inode) +{ + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal) + return !jbd2_transaction_committed(journal, + EXT4_I(inode)->i_datasync_tid); + /* Any metadata buffers to write? 
*/ + if (!list_empty(&inode->i_mapping->private_list)) + return true; + return inode->i_state & I_DIRTY_DATASYNC; +} + static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { @@ -3466,6 +3479,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, } iomap->flags = 0; + if ((flags & IOMAP_WRITE) && ext4_inode_datasync_dirty(inode)) + iomap->flags |= IOMAP_F_DIRTY; iomap->bdev = inode->i_sb->s_bdev; iomap->dax_dev = sbi->s_daxdev; iomap->offset = first_block << blkbits; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 7d5ef3bf3f3e..fa8cde498b4b 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -738,6 +738,23 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) return err; } +/* Return 1 when transaction with given tid has already committed. */ +int jbd2_transaction_committed(journal_t *journal, tid_t tid) +{ + int ret = 1; + + read_l
[PATCH 15/17] ext4: Simplify error handling in ext4_dax_huge_fault()
If transaction starting fails, just bail out of the function immediately instead of checking for that condition throughout the function. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara --- fs/ext4/file.c | 13 +++-- 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3cec0b95672f..208adfc3e673 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -302,16 +302,17 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); + if (IS_ERR(handle)) { + up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(sb); + return VM_FAULT_SIGBUS; + } } else { down_read(&EXT4_I(inode)->i_mmap_sem); } - if (!IS_ERR(handle)) - result = dax_iomap_fault(vmf, pe_size, NULL, &ext4_iomap_ops); - else - result = VM_FAULT_SIGBUS; + result = dax_iomap_fault(vmf, pe_size, NULL, &ext4_iomap_ops); if (write) { - if (!IS_ERR(handle)) - ext4_journal_stop(handle); + ext4_journal_stop(handle); up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); } else { -- 2.12.3 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH] mmap.2: Add description of MAP_SHARED_VALIDATE and MAP_SYNC
Signed-off-by: Jan Kara --- man2/mmap.2 | 30 ++ 1 file changed, 30 insertions(+) diff --git a/man2/mmap.2 b/man2/mmap.2 index 47c3148653be..598ff0c64f7f 100644 --- a/man2/mmap.2 +++ b/man2/mmap.2 @@ -125,6 +125,21 @@ are carried through to the underlying file. to the underlying file requires the use of .BR msync (2).) .TP +.B MAP_SHARED_VALIDATE +The same as +.B MAP_SHARED +except that +.B MAP_SHARED +mappings ignore unknown flags in +.IR flags . +In contrast when creating mapping of +.B MAP_SHARED_VALIDATE +mapping type, the kernel verifies all passed flags are known and fails the +mapping with +.BR EOPNOTSUPP +otherwise. This mapping type is also required to be able to use some mapping +flags. +.TP .B MAP_PRIVATE Create a private copy-on-write mapping. Updates to the mapping are not visible to other processes @@ -352,6 +367,21 @@ option. Because of the security implications, that option is normally enabled only on embedded devices (i.e., devices where one has complete control of the contents of user memory). +.TP +.BR MAP_SYNC " (since Linux 4.15)" +This flag is available only with +.B MAP_SHARED_VALIDATE +mapping type. Mappings of +.B MAP_SHARED +type will silently ignore this flag. +This flag is supported only for files supporting DAX (direct mapping of persistent +memory). For other files, creating mapping with this flag results in +.B EOPNOTSUPP +error. Shared file mappings with this flag provide the guarantee that while +some memory is writeably mapped in the address space of the process, it will +be visible in the same file at the same offset even after the system crashes or +is rebooted. This allows users of such mappings to make data modifications +persistent in a more efficient way using appropriate CPU instructions. .PP Of the above flags, only .B MAP_FIXED -- 2.12.3 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH 06/17] dax: Create local variable for vmf->flags & FAULT_FLAG_WRITE test
There are already two users and more are coming. Reviewed-by: Christoph Hellwig Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara --- fs/dax.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index c09465884bbe..5ea71381dba0 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1116,6 +1116,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, struct iomap iomap = { 0 }; unsigned flags = IOMAP_FAULT; int error, major = 0; + bool write = vmf->flags & FAULT_FLAG_WRITE; int vmf_ret = 0; void *entry; @@ -1130,7 +1131,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, goto out; } - if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) + if (write && !vmf->cow_page) flags |= IOMAP_WRITE; entry = grab_mapping_entry(mapping, vmf->pgoff, 0); @@ -1207,7 +1208,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: - if (!(vmf->flags & FAULT_FLAG_WRITE)) { + if (!write) { vmf_ret = dax_load_hole(mapping, entry, vmf); goto finish_iomap; } -- 2.12.3 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH 07/17] dax: Inline dax_insert_mapping() into the callsite
dax_insert_mapping() has only one callsite and we will need to further fine tune what it does for synchronous faults. Just inline it into the callsite so that we don't have to pass awkward bools around. Reviewed-by: Christoph Hellwig Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara --- fs/dax.c | 46 +++--- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 5ea71381dba0..5b20c6456926 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -858,32 +858,6 @@ static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size, return rc; } -static int dax_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, - loff_t pos, void *entry) -{ - const sector_t sector = dax_iomap_sector(iomap, pos); - struct vm_area_struct *vma = vmf->vma; - struct address_space *mapping = vma->vm_file->f_mapping; - unsigned long vaddr = vmf->address; - void *ret; - int rc; - pfn_t pfn; - - rc = dax_iomap_pfn(iomap, pos, PAGE_SIZE, &pfn); - if (rc < 0) - return rc; - - ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); - if (IS_ERR(ret)) - return PTR_ERR(ret); - - trace_dax_insert_mapping(mapping->host, vmf, ret); - if (vmf->flags & FAULT_FLAG_WRITE) - return vm_insert_mixed_mkwrite(vma, vaddr, pfn); - else - return vm_insert_mixed(vma, vaddr, pfn); -} - /* * The user has performed a load from a hole in the file. 
Allocating a new * page in the file would cause excessive storage usage for workloads with @@ -1119,6 +1093,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, bool write = vmf->flags & FAULT_FLAG_WRITE; int vmf_ret = 0; void *entry; + pfn_t pfn; trace_dax_pte_fault(inode, vmf, vmf_ret); /* @@ -1201,7 +1176,24 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } - error = dax_insert_mapping(vmf, &iomap, pos, entry); + error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn); + if (error < 0) + goto error_finish_iomap; + + entry = dax_insert_mapping_entry(mapping, vmf, entry, +dax_iomap_sector(&iomap, pos), +0); + if (IS_ERR(entry)) { + error = PTR_ERR(entry); + goto error_finish_iomap; + } + + trace_dax_insert_mapping(inode, vmf, entry); + if (write) + error = vm_insert_mixed_mkwrite(vma, vaddr, pfn); + else + error = vm_insert_mixed(vma, vaddr, pfn); + /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; -- 2.12.3 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: [PATCH 17/17] xfs: support for synchronous DAX faults
> +#define XFS_MAP_SUPPORTED (LEGACY_MAP_MASK | MAP_SYNC) I'd kill this define. Also is there any good reason that we have to add LEGACY_MAP_MASK instead of assuming it's supported in the core? > #endif > .mmap = xfs_file_mmap, > + .mmap_supported_flags = XFS_MAP_SUPPORTED, > .open = xfs_file_open, > .release= xfs_file_release, > .fsync = xfs_file_fsync, I usually either reformat all members to be aligned again, or if that's too much churn (in this case it probably is) just use a single space before the = to minimize the alignment differences. Otherwise your changes look good to me. ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH 03/17] dax: Simplify arguments of dax_insert_mapping()
dax_insert_mapping() has lots of arguments and a lot of them are actually duplicated by passing vm_fault structure as well. Change the function to take the same arguments as dax_pmd_insert_mapping(). Reviewed-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/dax.c | 32 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index f001d8c72a06..0bc42ac294ca 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -820,23 +820,30 @@ int dax_writeback_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static int dax_insert_mapping(struct address_space *mapping, - struct block_device *bdev, struct dax_device *dax_dev, - sector_t sector, size_t size, void *entry, - struct vm_area_struct *vma, struct vm_fault *vmf) +static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) +{ + return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); +} + +static int dax_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, + loff_t pos, void *entry) { + const sector_t sector = dax_iomap_sector(iomap, pos); + struct vm_area_struct *vma = vmf->vma; + struct address_space *mapping = vma->vm_file->f_mapping; unsigned long vaddr = vmf->address; void *ret, *kaddr; pgoff_t pgoff; int id, rc; pfn_t pfn; - rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff); if (rc) return rc; id = dax_read_lock(); - rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); + rc = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), + &kaddr, &pfn); if (rc < 0) { dax_read_unlock(id); return rc; @@ -936,11 +943,6 @@ int __dax_zero_page_range(struct block_device *bdev, } EXPORT_SYMBOL_GPL(__dax_zero_page_range); -static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) -{ - return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); -} - static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void 
*data, struct iomap *iomap) @@ -1087,7 +1089,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; - sector_t sector; struct iomap iomap = { 0 }; unsigned flags = IOMAP_FAULT; int error, major = 0; @@ -1140,9 +1141,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, goto error_finish_iomap; } - sector = dax_iomap_sector(&iomap, pos); - if (vmf->cow_page) { + sector_t sector = dax_iomap_sector(&iomap, pos); + switch (iomap.type) { case IOMAP_HOLE: case IOMAP_UNWRITTEN: @@ -1175,8 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } - error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, - sector, PAGE_SIZE, entry, vmf->vma, vmf); + error = dax_insert_mapping(vmf, &iomap, pos, entry); /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; -- 2.12.3 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Hi
The original message was received at Thu, 19 Oct 2017 21:56:53 +0800 from lists.01.org [111.41.225.161] - The following addresses had permanent fatal errors - - Transcript of session follows - while talking to lists.01.org.: >>> MAIL From:"Bounced mail" <<< 501 "Bounced mail" ... Refused ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: [PATCH 01/17] mm: introduce MAP_SHARED_VALIDATE, a mechanism to safely define new mmap flags
On Thu, Oct 19, 2017 at 5:58 AM, Jan Kara wrote: > > From: Dan Williams > > The mmap(2) syscall suffers from the ABI anti-pattern of not validating > unknown flags. However, proposals like MAP_SYNC need a mechanism to > define new behavior that is known to fail on older kernels without the > support. Define a new MAP_SHARED_VALIDATE flag pattern that is > guaranteed to fail on all legacy mmap implementations. > > It is worth noting that the original proposal was for a standalone > MAP_VALIDATE flag. However, when that could not be supported by all > archs Linus observed: > > I see why you *think* you want a bitmap. You think you want > a bitmap because you want to make MAP_VALIDATE be part of MAP_SYNC > etc, so that people can do > > ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED > | MAP_SYNC, fd, 0); > > and "know" that MAP_SYNC actually takes. > > And I'm saying that whole wish is bogus. You're fundamentally > depending on special semantics, just make it explicit. It's already > not portable, so don't try to make it so. > > Rename that MAP_VALIDATE as MAP_SHARED_VALIDATE, make it have a value > of 0x3, and make people do > > ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED_VALIDATE > | MAP_SYNC, fd, 0); > > and then the kernel side is easier too (none of that random garbage > playing games with looking at the "MAP_VALIDATE bit", but just another > case statement in that map type thing. > > Boom. Done. > > Similar to ->fallocate() we also want the ability to validate the > support for new flags on a per ->mmap() 'struct file_operations' > instance basis. Towards that end arrange for flags to be generically > validated against a mmap_supported_flags exported by 'struct > file_operations'. By default all existing flags are implicitly > supported, but new flags require MAP_SHARED_VALIDATE and > per-instance-opt-in. 
> > Cc: Jan Kara > Cc: Arnd Bergmann > Cc: Andy Lutomirski > Cc: Andrew Morton > Suggested-by: Christoph Hellwig > Suggested-by: Linus Torvalds > Signed-off-by: Dan Williams > Signed-off-by: Jan Kara > --- > arch/alpha/include/uapi/asm/mman.h | 1 + > arch/mips/include/uapi/asm/mman.h| 1 + > arch/parisc/include/uapi/asm/mman.h | 1 + > arch/xtensa/include/uapi/asm/mman.h | 1 + > include/linux/fs.h | 1 + > include/linux/mman.h | 39 > > include/uapi/asm-generic/mman-common.h | 1 + > mm/mmap.c| 21 +++ > tools/include/uapi/asm-generic/mman-common.h | 1 + > 9 files changed, 67 insertions(+) > > diff --git a/arch/alpha/include/uapi/asm/mman.h > b/arch/alpha/include/uapi/asm/mman.h > index 3b26cc62dadb..92823f24890b 100644 > --- a/arch/alpha/include/uapi/asm/mman.h > +++ b/arch/alpha/include/uapi/asm/mman.h > @@ -14,6 +14,7 @@ > #define MAP_TYPE 0x0f/* Mask for type of mapping (OSF/1 is > _wrong_) */ > #define MAP_FIXED 0x100 /* Interpret addr exactly */ > #define MAP_ANONYMOUS 0x10/* don't use a file */ > +#define MAP_SHARED_VALIDATE 0x3/* share + validate extension > flags */ Looks good, only comment is your original feedback to move this definition next to MAP_SHARED a few lines up in all the places where we define it. ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
what happened to byte-addressable read()/write() to character device DAX?
Hi, Dan, Referring to https://patchwork.kernel.org/project/linux-nvdimm/list/?page=6 [RFC,2/2] mm, fs: daxfile, an interface for byte-addressable updates to pmem I'm unable to find the patch in the upstream Linux kernel, and am wondering if there has been a change of plan for providing support for byte-granularity read() and write() on the character device /dev/dax. I've also just started following the email threads, so I might have missed some critical discussions. Would you mind elaborating on what's happening with the proposed feature? Thanks a lot! Regards, -jane ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: [Qemu-devel] [RFC 2/2] KVM: add virtio-pmem driver
On Thu, Oct 19, 2017 at 1:01 AM, Christoph Hellwig wrote: > On Wed, Oct 18, 2017 at 08:51:37AM -0700, Dan Williams wrote: >> This use case is not "Persistent Memory". Persistent Memory is >> something you can map and make persistent with CPU instructions. >> Anything that requires a driver call is device driver managed "Shared >> Memory". > > How is this any different than the existing nvdimm_flush()? If you > really care about the not driver thing it could easily be a write > to a doorbell page or a hypercall, but in the end that's just semantics. The difference is that nvdimm_flush() is not mandatory, and that the platform will automatically perform the same flush at power-fail. Applications should be able to assume that if they are using MAP_SYNC that no other coordination with the kernel or the hypervisor is necessary. Advertising this as a generic Persistent Memory range to the guest means that the guest could theoretically use it with device-dax where there is no driver or filesystem sync interface. The hypervisor will be waiting for flush notifications and the guest will just issue cache flushes and sfence instructions. So, as far as I can see we need to differentiate this virtio-model from standard "Persistent Memory" to the guest and remove the possibility of guests/applications making the wrong assumption. Non-ODP RDMA in a guest comes to mind... ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
转发:致企业的一封信
以下课程上海,深圳,即将主办,席位预定中! 全能型车间主任实战技能 企业安全、环境(EHS)系统化实战训练 详情 请 阅读 附 件 内 容 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: what happened to byte-addressable read()/write() to character device DAX?
jane@oracle.com writes: > Hi, Dan, > > Referring to > > https://patchwork.kernel.org/project/linux-nvdimm/list/?page=6 > > [RFC,2/2] mm, fs: daxfile, an interface for byte-addressable updates > to pmem > > I'm unable to find the patch in Linux upstream kernel, and am wondering if > there has been a change of plan for providing support to byte granularity > read() and write() to character device /dev/dax. > > I've also just started following the email threads, so I might have > missed some critical discussions. Would you mind to elaborate what's > happening to the proposed feature? Device dax only supports mmap for data path access. There are no plans to support read(2) or write(2). Cheers, Jeff ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: what happened to byte-addressable read()/write() to character device DAX?
On Thu, Oct 19, 2017 at 2:41 PM, Jeff Moyer wrote: > jane@oracle.com writes: > >> Hi, Dan, >> >> Referring to >> >> https://patchwork.kernel.org/project/linux-nvdimm/list/?page=6 >> >> [RFC,2/2] mm, fs: daxfile, an interface for byte-addressable updates >> to pmem >> >> I'm unable to find the patch in Linux upstream kernel, and am wondering if >> there has been a change of plan for providing support to byte granularity >> read() and write() to character device /dev/dax. >> >> I've also just started following the email threads, so I might have >> missed some critical discussions. Would you mind to elaborate what's >> happening to the proposed feature? > > Device dax only supports mmap for data path access. There are no plans > to support read(2) or write(2). Yes, and the patch set referenced above was not about read(2)/write(2); it was about support for mmap writes without requiring them to be backstopped by calls to msync(). That proposal morphed into the MAP_SYNC and MAP_DIRECT proposals. MAP_SYNC is targeting 4.15. MAP_DIRECT is being reworked into an RDMA-memory-registration-specific interface. ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: what happened to byte-addressable read()/write() to character device DAX?
On 10/19/2017 03:33 PM, Dan Williams wrote: On Thu, Oct 19, 2017 at 2:41 PM, Jeff Moyer wrote: jane@oracle.com writes: Hi, Dan, Referring to https://patchwork.kernel.org/project/linux-nvdimm/list/?page=6 [RFC,2/2] mm, fs: daxfile, an interface for byte-addressable updates to pmem I'm unable to find the patch in Linux upstream kernel, and am wondering if there has been a change of plan for providing support to byte granularity read() and write() to character device /dev/dax. I've also just started following the email threads, so I might have missed some critical discussions. Would you mind to elaborate what's happening to the proposed feature? Device dax only supports mmap for data path access. There are no plans to support read(2) or write(2). Yes, and the patch set referenced about was not about read(2)/write(2) it was about support for mmap writes without requiring them to be backstopped by calls to msync(). That proposal morphed into the MAP_SYNC and MAP_DIRECT proposals. MAP_SYNC is targeting 4.15. MAP_DIRECT is being reworked into an RDMA-memory-registration-specific interface. Thank you both for the clarification! Is there documents that elaborate the new resolve in detail? -jane ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: what happened to byte-addressable read()/write() to character device DAX?
On Thu, Oct 19, 2017 at 4:59 PM, wrote: [..] > Thank you both for the clarification! > Is there documents that elaborate the new resolve in detail? This might answer some of your questions: https://lwn.net/Articles/731706/ ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: [PATCH v2] dev/dax: fix uninitialized variable build warning
On Wed, Oct 18, 2017 at 11:21 AM, Ross Zwisler wrote: > Fix this build warning: > > warning: 'phys' may be used uninitialized in this function > [-Wuninitialized] > > As reported here: > > https://lkml.org/lkml/2017/10/16/152 > http://kisskb.ellerman.id.au/kisskb/buildresult/13181373/log/ > > Signed-off-by: Ross Zwisler > --- > drivers/dax/device.c | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/drivers/dax/device.c b/drivers/dax/device.c > index e9f3b3e..6833ada 100644 > --- a/drivers/dax/device.c > +++ b/drivers/dax/device.c > @@ -222,7 +222,8 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax > *dev_dax, pgoff_t pgoff, > unsigned long size) > { > struct resource *res; > - phys_addr_t phys; > + /* gcc-4.6.3-nolibc for i386 complains that this is uninitialized */ > + phys_addr_t uninitialized_var(phys); Thanks Ross, applied. ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH v3 04/13] dax: stop using VM_HUGEPAGE for dax
This flag is deprecated in favor of the vma_is_dax() check in transparent_hugepage_enabled() added in commit baabda261424 "mm: always enable thp for dax mappings" Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Ross Zwisler Reviewed-by: Jan Kara Signed-off-by: Dan Williams --- drivers/dax/device.c |1 - fs/ext4/file.c |1 - fs/xfs/xfs_file.c|2 -- 3 files changed, 4 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index ed79d006026e..74a35eb5e6d3 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -450,7 +450,6 @@ static int dax_mmap(struct file *filp, struct vm_area_struct *vma) return rc; vma->vm_ops = &dax_vm_ops; - vma->vm_flags |= VM_HUGEPAGE; return 0; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0cc9d205bd96..a54e1b4c49f9 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -352,7 +352,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; - vma->vm_flags |= VM_HUGEPAGE; } else { vma->vm_ops = &ext4_file_vm_ops; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c419c6fdb769..c6780743f8ec 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1133,8 +1133,6 @@ xfs_file_mmap( { file_accessed(filp); vma->vm_ops = &xfs_file_vm_ops; - if (IS_DAX(file_inode(filp))) - vma->vm_flags |= VM_HUGEPAGE; return 0; } ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH v3 00/13] dax: fix dma vs truncate and remove 'page-less' support
Changes since v2 [1]: * Add 'dax: handle truncate of dma-busy pages' which builds on the removal of page-less dax to fix a latent bug handling dma vs truncate. * Disable get_user_pages_fast() for dax * Disable RDMA memory registrations against filesystem-DAX mappings for non-ODP (On Demand Paging / Shared Virtual Memory) hardware. * Fix a compile error when building with HMM enabled --- tl;dr: A brute force approach to ensure that truncate waits for any in-flight DMA before freeing filesystem-DAX blocks to the filesystem's block allocator. While reviewing the MAP_DIRECT proposal Christoph noted: get_user_pages on DAX doesn't give the same guarantees as on pagecache or anonymous memory, and that is the problem we need to fix. In fact I'm pretty sure if we try hard enough (and we might have to try very hard) we can see the same problem with plain direct I/O and without any RDMA involved, e.g. do a larger direct I/O write to memory that is mmap()ed from a DAX file, then truncate the DAX file and reallocate the blocks, and we might corrupt that new file. We'll probably need a special setup where there is little other chance but to reallocate those used blocks. So what we need to do first is to fix get_user_pages vs unmapping DAX mmap()ed blocks, be that from a hole punch, truncate, COW operation, etc. I was able to trigger the failure with "[PATCH v3 08/13] tools/testing/nvdimm: add 'bio_delay' mechanism" to keep block i/o pages busy so a punch-hole operation can truncate the blocks before the DMA finishes. The solution presented is not pretty. It creates a stream of leases, one for each get_user_pages() invocation, and polls page reference counts until DMA stops. We're missing a reliable way to not only trap the DMA-idle event, but also block new references being taken on pages while truncate is allowed to progress. 
"[PATCH v3 12/13] dax: handle truncate of dma-busy pages" presents other options considered, and notes that this solution can only be viewed as a stop-gap. Given the need to poll page-reference counts this approach builds on the removal of 'page-less DAX' support. From the last submission Andrew asked for clarification on the move to now require pages for DAX. Quoting "[PATCH v3 02/13] dax: require 'struct page' for filesystem dax": Note that when the initial dax support was being merged a few years back there was concern that struct page was unsuitable for use with next generation persistent memory devices. The theoretical concern was that struct page access, being such a hotly used data structure in the kernel, would lead to media wear out. While that was a reasonable conservative starting position it has not held true in practice. We have long since committed to using devm_memremap_pages() to support higher order kernel functionality that needs get_user_pages() and pfn_to_page(). --- Dan Williams (13): dax: quiet bdev_dax_supported() dax: require 'struct page' for filesystem dax dax: stop using VM_MIXEDMAP for dax dax: stop using VM_HUGEPAGE for dax dax: stop requiring a live device for dax_flush() dax: store pfns in the radix dax: warn if dma collides with truncate tools/testing/nvdimm: add 'bio_delay' mechanism IB/core: disable memory registration of fileystem-dax vmas mm: disable get_user_pages_fast() for dax fs: use smp_load_acquire in break_{layout,lease} dax: handle truncate of dma-busy pages xfs: wire up FL_ALLOCATED support arch/powerpc/sysdev/axonram.c |1 drivers/dax/device.c |1 drivers/dax/super.c | 18 +- drivers/infiniband/core/umem.c| 49 - drivers/s390/block/dcssblk.c |1 fs/Kconfig|1 fs/dax.c | 296 - fs/ext2/file.c|1 fs/ext4/file.c|1 fs/locks.c| 17 ++ fs/xfs/xfs_aops.c | 24 +++ fs/xfs/xfs_file.c | 66 +++ fs/xfs/xfs_inode.h|1 fs/xfs/xfs_ioctl.c|7 - include/linux/dax.h | 23 +++ include/linux/fs.h| 32 +++- include/linux/vma.h | 33 mm/gup.c | 75 
mm/huge_memory.c |8 - mm/ksm.c |3 mm/madvise.c |2 mm/memory.c | 20 ++ mm/migrate.c |3 mm/mlock.c|5 - mm/mmap.c |8 - tools/testing/nvdimm/Kbuild |1 tools/testing/nvdimm/test/iomap.c | 62 +++ tools/testing/nvdimm/test/nfit.c | 34 tool
[PATCH v3 05/13] dax: stop requiring a live device for dax_flush()
Now that dax_flush() is no longer a driver callback (commit c3ca015fab6d "dax: remove the pmem_dax_ops->flush abstraction"), stop requiring the dax_read_lock() to be held and the device to be alive. This is in preparation for switching filesystem-dax to store pfns instead of sectors in the radix. Signed-off-by: Dan Williams --- drivers/dax/super.c |3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 26c324a5aef4..be65430b4483 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -280,9 +280,6 @@ EXPORT_SYMBOL_GPL(dax_copy_from_iter); void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) { - if (unlikely(!dax_alive(dax_dev))) - return; - if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags))) return; ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH v3 01/13] dax: quiet bdev_dax_supported()
Before we add another failure reason, quiet the existing log messages. Leave it to the caller to decide if bdev_dax_supported() failures are errors worth emitting to the log. Reported-by: Jeff Moyer Signed-off-by: Dan Williams --- drivers/dax/super.c |8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 557b93703532..b0cc8117eebe 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -92,21 +92,21 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize) long len; if (blocksize != PAGE_SIZE) { - pr_err("VFS (%s): error: unsupported blocksize for dax\n", + pr_debug("VFS (%s): error: unsupported blocksize for dax\n", sb->s_id); return -EINVAL; } err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); if (err) { - pr_err("VFS (%s): error: unaligned partition for dax\n", + pr_debug("VFS (%s): error: unaligned partition for dax\n", sb->s_id); return err; } dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); if (!dax_dev) { - pr_err("VFS (%s): error: device does not support dax\n", + pr_debug("VFS (%s): error: device does not support dax\n", sb->s_id); return -EOPNOTSUPP; } @@ -118,7 +118,7 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize) put_dax(dax_dev); if (len < 1) { - pr_err("VFS (%s): error: dax access failed (%ld)", + pr_debug("VFS (%s): error: dax access failed (%ld)\n", sb->s_id, len); return len < 0 ? len : -EIO; } ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH v3 03/13] dax: stop using VM_MIXEDMAP for dax
VM_MIXEDMAP is used by dax to direct mm paths like vm_normal_page() that the memory page it is dealing with is not typical memory from the linear map. The get_user_pages_fast() path, since it does not resolve the vma, is already using {pte,pmd}_devmap() as a stand-in for VM_MIXEDMAP, so we use that as a VM_MIXEDMAP replacement in some locations. In the cases where there is no pte to consult we fallback to using vma_is_dax() to detect the VM_MIXEDMAP special case. Now that we always have pages for DAX we can stop setting VM_MIXEDMAP. This also means we no longer need to worry about safely manipulating vm_flags in a future where we support dynamically changing the dax mode of a file. Cc: Jan Kara Cc: Michal Hocko Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Andrew Morton Cc: Ross Zwisler Cc: "Kirill A. Shutemov" Signed-off-by: Dan Williams --- drivers/dax/device.c |2 +- fs/ext2/file.c |1 - fs/ext4/file.c |2 +- fs/xfs/xfs_file.c|2 +- include/linux/vma.h | 33 + mm/huge_memory.c |8 mm/ksm.c |3 +++ mm/madvise.c |2 +- mm/memory.c | 20 ++-- mm/migrate.c |3 ++- mm/mlock.c |5 +++-- mm/mmap.c|8 12 files changed, 71 insertions(+), 18 deletions(-) create mode 100644 include/linux/vma.h diff --git a/drivers/dax/device.c b/drivers/dax/device.c index e9f3b3e4bbf4..ed79d006026e 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -450,7 +450,7 @@ static int dax_mmap(struct file *filp, struct vm_area_struct *vma) return rc; vma->vm_ops = &dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + vma->vm_flags |= VM_HUGEPAGE; return 0; } diff --git a/fs/ext2/file.c b/fs/ext2/file.c index ff3a3636a5ca..70657e8550ed 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -125,7 +125,6 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &ext2_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP; return 0; } #else diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b1da660ac3bc..0cc9d205bd96 100644 --- a/fs/ext4/file.c +++ 
b/fs/ext4/file.c @@ -352,7 +352,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + vma->vm_flags |= VM_HUGEPAGE; } else { vma->vm_ops = &ext4_file_vm_ops; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 309e26c9dddb..c419c6fdb769 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1134,7 +1134,7 @@ xfs_file_mmap( file_accessed(filp); vma->vm_ops = &xfs_file_vm_ops; if (IS_DAX(file_inode(filp))) - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + vma->vm_flags |= VM_HUGEPAGE; return 0; } diff --git a/include/linux/vma.h b/include/linux/vma.h new file mode 100644 index ..135ad5262cd1 --- /dev/null +++ b/include/linux/vma.h @@ -0,0 +1,33 @@ +/* + * Copyright(c) 2017 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __VMA_H__ +#define __VMA_H__ +#include +#include +#include +#include + +/* + * There are several vma types that have special handling in the + * get_user_pages() path and other core mm paths that must not assume + * normal pages. vma_is_special() consolidates checks for VM_SPECIAL, + * hugetlb and dax vmas, but note that there are 'special' vmas and + * special circumstances beyond these types. In other words this helper + * is not exhaustive. 
+ */ +static inline bool vma_is_special(struct vm_area_struct *vma) +{ + return vma && (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) + || vma_is_dax(vma)); +} +#endif /* __VMA_H__ */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 269b5df58543..c69d30e27fd9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -765,11 +765,11 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, * but we need to be consistent with PTEs and architectures that * can't support a 'special' bit. */ - BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON(!((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) + || pfn_t_dev
[PATCH v3 10/13] mm: disable get_user_pages_fast() for dax
In preparation for solving the dax-dma vs truncate race, disable get_user_pages_fast(). The race fix relies on the vma being available. We can still support get_user_pages_fast() for 1GB (pud) 'devmap' mappings since those are only implemented for device-dax, everything else needs the vma and the gup-slow-path in case it might be a filesytem-dax mapping. Cc: Michal Hocko Cc: Dave Hansen Cc: "Kirill A. Shutemov" Signed-off-by: Dan Williams --- mm/gup.c | 48 +--- 1 file changed, 13 insertions(+), 35 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index b2b4d4263768..308be897d22a 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1290,22 +1290,12 @@ static inline pte_t gup_get_pte(pte_t *ptep) } #endif -static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) -{ - while ((*nr) - nr_start) { - struct page *page = pages[--(*nr)]; - - ClearPageReferenced(page); - put_page(page); - } -} - #ifdef __HAVE_ARCH_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; - int nr_start = *nr, ret = 0; + int ret = 0; pte_t *ptep, *ptem; ptem = ptep = pte_offset_map(&pmd, addr); @@ -1323,13 +1313,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, if (!pte_access_permitted(pte, write)) goto pte_unmap; - if (pte_devmap(pte)) { - pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); - if (unlikely(!pgmap)) { - undo_dev_pagemap(nr, nr_start, pages); - goto pte_unmap; - } - } else if (pte_special(pte)) + if (pte_devmap(pte) || (pte_special(pte))) goto pte_unmap; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); @@ -1378,6 +1362,16 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, #endif /* __HAVE_ARCH_PTE_SPECIAL */ #if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) +{ + while ((*nr) - nr_start) { + struct page *page = pages[--(*nr)]; + + 
ClearPageReferenced(page); + put_page(page); + } +} + static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, struct page **pages, int *nr) { @@ -1402,15 +1396,6 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, return 1; } -static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, - unsigned long end, struct page **pages, int *nr) -{ - unsigned long fault_pfn; - - fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - return __gup_device_huge(fault_pfn, addr, end, pages, nr); -} - static int __gup_device_huge_pud(pud_t pud, unsigned long addr, unsigned long end, struct page **pages, int *nr) { @@ -1420,13 +1405,6 @@ static int __gup_device_huge_pud(pud_t pud, unsigned long addr, return __gup_device_huge(fault_pfn, addr, end, pages, nr); } #else -static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, - unsigned long end, struct page **pages, int *nr) -{ - BUILD_BUG(); - return 0; -} - static int __gup_device_huge_pud(pud_t pud, unsigned long addr, unsigned long end, struct page **pages, int *nr) { @@ -1445,7 +1423,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; if (pmd_devmap(orig)) - return __gup_device_huge_pmd(orig, addr, end, pages, nr); + return 0; refs = 0; page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH v3 02/13] dax: require 'struct page' for filesystem dax
If a dax buffer from a device that does not map pages is passed to read(2) or write(2) as a target for direct-I/O it triggers SIGBUS. If gdb attempts to examine the contents of a dax buffer from a device that does not map pages it triggers SIGBUS. If fork(2) is called on a process with a dax mapping from a device that does not map pages it triggers SIGBUS. 'struct page' is required otherwise several kernel code paths break in surprising ways. Disable filesystem-dax on devices that do not map pages. In addition to needing pfn_to_page() to be valid we also require devmap pages. We need this to detect dax pages in the get_user_pages_fast() path and so that we can stop managing the VM_MIXEDMAP flag. This impacts the dax drivers that do not use dev_memremap_pages(): brd, dcssblk, and axonram. Note that when the initial dax support was being merged a few years back there was concern that struct page was unsuitable for use with next generation persistent memory devices. The theoretical concern was that struct page access, being such a hotly used data structure in the kernel, would lead to media wear out. While that was a reasonable conservative starting position it has not held true in practice. We have long since committed to using devm_memremap_pages() to support higher order kernel functionality that needs get_user_pages() and pfn_to_page(). 
Cc: Jan Kara Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Ross Zwisler Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Gerald Schaefer Signed-off-by: Dan Williams --- arch/powerpc/sysdev/axonram.c |1 + drivers/dax/super.c |7 +++ drivers/s390/block/dcssblk.c |1 + 3 files changed, 9 insertions(+) diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index c60e84e4558d..9da64d95e6f1 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -172,6 +172,7 @@ static size_t axon_ram_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, static const struct dax_operations axon_ram_dax_ops = { .direct_access = axon_ram_dax_direct_access, + .copy_from_iter = axon_ram_copy_from_iter, }; diff --git a/drivers/dax/super.c b/drivers/dax/super.c index b0cc8117eebe..26c324a5aef4 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -123,6 +124,12 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize) return len < 0 ? len : -EIO; } + if (!pfn_t_devmap(pfn)) { + pr_debug("VFS (%s): error: dax support not enabled\n", + sb->s_id); + return -EOPNOTSUPP; + } + return 0; } EXPORT_SYMBOL_GPL(__bdev_dax_supported); diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 7abb240847c0..e7e5db07e339 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -52,6 +52,7 @@ static size_t dcssblk_dax_copy_from_iter(struct dax_device *dax_dev, static const struct dax_operations dcssblk_dax_ops = { .direct_access = dcssblk_dax_direct_access, + .copy_from_iter = dcssblk_dax_copy_from_iter, }; ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH v3 08/13] tools/testing/nvdimm: add 'bio_delay' mechanism
In support of testing truncate colliding with dma add a mechanism that delays the completion of block I/O requests by a programmable number of seconds. This allows a truncate operation to be issued while page references are held for direct-I/O. Signed-off-by: Dan Williams --- tools/testing/nvdimm/Kbuild |1 + tools/testing/nvdimm/test/iomap.c | 62 + tools/testing/nvdimm/test/nfit.c | 34 ++ tools/testing/nvdimm/test/nfit_test.h |1 + 4 files changed, 98 insertions(+) diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild index d870520da68b..5946cf3afe74 100644 --- a/tools/testing/nvdimm/Kbuild +++ b/tools/testing/nvdimm/Kbuild @@ -15,6 +15,7 @@ ldflags-y += --wrap=insert_resource ldflags-y += --wrap=remove_resource ldflags-y += --wrap=acpi_evaluate_object ldflags-y += --wrap=acpi_evaluate_dsm +ldflags-y += --wrap=bio_endio DRIVERS := ../../../drivers NVDIMM_SRC := $(DRIVERS)/nvdimm diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index e1f75a1914a1..1f5d7182ca9c 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -10,6 +10,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
*/ +#include #include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include #include #include #include "nfit_test.h" @@ -388,4 +390,64 @@ union acpi_object * __wrap_acpi_evaluate_dsm(acpi_handle handle, const guid_t *g } EXPORT_SYMBOL(__wrap_acpi_evaluate_dsm); +static DEFINE_SPINLOCK(bio_lock); +static struct bio *biolist; +int bio_do_queue; + +static void run_bio(struct work_struct *work) +{ + struct delayed_work *dw = container_of(work, typeof(*dw), work); + struct bio *bio, *next; + + pr_info("%s\n", __func__); + spin_lock(&bio_lock); + bio_do_queue = 0; + bio = biolist; + biolist = NULL; + spin_unlock(&bio_lock); + + while (bio) { + next = bio->bi_next; + bio->bi_next = NULL; + bio_endio(bio); + bio = next; + } + kfree(dw); +} + +void nfit_test_inject_bio_delay(int sec) +{ + struct delayed_work *dw = kzalloc(sizeof(*dw), GFP_KERNEL); + + spin_lock(&bio_lock); + if (!bio_do_queue) { + pr_info("%s: %d seconds\n", __func__, sec); + INIT_DELAYED_WORK(dw, run_bio); + bio_do_queue = 1; + schedule_delayed_work(dw, sec * HZ); + dw = NULL; + } + spin_unlock(&bio_lock); +} +EXPORT_SYMBOL_GPL(nfit_test_inject_bio_delay); + +void __wrap_bio_endio(struct bio *bio) +{ + int did_q = 0; + + spin_lock(&bio_lock); + if (bio_do_queue) { + bio->bi_next = biolist; + biolist = bio; + did_q = 1; + } + spin_unlock(&bio_lock); + + if (did_q) + return; + + bio_endio(bio); +} +EXPORT_SYMBOL_GPL(__wrap_bio_endio); + MODULE_LICENSE("GPL v2"); diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index bef419d4266d..2c871c8b4a56 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -656,6 +656,39 @@ static const struct attribute_group *nfit_test_dimm_attribute_groups[] = { NULL, }; +static ssize_t bio_delay_show(struct device_driver *drv, char *buf) +{ + return sprintf(buf, "0\n"); +} + +static ssize_t bio_delay_store(struct device_driver *drv, const char *buf, + size_t count) +{ + unsigned long delay; + int rc 
= kstrtoul(buf, 0, &delay); + + if (rc < 0) + return rc; + + nfit_test_inject_bio_delay(delay); + return count; +} +DRIVER_ATTR_RW(bio_delay); + +static struct attribute *nfit_test_driver_attributes[] = { + &driver_attr_bio_delay.attr, + NULL, +}; + +static struct attribute_group nfit_test_driver_attribute_group = { + .attrs = nfit_test_driver_attributes, +}; + +static const struct attribute_group *nfit_test_driver_attribute_groups[] = { + &nfit_test_driver_attribute_group, + NULL, +}; + static int nfit_test0_alloc(struct nfit_test *t) { size_t nfit_size = sizeof(struct acpi_nfit_system_address) * NUM_SPA @@ -1905,6 +1938,7 @@ static struct platform_driver nfit_test_driver = { .remove = nfit_test_remove, .driver = { .name = KBUILD_MODNAME, + .groups = nfit_test_driver_attribute_groups, }, .id_table = nfit_test_id, }; diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h index d3d63dd5ed38..0d818d2adaf7 100644 --- a/tools/testing/nvdimm/test/nfit_test.h +++ b/tools/testing/nvdimm/test/nfit_test.h @@ -46,4 +46,5 @@ void nfit_test_setup(nfit_test_lookup_fn lookup, nf
[PATCH v3 09/13] IB/core: disable memory registration of filesystem-dax vmas
Until there is a solution to the dma-to-dax vs truncate problem it is not safe to allow RDMA to create long standing memory registrations against filesytem-dax vmas. Device-dax vmas do not have this problem and are explicitly allowed. This is temporary until a "memory registration with layout-lease" mechanism can be implemented, and is limited to non-ODP (On Demand Paging) capable RDMA devices. Cc: Sean Hefty Cc: Doug Ledford Cc: Hal Rosenstock Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Ross Zwisler Cc: Jason Gunthorpe Cc: Signed-off-by: Dan Williams --- drivers/infiniband/core/umem.c | 49 +++- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 21e60b1e2ff4..c30d286c1f24 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -147,19 +147,21 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem->hugetlb = 1; page_list = (struct page **) __get_free_page(GFP_KERNEL); - if (!page_list) { - put_pid(umem->pid); - kfree(umem); - return ERR_PTR(-ENOMEM); - } + if (!page_list) + goto err_pagelist; /* -* if we can't alloc the vma_list, it's not so bad; -* just assume the memory is not hugetlb memory +* If DAX is enabled we need the vma to protect against +* registering filesystem-dax memory. Otherwise we can tolerate +* a failure to allocate the vma_list and just assume that all +* vmas are not hugetlb-vmas. 
*/ vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL); - if (!vma_list) + if (!vma_list) { + if (IS_ENABLED(CONFIG_FS_DAX)) + goto err_vmalist; umem->hugetlb = 0; + } npages = ib_umem_num_pages(umem); @@ -199,15 +201,34 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (ret < 0) goto out; - umem->npages += ret; cur_base += ret * PAGE_SIZE; npages -= ret; for_each_sg(sg_list_start, sg, ret, i) { - if (vma_list && !is_vm_hugetlb_page(vma_list[i])) - umem->hugetlb = 0; + struct vm_area_struct *vma; + struct inode *inode; sg_set_page(sg, page_list[i], PAGE_SIZE, 0); + umem->npages++; + + if (!vma_list) + continue; + vma = vma_list[i]; + + if (!is_vm_hugetlb_page(vma)) + umem->hugetlb = 0; + + if (!vma_is_dax(vma)) + continue; + + /* device-dax is safe for rdma... */ + inode = file_inode(vma->vm_file); + if (inode->i_mode == S_IFCHR) + continue; + + /* ...filesystem-dax is not. */ + ret = -EOPNOTSUPP; + goto out; } /* preparing for next loop */ @@ -242,6 +263,12 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, free_page((unsigned long) page_list); return ret < 0 ? ERR_PTR(ret) : umem; +err_vmalist: + free_page((unsigned long) page_list); +err_pagelist: + put_pid(umem->pid); + kfree(umem); + return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(ib_umem_get); ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH v3 07/13] dax: warn if dma collides with truncate
Catch cases where truncate encounters pages that are still under active dma. This warning is a canary for potential data corruption as truncated blocks could be allocated to a new file while the device is still perform i/o. Cc: Jan Kara Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Ross Zwisler Signed-off-by: Dan Williams --- fs/dax.c | 33 + 1 file changed, 33 insertions(+) diff --git a/fs/dax.c b/fs/dax.c index ac6497dcfebd..b03f547b36e7 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -437,6 +437,38 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, return entry; } +static unsigned long dax_entry_size(void *entry) +{ + if (dax_is_zero_entry(entry)) + return 0; + else if (dax_is_pmd_entry(entry)) + return HPAGE_SIZE; + else + return PAGE_SIZE; +} + +static void dax_check_truncate(void *entry) +{ + unsigned long pfn = dax_radix_pfn(entry); + unsigned long size = dax_entry_size(entry); + unsigned long end_pfn; + + if (!size) + return; + end_pfn = pfn + size / PAGE_SIZE; + for (; pfn < end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + + /* +* devmap pages are idle when their count is 1 and the +* only path that increases their count is +* get_user_pages(). +*/ + WARN_ONCE(page_ref_count(page) > 1, + "dax-dma truncate collision\n"); + } +} + static int __dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index, bool trunc) { @@ -452,6 +484,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))) goto out; + dax_check_truncate(entry); radix_tree_delete(page_tree, index); mapping->nrexceptional--; ret = 1; ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH v3 12/13] dax: handle truncate of dma-busy pages
get_user_pages() pins file backed memory pages for access by dma devices. However, it only pins the memory pages, not the page-to-file offset association. If a file is truncated the pages are mapped out of the file and dma may continue indefinitely into a page that is owned by a device driver. This breaks coherency of the file vs dma, but the assumption is that if userspace wants the file-space truncated it does not matter what data is inbound from the device, it is not relevant anymore. The assumptions of the truncate-page-cache model are broken by DAX where the target DMA page *is* the filesystem block. Leaving the page pinned for DMA, but truncating the file block out of the file, means that the filesystem is free to reallocate a block under active DMA to another file! Here are some possible options for fixing this situation ('truncate' and 'fallocate(punch hole)' are synonymous below): 1/ Fail truncate while any file blocks might be under dma 2/ Block (sleep-wait) truncate while any file blocks might be under dma 3/ Remap file blocks to a "lost+found"-like file-inode where dma can continue and we might see what inbound data from DMA was mapped out of the original file. Blocks in this file could be freed back to the filesystem when dma eventually ends. 4/ Disable dax until option 3 or another long term solution has been implemented. However, filesystem-dax is still marked experimental for concerns like this. Option 1 will throw failures where userspace has never expected them before, option 2 might hang the truncating process indefinitely, and option 3 requires per filesystem enabling to remap blocks from one inode to another. Option 2 is implemented in this patch for the DAX path with the expectation that non-transient users of get_user_pages() (RDMA) are disallowed from setting up dax mappings and that the potential delay introduced to the truncate path is acceptable compared to the response time of the page cache case. 
This can only be seen as a stop-gap until we can solve the problem of safely sequestering unallocated filesystem blocks under active dma. The solution introduces a new FL_ALLOCATED lease to pin the allocated blocks in a dax file while dma might be accessing them. It behaves identically to an FL_LAYOUT lease save for the fact that it is immediately sheduled to be reaped, and that the only path that waits for its removal is the truncate path. We can not reuse FL_LAYOUT directly since that would deadlock in the case where userspace did a direct-I/O operation with a target buffer backed by an mmap range of the same file. Credit / inspiration for option 3 goes to Dave Hansen, who proposed something similar as an alternative way to solve the problem that MAP_DIRECT was trying to solve. Cc: Jan Kara Cc: Jeff Moyer Cc: Dave Chinner Cc: Matthew Wilcox Cc: Alexander Viro Cc: "Darrick J. Wong" Cc: Ross Zwisler Cc: Jeff Layton Cc: "J. Bruce Fields" Cc: Dave Hansen Reported-by: Christoph Hellwig Signed-off-by: Dan Williams --- fs/Kconfig |1 fs/dax.c| 188 +++ fs/locks.c | 17 - include/linux/dax.h | 23 ++ include/linux/fs.h | 22 +- mm/gup.c| 27 ++- 6 files changed, 268 insertions(+), 10 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index 7aee6d699fd6..a7b31a96a753 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -37,6 +37,7 @@ source "fs/f2fs/Kconfig" config FS_DAX bool "Direct Access (DAX) support" depends on MMU + depends on FILE_LOCKING depends on !(ARM || MIPS || SPARC) select FS_IOMAP select DAX diff --git a/fs/dax.c b/fs/dax.c index b03f547b36e7..e0a3958fc5f2 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -1481,3 +1482,190 @@ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, } } EXPORT_SYMBOL_GPL(dax_iomap_fault); + +enum dax_lease_flags { + DAX_LEASE_PAGES, + DAX_LEASE_BREAK, +}; + +struct dax_lease { + struct page **dl_pages; + unsigned long dl_nr_pages; + unsigned long 
dl_state; + struct file *dl_file; + atomic_t dl_count; + /* +* Once the lease is taken and the pages have references we +* start the reap_work to poll for lease release while acquiring +* fs locks that synchronize with truncate. So, either reap_work +* cleans up the dax_lease instances or truncate itself. +* +* The break_work sleepily polls for DMA completion and then +* unlocks/removes the lease. +*/ + struct delayed_work dl_reap_work; + struct delayed_work dl_break_work; +}; + +static void put_dax_lease(struct dax_lease *dl) +{ + if (atomic_dec_and_test(&dl->dl_count)) { + fput(dl->dl_file); + kfree(dl
[PATCH v3 06/13] dax: store pfns in the radix
In preparation for examining the busy state of dax pages in the truncate path, switch from sectors to pfns in the radix. Cc: Jan Kara Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Ross Zwisler Signed-off-by: Dan Williams --- fs/dax.c | 75 +++--- 1 file changed, 28 insertions(+), 47 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index f001d8c72a06..ac6497dcfebd 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -72,16 +72,15 @@ fs_initcall(init_dax_wait_table); #define RADIX_DAX_ZERO_PAGE(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) #define RADIX_DAX_EMPTY(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) -static unsigned long dax_radix_sector(void *entry) +static unsigned long dax_radix_pfn(void *entry) { return (unsigned long)entry >> RADIX_DAX_SHIFT; } -static void *dax_radix_locked_entry(sector_t sector, unsigned long flags) +static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags) { return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | - ((unsigned long)sector << RADIX_DAX_SHIFT) | - RADIX_DAX_ENTRY_LOCK); + (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK); } static unsigned int dax_radix_order(void *entry) @@ -525,12 +524,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, */ static void *dax_insert_mapping_entry(struct address_space *mapping, struct vm_fault *vmf, - void *entry, sector_t sector, + void *entry, pfn_t pfn_t, unsigned long flags) { struct radix_tree_root *page_tree = &mapping->page_tree; - void *new_entry; + unsigned long pfn = pfn_t_to_pfn(pfn_t); pgoff_t index = vmf->pgoff; + void *new_entry; if (vmf->flags & FAULT_FLAG_WRITE) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -547,7 +547,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, } spin_lock_irq(&mapping->tree_lock); - new_entry = dax_radix_locked_entry(sector, flags); + new_entry = dax_radix_locked_entry(pfn, flags); if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { /* @@ -653,17 +653,14 @@ static void 
dax_mapping_entry_mkclean(struct address_space *mapping, i_mmap_unlock_read(mapping); } -static int dax_writeback_one(struct block_device *bdev, - struct dax_device *dax_dev, struct address_space *mapping, - pgoff_t index, void *entry) +static int dax_writeback_one(struct dax_device *dax_dev, + struct address_space *mapping, pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; - void *entry2, **slot, *kaddr; - long ret = 0, id; - sector_t sector; - pgoff_t pgoff; + void *entry2, **slot; + unsigned long pfn; + long ret = 0; size_t size; - pfn_t pfn; /* * A page got tagged dirty in DAX mapping? Something is seriously @@ -682,7 +679,7 @@ static int dax_writeback_one(struct block_device *bdev, * compare sectors as we must not bail out due to difference in lockbit * or entry type. */ - if (dax_radix_sector(entry2) != dax_radix_sector(entry)) + if (dax_radix_pfn(entry2) != dax_radix_pfn(entry)) goto put_unlocked; if (WARN_ON_ONCE(dax_is_empty_entry(entry) || dax_is_zero_entry(entry))) { @@ -712,29 +709,11 @@ static int dax_writeback_one(struct block_device *bdev, * 'entry'. This allows us to flush for PMD_SIZE and not have to * worry about partial PMD writebacks. */ - sector = dax_radix_sector(entry); + pfn = dax_radix_pfn(entry); size = PAGE_SIZE << dax_radix_order(entry); - id = dax_read_lock(); - ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); - if (ret) - goto dax_unlock; - - /* -* dax_direct_access() may sleep, so cannot hold tree_lock over -* its invocation. -*/ - ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn); - if (ret < 0) - goto dax_unlock; - - if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) { - ret = -EIO; - goto dax_unlock; - } - - dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); - dax_flush(dax_dev, kaddr, size); + dax_mapping_entry_mkclean(mapping, index, pfn); + dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size); /* * After we have flushed the cache, we can clear the dirty tag. 
There * cannot be new dirty data in the pfn after the flush has completed as @@ -745,8 +724,6
[PATCH v3 13/13] xfs: wire up FL_ALLOCATED support
Before xfs can be sure that it is safe to truncate it needs to hold XFS_MMAP_LOCK_EXCL and flush any FL_ALLOCATED leases. Introduce xfs_break_allocated() modeled after xfs_break_layouts() for use in the file space deletion path. We also use a new address_space_operation for the fs/dax core to coordinate reaping these leases in the case where there is no active truncate process to reap them. Cc: Jan Kara Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Dave Chinner Cc: "Darrick J. Wong" Cc: Ross Zwisler Cc: Jeff Layton Cc: "J. Bruce Fields" Signed-off-by: Dan Williams --- fs/xfs/xfs_aops.c | 24 fs/xfs/xfs_file.c | 64 fs/xfs/xfs_inode.h |1 + fs/xfs/xfs_ioctl.c |7 ++ 4 files changed, 86 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f18e5932aec4..00da08d0d6db 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1455,6 +1455,29 @@ xfs_vm_set_page_dirty( return newly_dirty; } +/* + * Reap any in-flight FL_ALLOCATE leases when the pages represented by + * that lease are no longer under dma. We hold XFS_MMAPLOCK_EXCL to + * synchronize with the file space deletion path that may be doing the + * same operation. 
+ */ +static void +xfs_vm_dax_flush_dma( + struct inode*inode) +{ + uintiolock = XFS_MMAPLOCK_EXCL; + + /* +* try to catch cases where the inode dax mode was changed +* without first synchronizing leases +*/ + WARN_ON_ONCE(!IS_DAX(inode)); + + xfs_ilock(XFS_I(inode), iolock); + xfs_break_allocated(inode, &iolock); + xfs_iunlock(XFS_I(inode), iolock); +} + const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, .readpages = xfs_vm_readpages, @@ -1468,4 +1491,5 @@ const struct address_space_operations xfs_address_space_operations = { .migratepage= buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, + .dax_flush_dma = xfs_vm_dax_flush_dma, }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c6780743f8ec..5bc72f1da301 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -40,6 +40,7 @@ #include "xfs_iomap.h" #include "xfs_reflink.h" +#include #include #include #include @@ -746,6 +747,39 @@ xfs_file_write_iter( return ret; } +/* + * DAX breaks the traditional truncate model that assumes in-flight DMA + * to a file-backed page can continue until the final put of the page + * regardless of that page's relationship to the file. In the case of + * DAX the page has 1:1 relationship with filesytem blocks. We need to + * hold off truncate while any DMA might be in-flight. This assumes that + * all DMA usage is transient, any non-transient usages of + * get_user_pages must be disallowed for DAX files. + * + * This also unlocks FL_LAYOUT leases. 
+ */ +int +xfs_break_allocated( + struct inode*inode, + uint*iolock) +{ + struct xfs_inode*ip = XFS_I(inode); + int error; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL + | XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)); + + while ((error = break_allocated(inode, false) == -EWOULDBLOCK)) { + xfs_iunlock(ip, *iolock); + error = break_allocated(inode, true); + *iolock &= ~XFS_MMAPLOCK_SHARED|XFS_IOLOCK_SHARED; + *iolock |= XFS_MMAPLOCK_EXCL|XFS_IOLOCK_EXCL; + xfs_ilock(ip, *iolock); + } + + return error; +} + #defineXFS_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ @@ -762,7 +796,7 @@ xfs_file_fallocate( struct xfs_inode*ip = XFS_I(inode); longerror; enum xfs_prealloc_flags flags = 0; - uintiolock = XFS_IOLOCK_EXCL; + uintiolock = XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL; loff_t new_size = 0; booldo_file_insert = 0; @@ -772,13 +806,10 @@ xfs_file_fallocate( return -EOPNOTSUPP; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock); + error = xfs_break_allocated(inode, &iolock); if (error) goto out_unlock; - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - iolock |= XFS_MMAPLOCK_EXCL; - if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) @@ -1136,6 +1167,28 @@ xfs_file_mmap( return 0; }
[PATCH v3 11/13] fs: use smp_load_acquire in break_{layout,lease}
Commit 128a37852234 "fs: fix data races on inode->i_flctx" converted checks of inode->i_flctx to use smp_load_acquire(), but it did not convert break_layout(). smp_load_acquire() includes a READ_ONCE(). There should be no functional difference since __break_lease repeats the sequence, but this is a clean up to unify all ->i_flctx lookups on a common pattern. Cc: Christoph Hellwig Cc: Alexander Viro Cc: Ross Zwisler Cc: Jeff Layton Cc: "J. Bruce Fields" Signed-off-by: Dan Williams --- include/linux/fs.h | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 13dab191a23e..eace2c5396a7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2281,8 +2281,9 @@ static inline int break_lease(struct inode *inode, unsigned int mode) * could end up racing with tasks trying to set a new lease on this * file. */ - smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) + struct file_lock_context *ctx = smp_load_acquire(&inode->i_flctx); + + if (ctx && !list_empty_careful(&ctx->flc_lease)) return __break_lease(inode, mode, FL_LEASE); return 0; } @@ -2325,8 +2326,9 @@ static inline int break_deleg_wait(struct inode **delegated_inode) static inline int break_layout(struct inode *inode, bool wait) { - smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) + struct file_lock_context *ctx = smp_load_acquire(&inode->i_flctx); + + if (ctx && !list_empty_careful(&ctx->flc_lease)) return __break_lease(inode, wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, FL_LAYOUT); ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
[PATCH 1/2] dm log writes: Add support for inline data buffers
Currently dm-log-writes supports writing filesystem data via BIOs, and writing internal metadata from a flat buffer via write_metadata(). For DAX writes, though, we won't have a BIO, but will instead have an iterator that we'll want to use to fill a flat data buffer. So, create write_inline_data() which allows us to write filesystem data using a flat buffer as a source, and wire it up in log_one_block(). Signed-off-by: Ross Zwisler --- drivers/md/dm-log-writes.c | 90 +++--- 1 file changed, 86 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 8b80a9c..c65f9d1 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -246,27 +246,109 @@ static int write_metadata(struct log_writes_c *lc, void *entry, return -1; } +static int write_inline_data(struct log_writes_c *lc, void *entry, + size_t entrylen, void *data, size_t datalen, + sector_t sector) +{ + int num_pages, bio_pages, pg_datalen, pg_sectorlen, i; + struct page *page; + struct bio *bio; + size_t ret; + void *ptr; + + while (datalen) { + num_pages = ALIGN(datalen, PAGE_SIZE) >> PAGE_SHIFT; + bio_pages = min(num_pages, BIO_MAX_PAGES); + + atomic_inc(&lc->io_blocks); + + bio = bio_alloc(GFP_KERNEL, bio_pages); + if (!bio) { + DMERR("Couldn't alloc inline data bio"); + goto error; + } + + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_sector = sector; + bio_set_dev(bio, lc->logdev->bdev); + bio->bi_end_io = log_end_io; + bio->bi_private = lc; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + for (i = 0; i < bio_pages; i++) { + pg_datalen = min(datalen, PAGE_SIZE); + pg_sectorlen = ALIGN(pg_datalen, lc->sectorsize); + + page = alloc_page(GFP_KERNEL); + if (!page) { + DMERR("Couldn't alloc inline data page"); + goto error_bio; + } + + ptr = kmap_atomic(page); + memcpy(ptr, data, pg_datalen); + if (pg_sectorlen > pg_datalen) + memset(ptr + pg_datalen, 0, + pg_sectorlen - pg_datalen); + kunmap_atomic(ptr); + + ret = bio_add_page(bio, page, pg_sectorlen, 
0); + if (ret != pg_sectorlen) { + DMERR("Couldn't add page of inline data"); + __free_page(page); + goto error_bio; + } + + datalen -= pg_datalen; + data+= pg_datalen; + } + submit_bio(bio); + + sector += bio_pages * PAGE_SECTORS; + } + return 0; +error_bio: + bio_free_pages(bio); + bio_put(bio); +error: + put_io_block(lc); + return -1; +} + static int log_one_block(struct log_writes_c *lc, struct pending_block *block, sector_t sector) { struct bio *bio; struct log_write_entry entry; - size_t ret; + size_t metadlen, ret; int i; entry.sector = cpu_to_le64(block->sector); entry.nr_sectors = cpu_to_le64(block->nr_sectors); entry.flags = cpu_to_le64(block->flags); entry.data_len = cpu_to_le64(block->datalen); - if (write_metadata(lc, &entry, sizeof(entry), block->data, - block->datalen, sector)) { + + metadlen = (block->flags & LOG_MARK_FLAG) ? block->datalen : 0; + if (write_metadata(lc, &entry, sizeof(entry), block->data, metadlen, + sector)) { free_pending_block(lc, block); return -1; } + sector += dev_to_bio_sectors(lc, 1); + + if (block->datalen && metadlen == 0) { + if (write_inline_data(lc, &entry, sizeof(entry), block->data, + block->datalen, sector)) { + free_pending_block(lc, block); + return -1; + } + /* we don't support both inline data & bio data */ + goto out; + } + if (!block->vec_cnt) goto out; - sector += dev_to_bio_sectors(lc, 1); atomic_inc(&lc->io_blocks); bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt, BIO_MAX_PAGES)); -- 2.9.5 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailma
[PATCH 2/2] dm log writes: add support for DAX
Now that we have the ability to log filesystem writes using a flat buffer, add support for DAX. Unfortunately we can't easily track data that has been written via mmap() now that the dax_flush() abstraction was removed by this commit: commit c3ca015fab6d ("dax: remove the pmem_dax_ops->flush abstraction") Otherwise we could just treat each flush as a big write, and store the data that is being synced to media. It may be worthwhile to add the dax_flush() entry point back, just as a notifier so we can do this logging. The motivation for this support is the need for an xfstest that can test the new MAP_SYNC DAX flag. By logging the filesystem activity with dm-log-writes we can show that the MAP_SYNC page faults are writing out their metadata as they happen, instead of requiring an explicit msync/fsync. Signed-off-by: Ross Zwisler --- Here's a link to Jan's latest MAP_SYNC set, which can be used for the fstest: https://www.spinics.net/lists/linux-xfs/msg11852.html MAP_SYNC is not needed for basic DAX+dm-log-writes functionality. 
--- drivers/md/dm-log-writes.c | 90 +- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index c65f9d1..6a8d352 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -10,9 +10,11 @@ #include #include #include +#include #include #include #include +#include #define DM_MSG_PREFIX "log-writes" @@ -609,6 +611,50 @@ static int log_mark(struct log_writes_c *lc, char *data) return 0; } +static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes, + struct iov_iter *i) +{ + struct pending_block *block; + + if (!bytes) + return 0; + + block = kzalloc(sizeof(struct pending_block), GFP_KERNEL); + if (!block) { + DMERR("Error allocating dax pending block"); + return -ENOMEM; + } + + block->data = kzalloc(bytes, GFP_KERNEL); + if (!block->data) { + DMERR("Error allocating dax data space"); + kfree(block); + return -ENOMEM; + } + + /* write data provided via the iterator */ + if (!copy_from_iter(block->data, bytes, i)) { + DMERR("Error copying dax data"); + kfree(block->data); + kfree(block); + return -EIO; + } + + /* rewind the iterator so that the block driver can use it */ + iov_iter_revert(i, bytes); + + block->datalen = bytes; + block->sector = bio_to_dev_sectors(lc, sector); + block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift; + + atomic_inc(&lc->pending_blocks); + spin_lock_irq(&lc->blocks_lock); + list_add_tail(&block->list, &lc->unflushed_blocks); + spin_unlock_irq(&lc->blocks_lock); + wake_up_process(lc->log_kthread); + return 0; +} + static void log_writes_dtr(struct dm_target *ti) { struct log_writes_c *lc = ti->private; @@ -874,9 +920,49 @@ static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limit limits->io_min = limits->physical_block_size; } +static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) +{ + struct log_writes_c *lc = ti->private; + struct 
block_device *bdev = lc->dev->bdev; + struct dax_device *dax_dev = lc->dev->dax_dev; + sector_t sector = pgoff * PAGE_SECTORS; + int ret; + + ret = bdev_dax_pgoff(bdev, sector, nr_pages * PAGE_SIZE, &pgoff); + if (ret) + return ret; + return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); +} + +static size_t log_writes_dax_copy_from_iter(struct dm_target *ti, + pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) +{ + struct log_writes_c *lc = ti->private; + struct block_device *bdev = lc->dev->bdev; + struct dax_device *dax_dev = lc->dev->dax_dev; + sector_t sector = pgoff * PAGE_SECTORS; + int err; + + if (bdev_dax_pgoff(bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) + return 0; + + /* Don't bother doing anything if logging has been disabled */ + if (!lc->logging_enabled) + goto dax_copy; + + err = log_dax(lc, sector, bytes, i); + if (err) { + DMWARN("Error %d logging DAX write", err); + return 0; + } +dax_copy: + return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); +} + static struct target_type log_writes_target = { .name = "log-writes", - .version = {1, 0, 0}, + .version = {1, 0, 1}, .module = THIS_MODULE, .ctr= log_writes_ctr, .dtr= log_writes_dtr, @@ -887,6 +973,8 @@ static struct target_type log_writes_target = { .message = log_writes_messag
[fstests PATCH] generic: add test for DAX MAP_SYNC support
Add a test that exercises DAX's new MAP_SYNC flag. This test creates a file and writes to it via an mmap(), but never syncs via fsync/msync. This process is tracked via dm-log-writes, then replayed. If MAP_SYNC is working the dm-log-writes replay will show the test file with the same size that we wrote via the mmap() because each allocating page fault included an implicit metadata sync. If MAP_SYNC isn't working (which you can test by fiddling with the parameters to mmap()) the file will be smaller or missing entirely. Note that dm-log-writes doesn't track the data that we write via the mmap(), so we can't do any data integrity checking. We can only verify that the metadata writes for the page faults happened. Signed-off-by: Ross Zwisler --- For this test to run successfully you'll need both Jan's MAP_SYNC series: https://www.spinics.net/lists/linux-xfs/msg11852.html and my series adding DAX support to dm-log-writes: https://lists.01.org/pipermail/linux-nvdimm/2017-October/012972.html --- .gitignore| 1 + common/dmlogwrites| 1 - src/Makefile | 3 +- src/t_map_sync.c | 74 + tests/generic/466 | 77 +++ tests/generic/466.out | 3 ++ tests/generic/group | 1 + 7 files changed, 158 insertions(+), 2 deletions(-) create mode 100644 src/t_map_sync.c create mode 100755 tests/generic/466 create mode 100644 tests/generic/466.out diff --git a/.gitignore b/.gitignore index 2014c08..9fc0695 100644 --- a/.gitignore +++ b/.gitignore @@ -119,6 +119,7 @@ /src/t_getcwd /src/t_holes /src/t_immutable +/src/t_map_sync /src/t_mmap_cow_race /src/t_mmap_dio /src/t_mmap_fallocate diff --git a/common/dmlogwrites b/common/dmlogwrites index 247c744..5b57df9 100644 --- a/common/dmlogwrites +++ b/common/dmlogwrites @@ -23,7 +23,6 @@ _require_log_writes() [ -z "$LOGWRITES_DEV" -o ! 
-b "$LOGWRITES_DEV" ] && \ _notrun "This test requires a valid \$LOGWRITES_DEV" - _exclude_scratch_mount_option dax _require_dm_target log-writes _require_test_program "log-writes/replay-log" } diff --git a/src/Makefile b/src/Makefile index 3eb25b1..af7e7e9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -13,7 +13,8 @@ TARGETS = dirstress fill fill2 getpagesize holes lstat64 \ multi_open_unlink dmiperf unwritten_sync genhashnames t_holes \ t_mmap_writev t_truncate_cmtime dirhash_collide t_rename_overwrite \ holetest t_truncate_self t_mmap_dio af_unix t_mmap_stale_pmd \ - t_mmap_cow_race t_mmap_fallocate fsync-err t_mmap_write_ro + t_mmap_cow_race t_mmap_fallocate fsync-err t_mmap_write_ro \ + t_map_sync LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \ preallo_rw_pattern_writer ftrunc trunc fs_perms testx looptest \ diff --git a/src/t_map_sync.c b/src/t_map_sync.c new file mode 100644 index 000..8190f3c --- /dev/null +++ b/src/t_map_sync.c @@ -0,0 +1,74 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MiB(a) ((a)*1024*1024) + +/* + * These two defines were added to the kernel via commits entitled + * "mm: Define MAP_SYNC and VM_SYNC flags" and + * "mm: introduce MAP_SHARED_VALIDATE, a mechanism to safely define new mmap + * flags", respectively. 
+ */ +#define MAP_SYNC 0x8 +#define MAP_SHARED_VALIDATE 0x3 + +void err_exit(char *op) +{ + fprintf(stderr, "%s: %s\n", op, strerror(errno)); + exit(1); +} + +int main(int argc, char *argv[]) +{ + int page_size = getpagesize(); + int len = MiB(1); + int i, fd, err; + char *data; + + if (argc < 2) { + printf("Usage: %s \n", basename(argv[0])); + exit(0); + } + + fd = open(argv[1], O_RDWR|O_CREAT, S_IRUSR|S_IWUSR); + if (fd < 0) + err_exit("fd"); + + ftruncate(fd, 0); + ftruncate(fd, len); + + data = mmap(NULL, len, PROT_READ|PROT_WRITE, + MAP_SHARED_VALIDATE|MAP_SYNC, fd, 0); + if (data == MAP_FAILED) + err_exit("mmap"); + + /* +* We intentionally don't sync 'fd' manually. If MAP_SYNC is working +* these allocating page faults will cause the filesystem to sync its +* metadata so that when we replay the dm-log-writes log the test file +* will be 1 MiB in size. +* +* dm-log-writes doesn't track the data that we write via the mmap(), +* so we can't check that, we can only verify that the metadata writes +* happened. +*/ + for (i = 0; i < len; i+=page_size) + data[i] = 0xff; + + err = munmap(data, len); + if (err < 0) + err_exit("munmap"); + + err = close(fd); + if (err < 0) + err_exit("close"); + + return 0; +} diff --git a/tests/generic/466 b/tests/ge
转发:如何解决销*售【常见问题】
如何解决【常见问题】 l 业绩压力大,但要提升业绩又觉得无从下手,怎么办? l 报价后客户就没有回应了,怎么办? l 客户拿着我们的报价与别人比,说我们的价格贵,怎么办? l 与客户联络一段时间了,但客户一直不下单,怎么办? l 提供样品给客户测试后,客户就不理我们了,怎么办? l 开发陌生客户时,有没有好的方法减少电话或邮件的拒绝率? l 收到客户的询盘后,认真分析过客户的需求并回复了,但客户反馈的很少,怎么办? l 如何把老客户的满意度转化为销量? 详细 内容 参阅 附件 大纲 杭瘩 2017-10-20/14:08:34 ___ Linux-nvdimm mailing list Linux-nvdimm@lists.01.org https://lists.01.org/mailman/listinfo/linux-nvdimm
Re: [fstests PATCH] generic: add test for DAX MAP_SYNC support
On Fri, Oct 20, 2017 at 8:29 AM, Ross Zwisler wrote: > Add a test that exercises DAX's new MAP_SYNC flag. > > This test creates a file and writes to it via an mmap(), but never syncs > via fsync/msync. This process is tracked via dm-log-writes, then replayed. > > If MAP_SYNC is working the dm-log-writes replay will show the test file > with the same size that we wrote via the mmap() because each allocating > page fault included an implicit metadata sync. If MAP_SYNC isn't working > (which you can test by fiddling with the parameters to mmap()) the file > will be smaller or missing entirely. > > Note that dm-log-writes doesn't track the data that we write via the > mmap(), so we can't do any data integrity checking. We can only verify > that the metadata writes for the page faults happened. > > Signed-off-by: Ross Zwisler Looks good. some nit picking... > --- > > For this test to run successfully you'll need both Jan's MAP_SYNC series: > > https://www.spinics.net/lists/linux-xfs/msg11852.html > > and my series adding DAX support to dm-log-writes: > > https://lists.01.org/pipermail/linux-nvdimm/2017-October/012972.html > > --- > .gitignore| 1 + > common/dmlogwrites| 1 - > src/Makefile | 3 +- > src/t_map_sync.c | 74 + > tests/generic/466 | 77 > +++ > tests/generic/466.out | 3 ++ > tests/generic/group | 1 + > 7 files changed, 158 insertions(+), 2 deletions(-) > create mode 100644 src/t_map_sync.c > create mode 100755 tests/generic/466 > create mode 100644 tests/generic/466.out > > diff --git a/.gitignore b/.gitignore > index 2014c08..9fc0695 100644 > --- a/.gitignore > +++ b/.gitignore > @@ -119,6 +119,7 @@ > /src/t_getcwd > /src/t_holes > /src/t_immutable > +/src/t_map_sync > /src/t_mmap_cow_race > /src/t_mmap_dio > /src/t_mmap_fallocate > diff --git a/common/dmlogwrites b/common/dmlogwrites > index 247c744..5b57df9 100644 > --- a/common/dmlogwrites > +++ b/common/dmlogwrites > @@ -23,7 +23,6 @@ _require_log_writes() > [ -z "$LOGWRITES_DEV" -o ! 
-b "$LOGWRITES_DEV" ] && \ > _notrun "This test requires a valid \$LOGWRITES_DEV" > > - _exclude_scratch_mount_option dax > _require_dm_target log-writes > _require_test_program "log-writes/replay-log" > } > diff --git a/src/Makefile b/src/Makefile > index 3eb25b1..af7e7e9 100644 > --- a/src/Makefile > +++ b/src/Makefile > @@ -13,7 +13,8 @@ TARGETS = dirstress fill fill2 getpagesize holes lstat64 \ > multi_open_unlink dmiperf unwritten_sync genhashnames t_holes \ > t_mmap_writev t_truncate_cmtime dirhash_collide t_rename_overwrite \ > holetest t_truncate_self t_mmap_dio af_unix t_mmap_stale_pmd \ > - t_mmap_cow_race t_mmap_fallocate fsync-err t_mmap_write_ro > + t_mmap_cow_race t_mmap_fallocate fsync-err t_mmap_write_ro \ > + t_map_sync > > LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \ > preallo_rw_pattern_writer ftrunc trunc fs_perms testx looptest \ > diff --git a/src/t_map_sync.c b/src/t_map_sync.c > new file mode 100644 > index 000..8190f3c > --- /dev/null > +++ b/src/t_map_sync.c > @@ -0,0 +1,74 @@ > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#define MiB(a) ((a)*1024*1024) > + > +/* > + * These two defines were added to the kernel via commits entitled > + * "mm: Define MAP_SYNC and VM_SYNC flags" and > + * "mm: introduce MAP_SHARED_VALIDATE, a mechanism to safely define new mmap > + * flags", respectively. #ifndef? > + */ > +#define MAP_SYNC 0x8 > +#define MAP_SHARED_VALIDATE 0x3 > + > +void err_exit(char *op) > +{ > + fprintf(stderr, "%s: %s\n", op, strerror(errno)); > + exit(1); > +} > + > +int main(int argc, char *argv[]) > +{ > + int page_size = getpagesize(); > + int len = MiB(1); > + int i, fd, err; > + char *data; > + > + if (argc < 2) { > + printf("Usage: %s \n", basename(argv[0])); > + exit(0); > + } > + > + fd = open(argv[1], O_RDWR|O_CREAT, S_IRUSR|S_IWUSR); > + if (fd < 0) > + err_exit("fd"); > + > + ftruncate(fd, 0); O_TRUNC? 
> + ftruncate(fd, len); > + > + data = mmap(NULL, len, PROT_READ|PROT_WRITE, > + MAP_SHARED_VALIDATE|MAP_SYNC, fd, 0); > + if (data == MAP_FAILED) > + err_exit("mmap"); > + > + /* > +* We intentionally don't sync 'fd' manually. If MAP_SYNC is working > +* these allocating page faults will cause the filesystem to sync its > +* metadata so that when we replay the dm-log-writes log the test file > +* will be 1 MiB in size. > +* > +* dm-log-writes doesn't track the data that we write via the mmap(), > +* so we can't check that, we can o