On Mon, 2015-11-23 at 12:53 -0800, Dan Williams wrote: > On Mon, Nov 23, 2015 at 12:04 PM, Toshi Kani <[email protected]> wrote: > > The following oops was observed when mmap() with MAP_POPULATE > > pre-faulted pmd mappings of a DAX file. follow_trans_huge_pmd() > > expects that a target address has a struct page. > > > > BUG: unable to handle kernel paging request at ffffea0012220000 > > follow_trans_huge_pmd+0xba/0x390 > > follow_page_mask+0x33d/0x420 > > __get_user_pages+0xdc/0x800 > > populate_vma_page_range+0xb5/0xe0 > > __mm_populate+0xc5/0x150 > > vm_mmap_pgoff+0xd5/0xe0 > > SyS_mmap_pgoff+0x1c1/0x290 > > SyS_mmap+0x1b/0x30 > > > > Fix it by making the PMD pre-fault handling consistent with PTE. > > After pre-faulted in faultin_page(), follow_page_mask() calls > > follow_trans_huge_pmd(), which is changed to call follow_pfn_pmd() > > for VM_PFNMAP or VM_MIXEDMAP. follow_pfn_pmd() handles FOLL_TOUCH > > and returns with -EEXIST. > > As of 4.4.-rc2 DAX pmd mappings are disabled. So we have time to do > something more comprehensive in 4.5.
Yes, I noticed during my testing that I could not use pmd... > > Reported-by: Mauricio Porto <[email protected]> > > Signed-off-by: Toshi Kani <[email protected]> > > Cc: Andrew Morton <[email protected]> > > Cc: Kirill A. Shutemov <[email protected]> > > Cc: Matthew Wilcox <[email protected]> > > Cc: Dan Williams <[email protected]> > > Cc: Ross Zwisler <[email protected]> > > --- > > mm/huge_memory.c | 34 ++++++++++++++++++++++++++++++++++ > > 1 file changed, 34 insertions(+) > > > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > > index d5b8920..f56e034 100644 > > --- a/mm/huge_memory.c > > +++ b/mm/huge_memory.c > [..] > > @@ -1288,6 +1315,13 @@ struct page *follow_trans_huge_pmd(struct > > vm_area_struct *vma, > > if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) > > goto out; > > > > + /* pfn map does not have a struct page */ > > + if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) { > > + ret = follow_pfn_pmd(vma, addr, pmd, flags); > > + page = ERR_PTR(ret); > > + goto out; > > + } > > + > > page = pmd_page(*pmd); > > VM_BUG_ON_PAGE(!PageHead(page), page); > > if (flags & FOLL_TOUCH) { > > I think it is already problematic that dax pmd mappings are getting > confused with transparent huge pages. We had the same issue with dax pte mapping [1], and this change extends the pfn map handling to pmd. So, this problem is not specific to pmd. [1] https://lkml.org/lkml/2015/6/23/181 > They're more closely related to > a hugetlbfs pmd mappings in that they are mapping an explicit > allocation. I have some pending patches to address this dax-pmd vs > hugetlb-pmd vs thp-pmd classification that I will post shortly. Not sure which way is better, but I am certainly interested in your changes. > By the way, I'm collecting DAX pmd regression tests [1], is this just > a simple crash upon using MAP_POPULATE? > > [1]: https://github.com/pmem/ndctl/blob/master/lib/test-dax-pmd.c Yes, this issue is easy to reproduce with MAP_POPULATE. In case it helps, attached are the test I used for testing the patches. Sorry, the code is messy since it was only intended for my internal use... - The test was originally written for the pte change [1] and comments in test.sh (ex. mlock fail, ok) reflect the results without the pte change. - For the pmd test, I modified test-mmap.c to call posix_memalign() before mmap(). By calling free(), the 2MB-aligned address from posix_memalign() can be used for mmap(). This keeps the mmap'd address aligned on 2MB. - I created test file(s) with dd (i.e. all blocks written) in my test. - The other infinite loop issue (fixed by my other patch) was found by the test case with option "-LMSr". Thanks, -Toshi
test.sh
Description: application/shellscript
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <string.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#define MB(a) ((a) * 1024UL * 1024UL)
static struct timeval start_tv, stop_tv;
// Calculate the difference between two time values.
void tvsub(struct timeval *tdiff, struct timeval *t1, struct timeval *t0)
{
tdiff->tv_sec = t1->tv_sec - t0->tv_sec;
tdiff->tv_usec = t1->tv_usec - t0->tv_usec;
if (tdiff->tv_usec < 0)
tdiff->tv_sec--, tdiff->tv_usec += 1000000;
}
// Start timing now.
void start()
{
(void) gettimeofday(&start_tv, (struct timezone *) 0);
}
// Stop timing and return real time in microseconds.
unsigned long long stop()
{
struct timeval tdiff;
(void) gettimeofday(&stop_tv, (struct timezone *) 0);
tvsub(&tdiff, &stop_tv, &start_tv);
return (tdiff.tv_sec * 1000000 + tdiff.tv_usec);
}
void test_write(unsigned long *p, size_t size)
{
int i;
unsigned long *wp, tmp;
unsigned long long timeval;
start();
for (i=0, wp=p; i<(size/sizeof(wp)); i++)
*wp++ = 1;
timeval = stop();
printf("Write: %10llu usec\n", timeval);
}
void test_read(unsigned long *p, size_t size)
{
int i;
unsigned long *wp, tmp;
unsigned long long timeval;
start();
for (i=0, wp=p; i<(size/sizeof(wp)); i++)
tmp = *wp++;
timeval = stop();
printf("Read : %10llu usec\n", timeval);
}
int main(int argc, char **argv)
{
int fd, i, opt, ret;
int oflags, mprot, mflags = 0;
int is_read_only = 0, is_mlock = 0, is_mlockall = 0;
int mlock_skip = 0, read_test = 0, write_test = 0;
void *mptr = NULL;
unsigned long *p;
struct stat stat;
size_t size, cpy_size;
const char *file_name = NULL;
while ((opt = getopt(argc, argv, "LRMSApsrw")) != -1) {
switch (opt) {
case 'L':
file_name = "/mnt/pmem0/4Gfile";
break;
case 'R':
printf("> mmap: read-only\n");
is_read_only = 1;
break;
case 'M':
printf("> mlock\n");
is_mlock = 1;
break;
case 'S':
printf("> mlock - skip first ite\n");
mlock_skip = 1;
break;
case 'A':
printf("> mlockall\n");
is_mlockall = 1;
break;
case 'p':
printf("> MAP_POPULATE\n");
mflags |= MAP_POPULATE;
break;
case 's':
printf("> MAP_SHARED\n");
mflags |= MAP_SHARED;
break;
case 'r':
printf("> read-test\n");
read_test = 1;
break;
case 'w':
printf("> write-test\n");
write_test = 1;
break;
}
}
if (!file_name) {
file_name = "/mnt/pmem1/32Kfile";
}
if (!(mflags & MAP_SHARED)) {
printf("> MAP_PRIVATE\n");
mflags |= MAP_PRIVATE;
}
if (is_read_only) {
oflags = O_RDONLY;
mprot = PROT_READ;
} else {
oflags = O_RDWR;
mprot = PROT_READ|PROT_WRITE;
}
fd = open(file_name, oflags);
if (fd == -1) {
perror("open failed");
exit(1);
}
ret = fstat(fd, &stat);
if (ret < 0) {
perror("fstat failed");
exit(1);
}
size = stat.st_size;
printf("> open %s size 0x%x flags 0x%x\n", file_name, size, oflags);
ret = posix_memalign(&mptr, MB(2), size);
if (ret ==0)
free(mptr);
printf("> mmap mprot 0x%x flags 0x%x\n", mprot, mflags);
p = mmap(mptr, size, mprot, mflags, fd, 0x0);
if (!p) {
perror("mmap failed");
exit(1);
}
if ((long unsigned)p & (MB(2)-1))
printf("> mmap: NOT 2MB aligned: 0x%p\n", p);
else
printf("> mmap: 2MB aligned: 0x%p\n", p);
#if 0 /* SIZE LIMIT */
if (size >= MB(2))
cpy_size = MB(32);
else
#endif
cpy_size = size;
for (i=0; i<3; i++) {
if (is_mlock && !mlock_skip) {
printf("> mlock 0x%p\n", p);
ret = mlock(p, size);
if (ret < 0) {
perror("mlock failed");
exit(1);
}
} else if (is_mlockall) {
printf("> mlockall\n");
ret = mlockall(MCL_CURRENT|MCL_FUTURE);
if (ret < 0) {
perror("mlockall failed");
exit(1);
}
}
printf("===== %d =====\n", i+1);
if (write_test)
test_write(p, cpy_size);
if (read_test)
test_read(p, cpy_size);
if (is_mlock && !mlock_skip) {
printf("> munlock 0x%p\n", p);
ret = munlock(p, size);
if (ret < 0) {
perror("munlock failed");
exit(1);
}
} else if (is_mlockall) {
printf("> munlockall\n");
ret = munlockall();
if (ret < 0) {
perror("munlockall failed");
exit(1);
}
}
/* skip, if requested, only the first iteration */
mlock_skip = 0;
}
printf("> munmap 0x%p\n", p);
munmap(p, size);
}

