Re: [PATCH v5 4/5] cramfs: add mmap support

2017-10-06 Thread Nicolas Pitre
On Fri, 6 Oct 2017, Christoph Hellwig wrote:

> > +   /* Don't map the last page if it contains some other data */
> > +   if (unlikely(pgoff + pages == max_pages)) {
> > +   unsigned int partial = offset_in_page(inode->i_size);
> > +   if (partial) {
> > +   char *data = sbi->linear_virt_addr + offset;
> > +   data += (max_pages - 1) * PAGE_SIZE + partial;
> > +   if (memchr_inv(data, 0, PAGE_SIZE - partial) != NULL) {
> > +   pr_debug("mmap: %s: last page is shared\n",
> > +file_dentry(file)->d_name.name);
> > +   pages--;
> > +   }
> > +   }
> > +   }
> 
> Why is pgoff + pages == max_pages marked unlikely?  Mapping the whole
> file seems like a perfectly normal and likely case to me..

In practice it is not. This is typically used for executables where a 
mmap() call covers the executable and read-only segments located at the 
beginning of a file. The end of the file is covered by a r/w mapping 
where .data is normally located and we can't direct-mmap that.

Whatever. I have no strong opinion here as this is hardly 
performance critical so it is removed.

> Also if this was my code I'd really prefer to move this into a helper:
> 
> static bool cramfs_mmap_last_page_is_shared(struct inode *inode, int offset)
> {
>   unsigned int partial = offset_in_page(inode->i_size);
>   char *data = CRAMFS_SB(inode->i_sb)->linear_virt_addr + offset +
>   (inode->i_size & PAGE_MASK);
> 
>   return memchr_inv(data + partial, 0, PAGE_SIZE - partial);
> }
> 
>   if (pgoff + pages == max_pages && offset_in_page(inode->i_size) &&
>   cramfs_mmap_last_page_is_shared(inode, offset))
>   pages--;
> 
> as that's much more readable and the function name provides a good
> documentation of what is going on.

OK.  The above got some details wrong but the idea is clear. Please see 
the new patch below.

> > +   if (pages != vma_pages(vma)) {
> 
> here is how I would turn this around:
> 
>   if (!pages)
>   goto done;
> 
>   if (pages == vma_pages(vma)) {
>   remap_pfn_range();
>   goto done;
>   }
> 
>   ...
>   for (i = 0; i < pages; i++) {
>   ...
>   vm_insert_mixed();
>   nr_mapped++;
>   }
> 
> 
> done:
>   pr_debug("mapped %d out of %d\n", ..);
>   if (pages != vma_pages(vma))
>   vma->vm_ops = _file_vm_ops;
>   return 0;
> }
> 
> In fact we probably could just set the vm_ops unconditionally, they
> just wouldn't be called, but that might be more confusing than helpful.

Well... For one thing generic_file_vm_ops is not exported to modules so 
now I simply call generic_file_readonly_mmap() at the beginning and let 
it validate the vma flags. That allowed for some simplifications at the 
end.

Personally I think the if-else form is clearer over an additional goto 
label, but I moved the more common case first as you did above. At this 
point I hope you'll indulge me.

Here's the latest version of patch 4/5:

diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 6aa1d94ed8..e1b9192f23 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -15,7 +15,10 @@
 
 #include 
 #include 
+#include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -49,6 +52,7 @@ static inline struct cramfs_sb_info *CRAMFS_SB(struct 
super_block *sb)
 static const struct super_operations cramfs_ops;
 static const struct inode_operations cramfs_dir_inode_operations;
 static const struct file_operations cramfs_directory_operations;
+static const struct file_operations cramfs_physmem_fops;
 static const struct address_space_operations cramfs_aops;
 
 static DEFINE_MUTEX(read_mutex);
@@ -96,6 +100,10 @@ static struct inode *get_cramfs_inode(struct super_block 
*sb,
case S_IFREG:
inode->i_fop = _ro_fops;
inode->i_data.a_ops = _aops;
+   if (IS_ENABLED(CONFIG_CRAMFS_PHYSMEM) &&
+   CRAMFS_SB(sb)->flags & CRAMFS_FLAG_EXT_BLOCK_POINTERS &&
+   CRAMFS_SB(sb)->linear_phys_addr)
+   inode->i_fop = _physmem_fops;
break;
case S_IFDIR:
inode->i_op = _dir_inode_operations;
@@ -277,6 +285,207 @@ static void *cramfs_read(struct super_block *sb, unsigned 
int offset,
return NULL;
 }
 
+/*
+ * For a mapping to be possible, we need a range of uncompressed and
+ * contiguous blocks. Return the offset for the first block and number of
+ * valid blocks for which that is true, or zero otherwise.
+ */
+static u32 cramfs_get_block_range(struct inode *inode, u32 pgoff, u32 *pages)
+{
+   struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb);
+   int i;
+   u32 *blockptrs, first_block_addr;
+
+   /*
+* We can dereference memory directly 

Re: [PATCH v5 4/5] cramfs: add mmap support

2017-10-06 Thread Nicolas Pitre
On Fri, 6 Oct 2017, Christoph Hellwig wrote:

> > +   /* Don't map the last page if it contains some other data */
> > +   if (unlikely(pgoff + pages == max_pages)) {
> > +   unsigned int partial = offset_in_page(inode->i_size);
> > +   if (partial) {
> > +   char *data = sbi->linear_virt_addr + offset;
> > +   data += (max_pages - 1) * PAGE_SIZE + partial;
> > +   if (memchr_inv(data, 0, PAGE_SIZE - partial) != NULL) {
> > +   pr_debug("mmap: %s: last page is shared\n",
> > +file_dentry(file)->d_name.name);
> > +   pages--;
> > +   }
> > +   }
> > +   }
> 
> Why is pgoff + pages == max_pages marked unlikely?  Mapping the whole
> file seems like a perfectly normal and likely case to me..

In practice it is not. This is typically used for executables where a 
mmap() call covers the executable and read-only segments located at the 
beginning of a file. The end of the file is covered by a r/w mapping 
where .data is normally located and we can't direct-mmap that.

Whatever. I have no strong opinion here as this is hardly 
performance critical so it is removed.

> Also if this was my code I'd really prefer to move this into a helper:
> 
> static bool cramfs_mmap_last_page_is_shared(struct inode *inode, int offset)
> {
>   unsigned int partial = offset_in_page(inode->i_size);
>   char *data = CRAMFS_SB(inode->i_sb)->linear_virt_addr + offset +
>   (inode->i_size & PAGE_MASK);
> 
>   return memchr_inv(data + partial, 0, PAGE_SIZE - partial);
> }
> 
>   if (pgoff + pages == max_pages && offset_in_page(inode->i_size) &&
>   cramfs_mmap_last_page_is_shared(inode, offset))
>   pages--;
> 
> as that's much more readable and the function name provides a good
> documentation of what is going on.

OK.  The above got some details wrong but the idea is clear. Please see 
the new patch below.

> > +   if (pages != vma_pages(vma)) {
> 
> here is how I would turn this around:
> 
>   if (!pages)
>   goto done;
> 
>   if (pages == vma_pages(vma)) {
>   remap_pfn_range();
>   goto done;
>   }
> 
>   ...
>   for (i = 0; i < pages; i++) {
>   ...
>   vm_insert_mixed();
>   nr_mapped++;
>   }
> 
> 
> done:
>   pr_debug("mapped %d out of %d\n", ..);
>   if (pages != vma_pages(vma))
>   vma->vm_ops = _file_vm_ops;
>   return 0;
> }
> 
> In fact we probably could just set the vm_ops unconditionally, they
> just wouldn't be called, but that might be more confusing than helpful.

Well... For one thing generic_file_vm_ops is not exported to modules so 
now I simply call generic_file_readonly_mmap() at the beginning and let 
it validate the vma flags. That allowed for some simplifications at the 
end.

Personally I think the if-else form is clearer over an additional goto 
label, but I moved the more common case first as you did above. At this 
point I hope you'll indulge me.

Here's the latest version of patch 4/5:

diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 6aa1d94ed8..e1b9192f23 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -15,7 +15,10 @@
 
 #include 
 #include 
+#include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -49,6 +52,7 @@ static inline struct cramfs_sb_info *CRAMFS_SB(struct 
super_block *sb)
 static const struct super_operations cramfs_ops;
 static const struct inode_operations cramfs_dir_inode_operations;
 static const struct file_operations cramfs_directory_operations;
+static const struct file_operations cramfs_physmem_fops;
 static const struct address_space_operations cramfs_aops;
 
 static DEFINE_MUTEX(read_mutex);
@@ -96,6 +100,10 @@ static struct inode *get_cramfs_inode(struct super_block 
*sb,
case S_IFREG:
inode->i_fop = _ro_fops;
inode->i_data.a_ops = _aops;
+   if (IS_ENABLED(CONFIG_CRAMFS_PHYSMEM) &&
+   CRAMFS_SB(sb)->flags & CRAMFS_FLAG_EXT_BLOCK_POINTERS &&
+   CRAMFS_SB(sb)->linear_phys_addr)
+   inode->i_fop = _physmem_fops;
break;
case S_IFDIR:
inode->i_op = _dir_inode_operations;
@@ -277,6 +285,207 @@ static void *cramfs_read(struct super_block *sb, unsigned 
int offset,
return NULL;
 }
 
+/*
+ * For a mapping to be possible, we need a range of uncompressed and
+ * contiguous blocks. Return the offset for the first block and number of
+ * valid blocks for which that is true, or zero otherwise.
+ */
+static u32 cramfs_get_block_range(struct inode *inode, u32 pgoff, u32 *pages)
+{
+   struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb);
+   int i;
+   u32 *blockptrs, first_block_addr;
+
+   /*
+* We can dereference memory directly 

Re: [PATCH v5 4/5] cramfs: add mmap support

2017-10-06 Thread Christoph Hellwig
> + /* Don't map the last page if it contains some other data */
> + if (unlikely(pgoff + pages == max_pages)) {
> + unsigned int partial = offset_in_page(inode->i_size);
> + if (partial) {
> + char *data = sbi->linear_virt_addr + offset;
> + data += (max_pages - 1) * PAGE_SIZE + partial;
> + if (memchr_inv(data, 0, PAGE_SIZE - partial) != NULL) {
> + pr_debug("mmap: %s: last page is shared\n",
> +  file_dentry(file)->d_name.name);
> + pages--;
> + }
> + }
> + }

Why is pgoff + pages == max_pages marked unlikely?  Mapping the whole
file seems like a perfectly normal and likely case to me..

Also if this was my code I'd really prefer to move this into a helper:

static bool cramfs_mmap_last_page_is_shared(struct inode *inode, int offset)
{
unsigned int partial = offset_in_page(inode->i_size);
char *data = CRAMFS_SB(inode->i_sb)->linear_virt_addr + offset +
(inode->i_size & PAGE_MASK);

return memchr_inv(data + partial, 0, PAGE_SIZE - partial);
}

if (pgoff + pages == max_pages && offset_in_page(inode->i_size) &&
cramfs_mmap_last_page_is_shared(inode, offset))
pages--;

as that's much more readable and the function name provides a good
documentation of what is going on.

> + if (pages != vma_pages(vma)) {

here is how I would turn this around:

if (!pages)
goto done;

if (pages == vma_pages(vma)) {
remap_pfn_range();
goto done;
}

...
for (i = 0; i < pages; i++) {
...
vm_insert_mixed();
nr_mapped++;
}


done:
pr_debug("mapped %d out of %d\n", ..);
if (pages != vma_pages(vma))
vma->vm_ops = _file_vm_ops;
return 0;
}

In fact we probably could just set the vm_ops unconditionally, they
just wouldn't be called, but that might be more confusing than helpful.


Re: [PATCH v5 4/5] cramfs: add mmap support

2017-10-06 Thread Christoph Hellwig
> + /* Don't map the last page if it contains some other data */
> + if (unlikely(pgoff + pages == max_pages)) {
> + unsigned int partial = offset_in_page(inode->i_size);
> + if (partial) {
> + char *data = sbi->linear_virt_addr + offset;
> + data += (max_pages - 1) * PAGE_SIZE + partial;
> + if (memchr_inv(data, 0, PAGE_SIZE - partial) != NULL) {
> + pr_debug("mmap: %s: last page is shared\n",
> +  file_dentry(file)->d_name.name);
> + pages--;
> + }
> + }
> + }

Why is pgoff + pages == max_pages marked unlikely?  Mapping the whole
file seems like a perfectly normal and likely case to me..

Also if this was my code I'd really prefer to move this into a helper:

static bool cramfs_mmap_last_page_is_shared(struct inode *inode, int offset)
{
unsigned int partial = offset_in_page(inode->i_size);
char *data = CRAMFS_SB(inode->i_sb)->linear_virt_addr + offset +
(inode->i_size & PAGE_MASK);

return memchr_inv(data + partial, 0, PAGE_SIZE - partial);
}

if (pgoff + pages == max_pages && offset_in_page(inode->i_size) &&
cramfs_mmap_last_page_is_shared(inode, offset))
pages--;

as that's much more readable and the function name provides a good
documentation of what is going on.

> + if (pages != vma_pages(vma)) {

here is how I would turn this around:

if (!pages)
goto done;

if (pages == vma_pages(vma)) {
remap_pfn_range();
goto done;
}

...
for (i = 0; i < pages; i++) {
...
vm_insert_mixed();
nr_mapped++;
}


done:
pr_debug("mapped %d out of %d\n", ..);
if (pages != vma_pages(vma))
vma->vm_ops = _file_vm_ops;
return 0;
}

In fact we probably could just set the vm_ops unconditionally, they
just wouldn't be called, but that might be more confusing than helpful.


[PATCH v5 4/5] cramfs: add mmap support

2017-10-05 Thread Nicolas Pitre
When cramfs_physmem is used then we have the opportunity to map files
directly from ROM, directly into user space, saving on RAM usage.
This gives us Execute-In-Place (XIP) support.

For a file to be mmap()-able, the map area has to correspond to a range
of uncompressed and contiguous blocks, and in the MMU case it also has
to be page aligned. A version of mkcramfs with appropriate support is
necessary to create such a filesystem image.

In the MMU case it may happen for a vma structure to extend beyond the
actual file size. This is notably the case in binfmt_elf.c:elf_map().
Or the file's last block is shared with other files and cannot be mapped
as is. Rather than refusing to mmap it, we do a "mixed" map and let the
regular fault handler populate the unmapped area with RAM-backed pages.
In practice the unmapped area is seldom accessed so page faults might
never occur before this area is discarded.

In the non-MMU case it is the get_unmapped_area method that is responsible
for providing the address where the actual data can be found. No mapping
is necessary of course.

Signed-off-by: Nicolas Pitre 
Tested-by: Chris Brandt 
---
 fs/cramfs/inode.c | 194 ++
 1 file changed, 194 insertions(+)

diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 6aa1d94ed8..071ce1eb58 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -15,7 +15,10 @@
 
 #include 
 #include 
+#include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -49,6 +52,7 @@ static inline struct cramfs_sb_info *CRAMFS_SB(struct 
super_block *sb)
 static const struct super_operations cramfs_ops;
 static const struct inode_operations cramfs_dir_inode_operations;
 static const struct file_operations cramfs_directory_operations;
+static const struct file_operations cramfs_physmem_fops;
 static const struct address_space_operations cramfs_aops;
 
 static DEFINE_MUTEX(read_mutex);
@@ -96,6 +100,10 @@ static struct inode *get_cramfs_inode(struct super_block 
*sb,
case S_IFREG:
inode->i_fop = _ro_fops;
inode->i_data.a_ops = _aops;
+   if (IS_ENABLED(CONFIG_CRAMFS_PHYSMEM) &&
+   CRAMFS_SB(sb)->flags & CRAMFS_FLAG_EXT_BLOCK_POINTERS &&
+   CRAMFS_SB(sb)->linear_phys_addr)
+   inode->i_fop = _physmem_fops;
break;
case S_IFDIR:
inode->i_op = _dir_inode_operations;
@@ -277,6 +285,192 @@ static void *cramfs_read(struct super_block *sb, unsigned 
int offset,
return NULL;
 }
 
+/*
+ * For a mapping to be possible, we need a range of uncompressed and
+ * contiguous blocks. Return the offset for the first block and number of
+ * valid blocks for which that is true, or zero otherwise.
+ */
+static u32 cramfs_get_block_range(struct inode *inode, u32 pgoff, u32 *pages)
+{
+   struct super_block *sb = inode->i_sb;
+   struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
+   int i;
+   u32 *blockptrs, first_block_addr;
+
+   /*
+* We can dereference memory directly here as this code may be
+* reached only when there is a direct filesystem image mapping
+* available in memory.
+*/
+   blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode) + pgoff * 4);
+   first_block_addr = blockptrs[0] & ~CRAMFS_BLK_FLAGS;
+   i = 0;
+   do {
+   u32 block_off = i * (PAGE_SIZE >> CRAMFS_BLK_DIRECT_PTR_SHIFT);
+   u32 expect = (first_block_addr + block_off) |
+CRAMFS_BLK_FLAG_DIRECT_PTR |
+CRAMFS_BLK_FLAG_UNCOMPRESSED;
+   if (blockptrs[i] != expect) {
+   pr_debug("range: block %d/%d got %#x expects %#x\n",
+pgoff+i, pgoff + *pages - 1,
+blockptrs[i], expect);
+   if (i == 0)
+   return 0;
+   break;
+   }
+   } while (++i < *pages);
+
+   *pages = i;
+   return first_block_addr << CRAMFS_BLK_DIRECT_PTR_SHIFT;
+}
+
+#ifdef CONFIG_MMU
+
+static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+   struct inode *inode = file_inode(file);
+   struct super_block *sb = inode->i_sb;
+   struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
+   unsigned int pages, max_pages, offset;
+   unsigned long address, pgoff = vma->vm_pgoff;
+   char *bailout_reason;
+   int ret;
+
+   if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+   return -EINVAL;
+
+   /* Could COW work here? */
+   bailout_reason = "vma is writable";
+   if (vma->vm_flags & VM_WRITE)
+   goto bailout;
+
+   max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   bailout_reason = "beyond file limit";
+   if 

[PATCH v5 4/5] cramfs: add mmap support

2017-10-05 Thread Nicolas Pitre
When cramfs_physmem is used then we have the opportunity to map files
directly from ROM, directly into user space, saving on RAM usage.
This gives us Execute-In-Place (XIP) support.

For a file to be mmap()-able, the map area has to correspond to a range
of uncompressed and contiguous blocks, and in the MMU case it also has
to be page aligned. A version of mkcramfs with appropriate support is
necessary to create such a filesystem image.

In the MMU case it may happen for a vma structure to extend beyond the
actual file size. This is notably the case in binfmt_elf.c:elf_map().
Or the file's last block is shared with other files and cannot be mapped
as is. Rather than refusing to mmap it, we do a "mixed" map and let the
regular fault handler populate the unmapped area with RAM-backed pages.
In practice the unmapped area is seldom accessed so page faults might
never occur before this area is discarded.

In the non-MMU case it is the get_unmapped_area method that is responsible
for providing the address where the actual data can be found. No mapping
is necessary of course.

Signed-off-by: Nicolas Pitre 
Tested-by: Chris Brandt 
---
 fs/cramfs/inode.c | 194 ++
 1 file changed, 194 insertions(+)

diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 6aa1d94ed8..071ce1eb58 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -15,7 +15,10 @@
 
 #include 
 #include 
+#include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -49,6 +52,7 @@ static inline struct cramfs_sb_info *CRAMFS_SB(struct 
super_block *sb)
 static const struct super_operations cramfs_ops;
 static const struct inode_operations cramfs_dir_inode_operations;
 static const struct file_operations cramfs_directory_operations;
+static const struct file_operations cramfs_physmem_fops;
 static const struct address_space_operations cramfs_aops;
 
 static DEFINE_MUTEX(read_mutex);
@@ -96,6 +100,10 @@ static struct inode *get_cramfs_inode(struct super_block 
*sb,
case S_IFREG:
inode->i_fop = _ro_fops;
inode->i_data.a_ops = _aops;
+   if (IS_ENABLED(CONFIG_CRAMFS_PHYSMEM) &&
+   CRAMFS_SB(sb)->flags & CRAMFS_FLAG_EXT_BLOCK_POINTERS &&
+   CRAMFS_SB(sb)->linear_phys_addr)
+   inode->i_fop = _physmem_fops;
break;
case S_IFDIR:
inode->i_op = _dir_inode_operations;
@@ -277,6 +285,192 @@ static void *cramfs_read(struct super_block *sb, unsigned 
int offset,
return NULL;
 }
 
+/*
+ * For a mapping to be possible, we need a range of uncompressed and
+ * contiguous blocks. Return the offset for the first block and number of
+ * valid blocks for which that is true, or zero otherwise.
+ */
+static u32 cramfs_get_block_range(struct inode *inode, u32 pgoff, u32 *pages)
+{
+   struct super_block *sb = inode->i_sb;
+   struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
+   int i;
+   u32 *blockptrs, first_block_addr;
+
+   /*
+* We can dereference memory directly here as this code may be
+* reached only when there is a direct filesystem image mapping
+* available in memory.
+*/
+   blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode) + pgoff * 4);
+   first_block_addr = blockptrs[0] & ~CRAMFS_BLK_FLAGS;
+   i = 0;
+   do {
+   u32 block_off = i * (PAGE_SIZE >> CRAMFS_BLK_DIRECT_PTR_SHIFT);
+   u32 expect = (first_block_addr + block_off) |
+CRAMFS_BLK_FLAG_DIRECT_PTR |
+CRAMFS_BLK_FLAG_UNCOMPRESSED;
+   if (blockptrs[i] != expect) {
+   pr_debug("range: block %d/%d got %#x expects %#x\n",
+pgoff+i, pgoff + *pages - 1,
+blockptrs[i], expect);
+   if (i == 0)
+   return 0;
+   break;
+   }
+   } while (++i < *pages);
+
+   *pages = i;
+   return first_block_addr << CRAMFS_BLK_DIRECT_PTR_SHIFT;
+}
+
+#ifdef CONFIG_MMU
+
+static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+   struct inode *inode = file_inode(file);
+   struct super_block *sb = inode->i_sb;
+   struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
+   unsigned int pages, max_pages, offset;
+   unsigned long address, pgoff = vma->vm_pgoff;
+   char *bailout_reason;
+   int ret;
+
+   if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+   return -EINVAL;
+
+   /* Could COW work here? */
+   bailout_reason = "vma is writable";
+   if (vma->vm_flags & VM_WRITE)
+   goto bailout;
+
+   max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   bailout_reason = "beyond file limit";
+   if (pgoff >= max_pages)
+   goto