Some filesystems (NFS) need i_mutex in ->mmap(); this violates the normal
locking order. Provide a hook that is called before we take mmap_sem.

This leaves a window between ->mmap_prepare() and ->mmap(). If that's a problem
(Trond?) we could also provide ->mmap_finish() and guarantee that it is called
whenever ->mmap_prepare() returned success.

This would allow holding state and thereby close the window.

Signed-off-by: Peter Zijlstra <[EMAIL PROTECTED]>
---
 Documentation/filesystems/Locking |   11 ++++++++++-
 Documentation/filesystems/vfs.txt |    3 +++
 include/linux/fs.h                |    1 +
 ipc/shm.c                         |   13 +++++++++++++
 mm/mmap.c                         |   12 ++++++++++++
 mm/nommu.c                        |   12 ++++++++++++
 6 files changed, 51 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -1172,6 +1172,7 @@ struct file_operations {
        int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned 
long);
        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
        long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
+       int (*mmap_prepare) (struct file *, unsigned long len, unsigned long 
prot, unsigned long flags, unsigned long pgoff);
        int (*mmap) (struct file *, struct vm_area_struct *);
        int (*open) (struct inode *, struct file *);
        int (*flush) (struct file *, fl_owner_t id);
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c
+++ linux-2.6/mm/mmap.c
@@ -1035,6 +1035,12 @@ unsigned long do_mmap_pgoff(struct file 
        struct mm_struct *mm = current->mm;
        unsigned long ret;
 
+       if (file && file->f_op && file->f_op->mmap_prepare) {
+               ret = file->f_op->mmap_prepare(file, len, prot, flags, pgoff);
+               if (ret)
+                       return ret;
+       }
+
        down_write(&mm->mmap_sem);
        ret = ___do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
        up_write(&mm->mmap_sem);
@@ -1054,6 +1060,12 @@ unsigned long do_mmap(struct file *file,
        if ((offset + PAGE_ALIGN(len)) < offset || (offset & ~PAGE_MASK))
                return ret;
 
+       if (file && file->f_op && file->f_op->mmap_prepare) {
+               ret = file->f_op->mmap_prepare(file, len, prot, flags, pgoff);
+               if (ret)
+                       return ret;
+       }
+
        down_write(&mm->mmap_sem);
        ret = ___do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
        up_write(&mm->mmap_sem);
Index: linux-2.6/mm/nommu.c
===================================================================
--- linux-2.6.orig/mm/nommu.c
+++ linux-2.6/mm/nommu.c
@@ -1025,6 +1025,12 @@ unsigned long do_mmap_pgoff(struct file 
        struct mm_struct *mm = current->mm;
        unsigned long ret;
 
+       if (file && file->f_op && file->f_op->mmap_prepare) {
+               ret = file->f_op->mmap_prepare(file, len, prot, flags, pgoff);
+               if (ret)
+                       return ret;
+       }
+
        down_write(&mm->mmap_sem);
        ret = ___do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
        up_write(&mm->mmap_sem);
@@ -1044,6 +1050,12 @@ unsigned long do_mmap(struct file *file,
        if ((offset + PAGE_ALIGN(len)) < offset || (offset & ~PAGE_MASK))
                return ret;
 
+       if (file && file->f_op && file->f_op->mmap_prepare) {
+               ret = file->f_op->mmap_prepare(file, len, prot, flags, pgoff);
+               if (ret)
+                       return ret;
+       }
+
        down_write(&mm->mmap_sem);
        ret = ___do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
        up_write(&mm->mmap_sem);
Index: linux-2.6/ipc/shm.c
===================================================================
--- linux-2.6.orig/ipc/shm.c
+++ linux-2.6/ipc/shm.c
@@ -300,6 +300,12 @@ static int shm_mmap(struct file * file, 
        struct shm_file_data *sfd = shm_file_data(file);
        int ret;
 
+       /*
+        * SHM backing filesystems must not provide ->mmap_prepare()!
+        * See do_shmat().
+        */
+       WARN_ON(sfd->file->f_op->mmap_prepare);
+
        ret = sfd->file->f_op->mmap(sfd->file, vma);
        if (ret != 0)
                return ret;
@@ -1012,6 +1018,13 @@ long do_shmat(int shmid, char __user *sh
                        goto invalid;
        }
                
+       /*
+        *  The usage of ___do_mmap_pgoff() is needed because we must already
+        *  hold the mmap_sem here due to find_vma_intersection vs mmap races.
+        *
+        *  This prohibits SHM backing filesystems from using
+        *  f_op->mmap_prepare().
+        */
        user_addr = ___do_mmap_pgoff (file, addr, size, prot, flags, 0);
        *raddr = user_addr;
        err = 0;
Index: linux-2.6/Documentation/filesystems/Locking
===================================================================
--- linux-2.6.orig/Documentation/filesystems/Locking
+++ linux-2.6/Documentation/filesystems/Locking
@@ -378,6 +378,8 @@ prototypes:
                        unsigned long);
        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
        long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
+       int (*mmap_prepare) (struct file *, unsigned long len, unsigned long 
prot,
+                       unsigned long flags, unsigned long pgoff);
        int (*mmap) (struct file *, struct vm_area_struct *);
        int (*open) (struct inode *, struct file *);
        int (*flush) (struct file *);
@@ -413,7 +415,8 @@ poll:                       no
 ioctl:                 yes     (see below)
 unlocked_ioctl:                no      (see below)
 compat_ioctl:          no
-mmap:                  no
+mmap_prepare:          no      (see below)
+mmap:                  no      (see below)
 open:                  maybe   (see below)
 flush:                 no
 release:               no
@@ -436,6 +439,12 @@ For many filesystems, it is probably saf
 semaphore.  Note some filesystems (i.e. remote ones) provide no
 protection for i_size so you will need to use the BKL.
 
+->mmap_prepare() is called on mmap(2) _before_ acquisition of the mmap_sem,
+filesystems can use this hook to prepare the file for being mapped, and can
+take i_mutex if they need to.
+
+->mmap() is called while the mmap_sem is held.
+
 ->open() locking is in-transit: big lock partially moved into the methods.
 The only exception is ->open() in the instances of file_operations that never
 end up in ->i_fop/->proc_fops, i.e. ones that belong to character devices
Index: linux-2.6/Documentation/filesystems/vfs.txt
===================================================================
--- linux-2.6.orig/Documentation/filesystems/vfs.txt
+++ linux-2.6/Documentation/filesystems/vfs.txt
@@ -762,6 +762,7 @@ struct file_operations {
        int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned 
long);
        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
        long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
+       int (*mmap_prepare) (struct file *, unsigned long len, unsigned long 
prot, unsigned long flags, unsigned long pgoff);
        int (*mmap) (struct file *, struct vm_area_struct *);
        int (*open) (struct inode *, struct file *);
        int (*flush) (struct file *);
@@ -809,6 +810,8 @@ otherwise noted.
   compat_ioctl: called by the ioctl(2) system call when 32 bit system calls
         are used on 64 bit kernels.
 
+  mmap_prepare: called by the mmap(2) system call before mmap_sem is taken
+
   mmap: called by the mmap(2) system call
 
   open: called by the VFS when an inode should be opened. When the VFS

--

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to