Currently, /proc/<pid>/map_files/ is restricted to CAP_SYS_ADMIN, and
is only exposed if CONFIG_CHECKPOINT_RESTORE is set.

Each mapped file region gets a symlink in /proc/<pid>/map_files/
corresponding to the virtual address range at which it is mapped. The
symlinks work like the symlinks in /proc/<pid>/fd/, so you can follow
them to the backing file even if that backing file has been unlinked.

Currently, files which are mapped, unlinked, and closed are impossible
to stat() from userspace. Exposing /proc/<pid>/map_files/ closes this
functionality "hole".

Not being able to stat() such files makes noticing and explicitly
accounting for the space they use on the filesystem impossible. You can
work around this by summing up the space used by every file in the
filesystem and subtracting that total from what statfs() tells you, but
that obviously isn't great, and it becomes unworkable once your
filesystem becomes large enough.

This patch moves map_files/ out from behind CONFIG_CHECKPOINT_RESTORE,
and adjusts the permissions enforced on it as follows:

* proc_map_files_lookup()
* proc_map_files_readdir()
* map_files_d_revalidate()

        Remove the CAP_SYS_ADMIN restriction, leaving only the current
        restriction requiring PTRACE_MODE_READ. The information made
        available to userspace by these three functions is already
        available in /proc/PID/maps with MODE_READ, so I don't see any
        reason to limit them any further (see below for more detail).

* proc_map_files_follow_link()

        This stub has been added, and requires that the user have
        CAP_SYS_ADMIN in order to follow the links in map_files/,
        since there was concern on LKML both about the potential for
        bypassing permissions on ancestor directories in the path to
        files pointed to, and about what happens with more exotic
        memory mappings created by some drivers (ie dma-buf).

In older versions of this patch, I changed every permission check in
the four functions above to enforce MODE_ATTACH instead of MODE_READ.
This was an oversight on my part, and after revisiting the discussion
it seems that nobody was concerned about anything outside of what is
made possible by ->follow_link(). So in this version, I've left the
checks for PTRACE_MODE_READ as-is.

Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Andy Lutomirski <l...@amacapital.net>
Cc: Cyrill Gorcunov <gorcu...@openvz.org>
Cc: Joe Perches <j...@perches.com>
Cc: Kirill A. Shutemov <kir...@shutemov.name>
Reviewed-by: Kees Cook <keesc...@chromium.org>
Signed-off-by: Calvin Owens <calvinow...@fb.com>
---
Changes in v7:  Better commit message (hopefully), patch is otherwise
                identical to v6.

Changes in v6:  Require CAP_SYS_ADMIN for follow_link(). Leave other
                PTRACE_MODE_READ checks as-is, since CAP_SYS_ADMIN
                alone addresses all concerns raised AFAICS.

Changes in v5:  s/dentry->d_inode/d_inode(dentry)/g

Changes in v4:  Return -ESRCH from follow_link() when get_proc_task()
                returns NULL.

Changes in v3:  Changed permission checks to use PTRACE_MODE_ATTACH
                instead of PTRACE_MODE_READ, and added a stub to
                enforce MODE_ATTACH on follow_link() as well.

Changes in v2:  Removed the follow_link() stub that returned -EPERM if
                the caller didn't have CAP_SYS_ADMIN, since the caller
                in my chroot() scenario gets -EACCES anyway.


 fs/proc/base.c | 42 +++++++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 093ca14..0270191 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1641,8 +1641,6 @@ end_instantiate:
        return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
 }
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
-
 /*
  * dname_to_vma_addr - maps a dentry name into two unsigned longs
  * which represent vma start and end addresses.
@@ -1669,11 +1667,6 @@ static int map_files_d_revalidate(struct dentry *dentry, 
unsigned int flags)
        if (flags & LOOKUP_RCU)
                return -ECHILD;
 
-       if (!capable(CAP_SYS_ADMIN)) {
-               status = -EPERM;
-               goto out_notask;
-       }
-
        inode = d_inode(dentry);
        task = get_proc_task(inode);
        if (!task)
@@ -1762,6 +1755,28 @@ struct map_files_info {
        unsigned char   name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
 };
 
+/*
+ * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
+ * symlinks may be used to bypass permissions on ancestor directories in the
+ * path to the file in question.
+ */
+static void *proc_map_files_follow_link(struct dentry *dentry, struct 
nameidata *nd)
+{
+       if (!capable(CAP_SYS_ADMIN))
+               return ERR_PTR(-EPERM);
+
+       return proc_pid_follow_link(dentry, nd);
+}
+
+/*
+ * Identical to proc_pid_link_inode_operations except for follow_link()
+ */
+static const struct inode_operations proc_map_files_link_inode_operations = {
+       .readlink       = proc_pid_readlink,
+       .follow_link    = proc_map_files_follow_link,
+       .setattr        = proc_setattr,
+};
+
 static int
 proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
                           struct task_struct *task, const void *ptr)
@@ -1777,7 +1792,7 @@ proc_map_files_instantiate(struct inode *dir, struct 
dentry *dentry,
        ei = PROC_I(inode);
        ei->op.proc_get_link = proc_map_files_get_link;
 
-       inode->i_op = &proc_pid_link_inode_operations;
+       inode->i_op = &proc_map_files_link_inode_operations;
        inode->i_size = 64;
        inode->i_mode = S_IFLNK;
 
@@ -1801,10 +1816,6 @@ static struct dentry *proc_map_files_lookup(struct inode 
*dir,
        int result;
        struct mm_struct *mm;
 
-       result = -EPERM;
-       if (!capable(CAP_SYS_ADMIN))
-               goto out;
-
        result = -ENOENT;
        task = get_proc_task(dir);
        if (!task)
@@ -1858,10 +1869,6 @@ proc_map_files_readdir(struct file *file, struct 
dir_context *ctx)
        struct map_files_info *p;
        int ret;
 
-       ret = -EPERM;
-       if (!capable(CAP_SYS_ADMIN))
-               goto out;
-
        ret = -ENOENT;
        task = get_proc_task(file_inode(file));
        if (!task)
@@ -2050,7 +2057,6 @@ static const struct file_operations 
proc_timers_operations = {
        .llseek         = seq_lseek,
        .release        = seq_release_private,
 };
-#endif /* CONFIG_CHECKPOINT_RESTORE */
 
 static int proc_pident_instantiate(struct inode *dir,
        struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2549,9 +2555,7 @@ static const struct inode_operations 
proc_task_inode_operations;
 static const struct pid_entry tgid_base_stuff[] = {
        DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, 
proc_task_operations),
        DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, 
proc_fd_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
        DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, 
proc_map_files_operations),
-#endif
        DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, 
proc_fdinfo_operations),
        DIR("ns",         S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, 
proc_ns_dir_operations),
 #ifdef CONFIG_NET
-- 
1.8.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to