Re: [PATCH v5 4/4] vduse: Add LSM hook to check Virtio device type

2023-12-15 Thread Serge E. Hallyn
On Tue, Dec 12, 2023 at 02:55:33PM -0800, Casey Schaufler wrote:
> On 12/12/2023 9:59 AM, Michael S. Tsirkin wrote:
> > On Tue, Dec 12, 2023 at 08:33:39AM -0800, Casey Schaufler wrote:
> >> On 12/12/2023 5:17 AM, Maxime Coquelin wrote:
> >>> This patch introduces an LSM hook for device creation,
> >>> destruction (ioctl()) and opening (open()) operations,
> >>> checking the application is allowed to perform these
> >>> operations for the Virtio device type.
> >> My earlier comments on a vduse specific LSM hook still hold.
> >> I would much prefer to see a device permissions hook(s) that
> >> are useful for devices in general. Not just vduse devices.
> >> I know that there are already some very special purpose LSM
> >> hooks, but the experience with maintaining them is why I don't
> >> want more of them. 
> > What exactly does this mean?
> 
> You have proposed an LSM hook that is only useful for vduse.
> You want to implement a set of controls that only apply to vduse.
> I can't help but think that if someone (i.e. you) wants to control
> device creation for vduse that there could well be a use case for
> control over device creation for some other set of devices. It is
> quite possible that someone out there is desperately trying to
> solve the same problem you have, but with a different device.
> 
> I have no desire to have to deal with
>   security_vduse_perm_check()
>   security_odddev_perm_check()
>   ...
>   security_evendev_perm_check()
> 
> when we should be able to have
>   security_device_perm_check()
> 
> that can service them all.
> 
> 
> >  Devices like tap etc? How do we
> > find them all though?
> 
> I'm not suggesting you find them all. I'm suggesting that you provide
> an interface that someone could use if they wanted to. I think you
> will be surprised how many will appear (with complaints about the
> interface you propose, of course) if you implement a generally useful
> LSM hook.

Right now you have create, destroy, and open.  Are you expecting to add
other perms?  These sound generic enough that it definitely seems worth
doing as Casey suggests.  On the other hand, if this could become a
gateway to lsm device access hooks basically becoming ioctl, we might
want to consider that.

> >>> Signed-off-by: Maxime Coquelin 
> >>> ---
> >>>  MAINTAINERS |  1 +
> >>>  drivers/vdpa/vdpa_user/vduse_dev.c  | 13 
> >>>  include/linux/lsm_hook_defs.h   |  2 ++
> >>>  include/linux/security.h|  6 ++
> >>>  include/linux/vduse.h   | 14 +
> >>>  security/security.c | 15 ++
> >>>  security/selinux/hooks.c| 32 +
> >>>  security/selinux/include/classmap.h |  2 ++
> >>>  8 files changed, 85 insertions(+)
> >>>  create mode 100644 include/linux/vduse.h
> >>>
> >>> diff --git a/MAINTAINERS b/MAINTAINERS
> >>> index a0fb0df07b43..4e83b14358d2 100644
> >>> --- a/MAINTAINERS
> >>> +++ b/MAINTAINERS
> >>> @@ -23040,6 +23040,7 @@ F:drivers/net/virtio_net.c
> >>>  F:   drivers/vdpa/
> >>>  F:   drivers/virtio/
> >>>  F:   include/linux/vdpa.h
> >>> +F:   include/linux/vduse.h
> >>>  F:   include/linux/virtio*.h
> >>>  F:   include/linux/vringh.h
> >>>  F:   include/uapi/linux/virtio_*.h
> >>> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c 
> >>> b/drivers/vdpa/vdpa_user/vduse_dev.c
> >>> index fa62825be378..59ab7eb62e20 100644
> >>> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> >>> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> >>> @@ -8,6 +8,7 @@
> >>>   *
> >>>   */
> >>>  
> >>> +#include "linux/security.h"
> >>>  #include 
> >>>  #include 
> >>>  #include 
> >>> @@ -30,6 +31,7 @@
> >>>  #include 
> >>>  #include 
> >>>  #include 
> >>> +#include 
> >>>  
> >>>  #include "iova_domain.h"
> >>>  
> >>> @@ -1442,6 +1444,10 @@ static int vduse_dev_open(struct inode *inode, 
> >>> struct file *file)
> >>>   if (dev->connected)
> >>>   goto unlock;
> >>>  
> >>> + ret = -EPERM;
> >>> + if (security_vduse_perm_check(VDUSE_PERM_OPEN, dev->device_id))
> >>> + goto unlock;
> >>> +
> >>>   ret = 0;
> >>>   dev->connected = true;
> >>>   file->private_data = dev;
> >>> @@ -1664,6 +1670,9 @@ static int vduse_destroy_dev(char *name)
> >>>   if (!dev)
> >>>   return -EINVAL;
> >>>  
> >>> + if (security_vduse_perm_check(VDUSE_PERM_DESTROY, dev->device_id))
> >>> + return -EPERM;
> >>> +
> >>>   mutex_lock(&dev->lock);
> >>>   if (dev->vdev || dev->connected) {
> >>>   mutex_unlock(&dev->lock);
> >>> @@ -1828,6 +1837,10 @@ static int vduse_create_dev(struct 
> >>> vduse_dev_config *config,
> >>>   int ret;
> >>>   struct vduse_dev *dev;
> >>>  
> >>> + ret = -EPERM;
> >>> + if (security_vduse_perm_check(VDUSE_PERM_CREATE, config->device_id))
> >>> + goto err;
> >>> +
> >>>   ret = -EEXIST;
> >>>   if (vduse_find_dev(config->name))
> >>>   goto err;
> >>> diff --git 

[PATCH v3.4] capabilities: require CAP_SETFCAP to map uid 0

2021-04-20 Thread Serge E. Hallyn
cap_setfcap is required to create file capabilities.

Since 8db6c34f1dbc ("Introduce v3 namespaced file capabilities"), a
process running as uid 0 but without cap_setfcap is able to work around
this as follows: unshare a new user namespace which maps parent uid 0
into the child namespace.  While this task will not have new
capabilities against the parent namespace, there is a loophole due to
the way namespaced file capabilities are represented as xattrs.  File
capabilities valid in userns 1 are distinguished from file capabilities
valid in userns 2 by the kuid which underlies uid 0.  Therefore the
restricted root process can unshare a new self-mapping namespace, add a
namespaced file capability onto a file, then use that file capability in
the parent namespace.

To prevent that, do not allow mapping parent uid 0 if the process which
opened the uid_map file does not have CAP_SETFCAP, which is the capability
for setting file capabilities.

As a further wrinkle:  a task can unshare its user namespace, then
open its uid_map file itself, and map (only) its own uid.  In this
case we do not have the credential from before unshare,  which was
potentially more restricted.  So, when creating a user namespace, we
record whether the creator had CAP_SETFCAP.  Then we can use that
during map_write().

With this patch:

1. Unprivileged user can still unshare -Ur

ubuntu@caps:~$ unshare -Ur
root@caps:~# logout

2. Root user can still unshare -Ur

ubuntu@caps:~$ sudo bash
root@caps:/home/ubuntu# unshare -Ur
root@caps:/home/ubuntu# logout

3. Root user without CAP_SETFCAP cannot unshare -Ur:

root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap --
root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap
unable to set CAP_SETFCAP effective capability: Operation not permitted
root@caps:/home/ubuntu# unshare -Ur
unshare: write failed /proc/self/uid_map: Operation not permitted

Note: an alternative solution would be to allow uid 0 mappings by
processes without CAP_SETFCAP, but to prevent such a namespace from
writing any file capabilities.  This approach can be seen here:

https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4

History:

Commit 95ebabde382 ("capabilities: Don't allow writing ambiguous v3 file
capabilities") tried to fix the issue by preventing v3 fscaps to be
written to disk when the root uid would map to the same uid in nested
user namespaces. This led to regressions for various workloads. For
example, see [1]. Ultimately this is a valid use-case we have to support
meaning we had to revert this change in 3b0c2d3eaa83 ("Revert
95ebabde382c ("capabilities: Don't allow writing ambiguous v3 file
capabilities")").

[1]: https://github.com/containers/buildah/issues/3071

Signed-off-by: Serge Hallyn 
Reviewed-by: Andrew G. Morgan 
Tested-by: Christian Brauner 
Reviewed-by: Christian Brauner 
Tested-by: Giuseppe Scrivano 
Cc: "Eric W. Biederman" 

Changelog:
   * fix logic in the case of writing to another task's uid_map
   * rename 'ns' to 'map_ns', and make a file_ns local variable
   * use /* comments */
   * update the CAP_SETFCAP comment in capability.h
   * rename parent_unpriv to parent_can_setfcap (and reverse the
 logic)
   * remove printks
   * clarify (i hope) the code comments
   * update capability.h comment
   * renamed parent_can_setfcap to parent_could_setfcap
   * made the check its own disallowed_0_mapping() fn
   * moved the check into new_idmap_permitted
   * rename disallowed_0_mapping to verify_root_mapping
   * change verify_root_mapping to Christian's suggested flow
   * correct+clarify comments: parent uid 0 mapping to any
 child uid is a problem.
   * remove unused lower_first variable.
---
 include/linux/user_namespace.h  |  3 ++
 include/uapi/linux/capability.h |  3 +-
 kernel/user_namespace.c | 65 +++--
 3 files changed, 67 insertions(+), 4 deletions(-)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 64cf8ebdc4ec..f6c5f784be5a 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -63,6 +63,9 @@ struct user_namespace {
kgid_t  group;
struct ns_commonns;
unsigned long   flags;
+   /* parent_could_setfcap: true if the creator of this ns had CAP_SETFCAP
+* in its effective capability set at the child ns creation time. */
+   boolparent_could_setfcap;
 
 #ifdef CONFIG_KEYS
/* List of joinable keyrings in this namespace.  Modification access of
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index c6ca33034147..2ddb4226cd23 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -335,7 +335,8 @@ struct vfs_ns_cap_data {
 
 #define CAP_AUDIT_CONTROL30
 
-/* Set or remove capabilities on files */
+/* Set or remove capabilities on files.
+   Map uid=0 into a child 

Re: [PATCH] capabilities: require CAP_SETFCAP to map uid 0 (v3.2)

2021-04-20 Thread Serge E. Hallyn
On Mon, Apr 19, 2021 at 05:52:39PM +0200, Giuseppe Scrivano wrote:
> ebied...@xmission.com (Eric W. Biederman) writes:
> 
> > Giuseppe can you take a look at this?
> >
> > This is a second attempt at tightening up the semantics of writing to
> > file capabilities from a user namespace.
> >
> > The first attempt was reverted with 3b0c2d3eaa83 ("Revert 95ebabde382c
> > ("capabilities: Don't allow writing ambiguous v3 file capabilities")"),
> > which corrected the issue reported in:
> > https://github.com/containers/buildah/issues/3071
> >
> > There is a report the podman testsuite passes.  While different this
> > looks in many ways much more strict than the code that was reverted.  So
> > while I can imagine this change doesn't cause problems as is, I will be
> > surprised.
> 
> thanks for pulling me in the discussion.
> 
> I've tested the patch with several cases similar to the issue we had in
> the past and the patch seems to work well.  
> 
> Podman creates all the user namespaces within the same parent user
> namespace.  In the parent user namespace all the capabilities are kept
> and AFAIK Docker does the same.  I'd expect a change in behavior only
> for nested user namespaces in containers where CAP_SETFCAP is not
> granted, but that is not a common configuration given that CAP_SETFCAP
> is added by default.
> 
> 
> > "Serge E. Hallyn"  writes:
> >
> >> +/**
> >> + * verify_root_map() - check the uid 0 mapping
> >> + * @file: idmapping file
> >> + * @map_ns: user namespace of the target process
> >> + * @new_map: requested idmap
> >> + *
> >> + * If a process requested a mapping for uid 0 onto uid 0, verify that the
> >> + * process writing the map had the CAP_SETFCAP capability as the target 
> >> process
> >> + * will be able to write fscaps that are valid in ancestor user 
> >> namespaces.
> >> + *
> >> + * Return: true if the mapping is allowed, false if not.
> >> + */
> >> +static bool verify_root_map(const struct file *file,
> >> +  struct user_namespace *map_ns,
> >> +  struct uid_gid_map *new_map)
> >> +{
> >> +  int idx;
> >> +  const struct user_namespace *file_ns = file->f_cred->user_ns;
> >> +  struct uid_gid_extent *extent0 = NULL;
> >> +
> >> +  for (idx = 0; idx < new_map->nr_extents; idx++) {
> >> +  u32 lower_first;
> 
> nit: lower_first seems unused?

Drat - I noticed that Sunday or Monday and forgot to remove it, thanks.

> >> +
> >> +  if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
> >> +  extent0 = &new_map->extent[idx];
> >> +  else
> >> +  extent0 = &new_map->forward[idx];
> >> +  if (extent0->lower_first == 0)
> >> +  break;
> >> +
> >> +  extent0 = NULL;
> >> +  }
> 
> Tested-by: Giuseppe Scrivano 

Awesome - thanks for testing.


Re: [PATCH] capabilities: require CAP_SETFCAP to map uid 0 (v3.3)

2021-04-19 Thread Serge E. Hallyn
On Mon, Apr 19, 2021 at 06:09:11PM +0200, Christian Brauner wrote:
> On Mon, Apr 19, 2021 at 07:25:14AM -0500, Serge Hallyn wrote:
> > cap_setfcap is required to create file capabilities.
> > 
> > Since 8db6c34f1dbc ("Introduce v3 namespaced file capabilities"), a
> > process running as uid 0 but without cap_setfcap is able to work around
> > this as follows: unshare a new user namespace which maps parent uid 0
> > into the child namespace.  While this task will not have new
> > capabilities against the parent namespace, there is a loophole due to
> > the way namespaced file capabilities are represented as xattrs.  File
> > capabilities valid in userns 1 are distinguished from file capabilities
> > valid in userns 2 by the kuid which underlies uid 0.  Therefore the
> > restricted root process can unshare a new self-mapping namespace, add a
> > namespaced file capability onto a file, then use that file capability in
> > the parent namespace.
> > 
> > To prevent that, do not allow mapping parent uid 0 if the process which
> > opened the uid_map file does not have CAP_SETFCAP, which is the capability
> > for setting file capabilities.
> > 
> > As a further wrinkle:  a task can unshare its user namespace, then
> > open its uid_map file itself, and map (only) its own uid.  In this
> > case we do not have the credential from before unshare,  which was
> > potentially more restricted.  So, when creating a user namespace, we
> > record whether the creator had CAP_SETFCAP.  Then we can use that
> > during map_write().
> > 
> > With this patch:
> > 
> > 1. Unprivileged user can still unshare -Ur
> > 
> > ubuntu@caps:~$ unshare -Ur
> > root@caps:~# logout
> > 
> > 2. Root user can still unshare -Ur
> > 
> > ubuntu@caps:~$ sudo bash
> > root@caps:/home/ubuntu# unshare -Ur
> > root@caps:/home/ubuntu# logout
> > 
> > 3. Root user without CAP_SETFCAP cannot unshare -Ur:
> > 
> > root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap --
> > root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap
> > unable to set CAP_SETFCAP effective capability: Operation not permitted
> > root@caps:/home/ubuntu# unshare -Ur
> > unshare: write failed /proc/self/uid_map: Operation not permitted
> > 
> > Note: an alternative solution would be to allow uid 0 mappings by
> > processes without CAP_SETFCAP, but to prevent such a namespace from
> > writing any file capabilities.  This approach can be seen here:
> > 
> > https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4
> > 
> 
> Ah, can you link to the previous fix and its revert, please? I think
> that was mentioned in the formerly private thread as well but we forgot:
> 
> commit 95ebabde382c371572297915b104e55403674e73
> Author: Eric W. Biederman 
> Date:   Thu Dec 17 09:42:00 2020 -0600
> 
> capabilities: Don't allow writing ambiguous v3 file capabilities
> 
> commit 3b0c2d3eaa83da259d7726192cf55a137769012f
> Author: Eric W. Biederman 
> Date:   Fri Mar 12 15:07:09 2021 -0600
> 
> Revert 95ebabde382c ("capabilities: Don't allow writing ambiguous v3 file 
> capabilities")

Sure.

Is there a tag for that kind of thing or do I just mention it at the end
of the description?


[PATCH] capabilities: require CAP_SETFCAP to map uid 0 (v3.3)

2021-04-19 Thread Serge E. Hallyn
cap_setfcap is required to create file capabilities.

Since 8db6c34f1dbc ("Introduce v3 namespaced file capabilities"), a
process running as uid 0 but without cap_setfcap is able to work around
this as follows: unshare a new user namespace which maps parent uid 0
into the child namespace.  While this task will not have new
capabilities against the parent namespace, there is a loophole due to
the way namespaced file capabilities are represented as xattrs.  File
capabilities valid in userns 1 are distinguished from file capabilities
valid in userns 2 by the kuid which underlies uid 0.  Therefore the
restricted root process can unshare a new self-mapping namespace, add a
namespaced file capability onto a file, then use that file capability in
the parent namespace.

To prevent that, do not allow mapping parent uid 0 if the process which
opened the uid_map file does not have CAP_SETFCAP, which is the capability
for setting file capabilities.

As a further wrinkle:  a task can unshare its user namespace, then
open its uid_map file itself, and map (only) its own uid.  In this
case we do not have the credential from before unshare,  which was
potentially more restricted.  So, when creating a user namespace, we
record whether the creator had CAP_SETFCAP.  Then we can use that
during map_write().

With this patch:

1. Unprivileged user can still unshare -Ur

ubuntu@caps:~$ unshare -Ur
root@caps:~# logout

2. Root user can still unshare -Ur

ubuntu@caps:~$ sudo bash
root@caps:/home/ubuntu# unshare -Ur
root@caps:/home/ubuntu# logout

3. Root user without CAP_SETFCAP cannot unshare -Ur:

root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap --
root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap
unable to set CAP_SETFCAP effective capability: Operation not permitted
root@caps:/home/ubuntu# unshare -Ur
unshare: write failed /proc/self/uid_map: Operation not permitted

Note: an alternative solution would be to allow uid 0 mappings by
processes without CAP_SETFCAP, but to prevent such a namespace from
writing any file capabilities.  This approach can be seen here:

https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4

Signed-off-by: Serge Hallyn 
Reviewed-by: Andrew G. Morgan 
Tested-by: Christian Brauner 
Reviewed-by: Christian Brauner 
Cc: "Eric W. Biederman" 

Changelog:
   * fix logic in the case of writing to another task's uid_map
   * rename 'ns' to 'map_ns', and make a file_ns local variable
   * use /* comments */
   * update the CAP_SETFCAP comment in capability.h
   * rename parent_unpriv to parent_can_setfcap (and reverse the
 logic)
   * remove printks
   * clarify (i hope) the code comments
   * update capability.h comment
   * renamed parent_can_setfcap to parent_could_setfcap
   * made the check its own disallowed_0_mapping() fn
   * moved the check into new_idmap_permitted
   * rename disallowed_0_mapping to verify_root_mapping
   * change verify_root_mapping to Christian's suggested flow
   * correct+clarify comments: parent uid 0 mapping to any
 child uid is a problem.
---
 include/linux/user_namespace.h  |  3 ++
 include/uapi/linux/capability.h |  3 +-
 kernel/user_namespace.c | 67 +++--
 3 files changed, 69 insertions(+), 4 deletions(-)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 64cf8ebdc4ec..f6c5f784be5a 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -63,6 +63,9 @@ struct user_namespace {
kgid_t  group;
struct ns_commonns;
unsigned long   flags;
+   /* parent_could_setfcap: true if the creator of this ns had CAP_SETFCAP
+* in its effective capability set at the child ns creation time. */
+   boolparent_could_setfcap;
 
 #ifdef CONFIG_KEYS
/* List of joinable keyrings in this namespace.  Modification access of
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index c6ca33034147..2ddb4226cd23 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -335,7 +335,8 @@ struct vfs_ns_cap_data {
 
 #define CAP_AUDIT_CONTROL30
 
-/* Set or remove capabilities on files */
+/* Set or remove capabilities on files.
+   Map uid=0 into a child user namespace. */
 
 #define CAP_SETFCAP 31
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index af612945a4d0..609a729a9879 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -106,6 +106,7 @@ int create_user_ns(struct cred *new)
if (!ns)
goto fail_dec;
 
+   ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
ret = ns_alloc_inum(&ns->ns);
if (ret)
goto fail_free;
@@ -841,6 +842,62 @@ static int sort_idmaps(struct uid_gid_map *map)
return 0;
 }
 
+/**
+ * verify_root_map() - check the uid 0 mapping
+ * 

[PATCH] capabilities: require CAP_SETFCAP to map uid 0 (v3.2)

2021-04-17 Thread Serge E. Hallyn
A process running as uid 0 but without cap_setfcap currently can simply
unshare a new user namespace with uid 0 mapped to 0.  While this task
will not have new capabilities against the parent namespace, there is
a loophole due to the way namespaced file capabilities work.  File
capabilities valid in userns 1 are distinguished from file capabilities
valid in userns 2 by the kuid which underlies uid 0.  Therefore
the restricted root process can unshare a new self-mapping namespace,
add a namespaced file capability onto a file, then use that file
capability in the parent namespace.

To prevent that, do not allow mapping uid 0 if the process which
opened the uid_map file does not have CAP_SETFCAP, which is the capability
for setting file capabilities.

A further wrinkle:  a task can unshare its user namespace, then
open its uid_map file itself, and map (only) its own uid.  In this
case we do not have the credential from before unshare,  which was
potentially more restricted.  So, when creating a user namespace, we
record whether the creator had CAP_SETFCAP.  Then we can use that
during map_write().

With this patch:

1. unprivileged user can still unshare -Ur

ubuntu@caps:~$ unshare -Ur
root@caps:~# logout

2. root user can still unshare -Ur

ubuntu@caps:~$ sudo bash
root@caps:/home/ubuntu# unshare -Ur
root@caps:/home/ubuntu# logout

3. root user without CAP_SETFCAP cannot unshare -Ur:

root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap --
root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap
unable to set CAP_SETFCAP effective capability: Operation not permitted
root@caps:/home/ubuntu# unshare -Ur
unshare: write failed /proc/self/uid_map: Operation not permitted

Signed-off-by: Serge Hallyn 

Changelog:
   * fix logic in the case of writing to another task's uid_map
   * rename 'ns' to 'map_ns', and make a file_ns local variable
   * use /* comments */
   * update the CAP_SETFCAP comment in capability.h
   * rename parent_unpriv to parent_can_setfcap (and reverse the
 logic)
   * remove printks
   * clarify (i hope) the code comments
   * update capability.h comment
   * renamed parent_can_setfcap to parent_could_setfcap
   * made the check its own disallowed_0_mapping() fn
   * moved the check into new_idmap_permitted
   * rename disallowed_0_mapping to verify_root_mapping
   * change verify_root_mapping to Christian's suggested flow
---
 include/linux/user_namespace.h  |  3 ++
 include/uapi/linux/capability.h |  3 +-
 kernel/user_namespace.c | 66 +++--
 3 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 64cf8ebdc4ec..f6c5f784be5a 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -63,6 +63,9 @@ struct user_namespace {
kgid_t  group;
struct ns_commonns;
unsigned long   flags;
+   /* parent_could_setfcap: true if the creator of this ns had CAP_SETFCAP
+* in its effective capability set at the child ns creation time. */
+   boolparent_could_setfcap;
 
 #ifdef CONFIG_KEYS
/* List of joinable keyrings in this namespace.  Modification access of
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index c6ca33034147..2ddb4226cd23 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -335,7 +335,8 @@ struct vfs_ns_cap_data {
 
 #define CAP_AUDIT_CONTROL30
 
-/* Set or remove capabilities on files */
+/* Set or remove capabilities on files.
+   Map uid=0 into a child user namespace. */
 
 #define CAP_SETFCAP 31
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index af612945a4d0..2ead291177b0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -106,6 +106,7 @@ int create_user_ns(struct cred *new)
if (!ns)
goto fail_dec;
 
+   ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
ret = ns_alloc_inum(&ns->ns);
if (ret)
goto fail_free;
@@ -841,6 +842,61 @@ static int sort_idmaps(struct uid_gid_map *map)
return 0;
 }
 
+/**
+ * verify_root_map() - check the uid 0 mapping
+ * @file: idmapping file
+ * @map_ns: user namespace of the target process
+ * @new_map: requested idmap
+ *
+ * If a process requested a mapping for uid 0 onto uid 0, verify that the
+ * process writing the map had the CAP_SETFCAP capability as the target process
+ * will be able to write fscaps that are valid in ancestor user namespaces.
+ *
+ * Return: true if the mapping is allowed, false if not.
+ */
+static bool verify_root_map(const struct file *file,
+   struct user_namespace *map_ns,
+   struct uid_gid_map *new_map)
+{
+   int idx;
+   const struct user_namespace *file_ns = file->f_cred->user_ns;
+   struct uid_gid_extent *extent0 = NULL;

Re: [RFC PATCH] capabilities: require CAP_SETFCAP to map uid 0 (v3)

2021-04-16 Thread Serge E. Hallyn
On Fri, Apr 16, 2021 at 04:34:53PM -0500, Serge E. Hallyn wrote:
> On Fri, Apr 16, 2021 at 05:05:01PM +0200, Christian Brauner wrote:
> > On Thu, Apr 15, 2021 at 11:58:51PM -0500, Serge Hallyn wrote:
> > > (Eric - this patch (v3) is a cleaned up version of the previous approach.
> > > v4 is at 
> > > https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4
> > > and is the approach you suggested.  I can send it also as a separate patch
> > > if you like)
> > > 
> > > A process running as uid 0 but without cap_setfcap currently can simply
> > > unshare a new user namespace with uid 0 mapped to 0.  While this task
> > > will not have new capabilities against the parent namespace, there is
> > > a loophole due to the way namespaced file capabilities work.  File
> > > capabilities valid in userns 1 are distinguished from file capabilities
> > > valid in userns 2 by the kuid which underlies uid 0.  Therefore
> > > the restricted root process can unshare a new self-mapping namespace,
> > > add a namespaced file capability onto a file, then use that file
> > > capability in the parent namespace.
> > > 
> > > To prevent that, do not allow mapping uid 0 if the process which
> > > opened the uid_map file does not have CAP_SETFCAP, which is the capability
> > > for setting file capabilities.
> > > 
> > > A further wrinkle:  a task can unshare its user namespace, then
> > > open its uid_map file itself, and map (only) its own uid.  In this
> > > case we do not have the credential from before unshare,  which was
> > > potentially more restricted.  So, when creating a user namespace, we
> > > record whether the creator had CAP_SETFCAP.  Then we can use that
> > > during map_write().
> > > 
> > > With this patch:
> > > 
> > > 1. unprivileged user can still unshare -Ur
> > > 
> > > ubuntu@caps:~$ unshare -Ur
> > > root@caps:~# logout
> > > 
> > > 2. root user can still unshare -Ur
> > > 
> > > ubuntu@caps:~$ sudo bash
> > > root@caps:/home/ubuntu# unshare -Ur
> > > root@caps:/home/ubuntu# logout
> > > 
> > > 3. root user without CAP_SETFCAP cannot unshare -Ur:
> > > 
> > > root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap --
> > > root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap
> > > unable to set CAP_SETFCAP effective capability: Operation not permitted
> > > root@caps:/home/ubuntu# unshare -Ur
> > > unshare: write failed /proc/self/uid_map: Operation not permitted
> > > 
> > > Signed-off-by: Serge Hallyn 
> > > 
> > > Changelog:
> > >* fix logic in the case of writing to another task's uid_map
> > >* rename 'ns' to 'map_ns', and make a file_ns local variable
> > >* use /* comments */
> > >* update the CAP_SETFCAP comment in capability.h
> > >* rename parent_unpriv to parent_can_setfcap (and reverse the
> > >  logic)
> > >* remove printks
> > >* clarify (i hope) the code comments
> > >* update capability.h comment
> > >* renamed parent_can_setfcap to parent_could_setfcap
> > >* made the check its own disallowed_0_mapping() fn
> > >* moved the check into new_idmap_permitted
> > > ---
> > 
> > Thank you for working on this fix!
> > 
> > I do prefer your approach of doing the check at user namespace creation
> > time instead of moving it into the setxattr() codepath.
> > 
> > Let me reiterate that the ability to write through fscaps is a valid
> > usecase and this should continue to work but that for locked down user
> > namespace as Andrew wants to use them your patch provides a clean
> > solution.
> > We are using identity mappings in quite a few scenarios partially
> > when performing tests but also to write through fscaps.
> > We also had reports of users that use identity mappings. They create
> > their rootfs by running image extraction in an identity mapped userns
> > where fscaps are written through.
> > Podman has use-cases for this feature as well and has been affected by
> > the regression of the first fix.
> 
> Thanks for reviewing.
> 
> I'm not sure what your point above is, so just to make sure - the
> alternative implementation also does allow fscaps for cases where
> root uid is remapped, only disallowing it if it would violate the
> ancestor's lack of cap_setfcap.
> 
> 
> > >

Re: [RFC PATCH] capabilities: require CAP_SETFCAP to map uid 0 (v3)

2021-04-16 Thread Serge E. Hallyn
On Fri, Apr 16, 2021 at 05:05:01PM +0200, Christian Brauner wrote:
> On Thu, Apr 15, 2021 at 11:58:51PM -0500, Serge Hallyn wrote:
> > (Eric - this patch (v3) is a cleaned up version of the previous approach.
> > v4 is at 
> > https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4
> > and is the approach you suggested.  I can send it also as a separate patch
> > if you like)
> > 
> > A process running as uid 0 but without cap_setfcap currently can simply
> > unshare a new user namespace with uid 0 mapped to 0.  While this task
> > will not have new capabilities against the parent namespace, there is
> > a loophole due to the way namespaced file capabilities work.  File
> > capabilities valid in userns 1 are distinguished from file capabilities
> > valid in userns 2 by the kuid which underlies uid 0.  Therefore
> > the restricted root process can unshare a new self-mapping namespace,
> > add a namespaced file capability onto a file, then use that file
> > capability in the parent namespace.
> > 
> > To prevent that, do not allow mapping uid 0 if the process which
> > opened the uid_map file does not have CAP_SETFCAP, which is the capability
> > for setting file capabilities.
> > 
> > A further wrinkle:  a task can unshare its user namespace, then
> > open its uid_map file itself, and map (only) its own uid.  In this
> > case we do not have the credential from before unshare,  which was
> > potentially more restricted.  So, when creating a user namespace, we
> > record whether the creator had CAP_SETFCAP.  Then we can use that
> > during map_write().
> > 
> > With this patch:
> > 
> > 1. unprivileged user can still unshare -Ur
> > 
> > ubuntu@caps:~$ unshare -Ur
> > root@caps:~# logout
> > 
> > 2. root user can still unshare -Ur
> > 
> > ubuntu@caps:~$ sudo bash
> > root@caps:/home/ubuntu# unshare -Ur
> > root@caps:/home/ubuntu# logout
> > 
> > 3. root user without CAP_SETFCAP cannot unshare -Ur:
> > 
> > root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap --
> > root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap
> > unable to set CAP_SETFCAP effective capability: Operation not permitted
> > root@caps:/home/ubuntu# unshare -Ur
> > unshare: write failed /proc/self/uid_map: Operation not permitted
> > 
> > Signed-off-by: Serge Hallyn 
> > 
> > Changelog:
> >* fix logic in the case of writing to another task's uid_map
> >* rename 'ns' to 'map_ns', and make a file_ns local variable
> >* use /* comments */
> >* update the CAP_SETFCAP comment in capability.h
> >* rename parent_unpriv to parent_can_setfcap (and reverse the
> >  logic)
> >* remove printks
> >* clarify (i hope) the code comments
> >* update capability.h comment
> >* renamed parent_can_setfcap to parent_could_setfcap
> >* made the check its own disallowed_0_mapping() fn
> >* moved the check into new_idmap_permitted
> > ---
> 
> Thank you for working on this fix!
> 
> I do prefer your approach of doing the check at user namespace creation
> time instead of moving it into the setxattr() codepath.
> 
> Let me reiterate that the ability to write through fscaps is a valid
> usecase and this should continue to work but that for locked down user
> namespace as Andrew wants to use them your patch provides a clean
> solution.
> We are using identity mappings in quite a few scenarios partially
> when performing tests but also to write through fscaps.
> We also had reports of users that use identity mappings. They create
> their rootfs by running image extraction in an identity mapped userns
> where fscaps are written through.
> Podman has use-cases for this feature as well and has been affected by
> the regression of the first fix.

Thanks for reviewing.

I'm not sure what your point above is, so just to make sure - the
alternative implementation also does allow fscaps for cases where
root uid is remapped, only disallowing it if it would violate the
ancestor's lack of cap_setfcap.


> >  include/linux/user_namespace.h  |  3 ++
> >  include/uapi/linux/capability.h |  3 +-
> >  kernel/user_namespace.c | 61 +++--
> >  3 files changed, 63 insertions(+), 4 deletions(-)
> > 
> > diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> > index 64cf8ebdc4ec..f6c5f784be5a 100644
> > --- a/include/linux/user_namespace.h
> > +++ b/include/linux/user_namespace.h
> > @@ -63,6 +63,9 @@ struct user_namespace {
> > kgid_t  group;
> > struct ns_common ns;
> > unsigned long   flags;
> > +   /* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP
> > +* in its effective capability set at the child ns creation time. */
> > +   bool parent_could_setfcap;
> >  
> >  #ifdef CONFIG_KEYS
> > /* List of joinable keyrings in this namespace.  Modification access of
> > diff --git a/include/uapi/linux/capability.h 
> > 

[RFC PATCH] capabilities: require CAP_SETFCAP to map uid 0 (v3)

2021-04-15 Thread Serge E. Hallyn
(Eric - this patch (v3) is a cleaned up version of the previous approach.
v4 is at 
https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4
and is the approach you suggested.  I can send it also as a separate patch
if you like)

A process running as uid 0 but without cap_setfcap currently can simply
unshare a new user namespace with uid 0 mapped to 0.  While this task
will not have new capabilities against the parent namespace, there is
a loophole due to the way namespaced file capabilities work.  File
capabilities valid in userns 1 are distinguished from file capabilities
valid in userns 2 by the kuid which underlies uid 0.  Therefore
the restricted root process can unshare a new self-mapping namespace,
add a namespaced file capability onto a file, then use that file
capability in the parent namespace.

To prevent that, do not allow mapping uid 0 if the process which
opened the uid_map file does not have CAP_SETFCAP, which is the capability
for setting file capabilities.

A further wrinkle:  a task can unshare its user namespace, then
open its uid_map file itself, and map (only) its own uid.  In this
case we do not have the credential from before unshare,  which was
potentially more restricted.  So, when creating a user namespace, we
record whether the creator had CAP_SETFCAP.  Then we can use that
during map_write().

With this patch:

1. unprivileged user can still unshare -Ur

ubuntu@caps:~$ unshare -Ur
root@caps:~# logout

2. root user can still unshare -Ur

ubuntu@caps:~$ sudo bash
root@caps:/home/ubuntu# unshare -Ur
root@caps:/home/ubuntu# logout

3. root user without CAP_SETFCAP cannot unshare -Ur:

root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap --
root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap
unable to set CAP_SETFCAP effective capability: Operation not permitted
root@caps:/home/ubuntu# unshare -Ur
unshare: write failed /proc/self/uid_map: Operation not permitted

Signed-off-by: Serge Hallyn 

Changelog:
   * fix logic in the case of writing to another task's uid_map
   * rename 'ns' to 'map_ns', and make a file_ns local variable
   * use /* comments */
   * update the CAP_SETFCAP comment in capability.h
   * rename parent_unpriv to parent_can_setfcap (and reverse the
 logic)
   * remove printks
   * clarify (i hope) the code comments
   * update capability.h comment
   * renamed parent_can_setfcap to parent_could_setfcap
   * made the check its own disallowed_0_mapping() fn
   * moved the check into new_idmap_permitted
---
 include/linux/user_namespace.h  |  3 ++
 include/uapi/linux/capability.h |  3 +-
 kernel/user_namespace.c | 61 +++--
 3 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 64cf8ebdc4ec..f6c5f784be5a 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -63,6 +63,9 @@ struct user_namespace {
kgid_t  group;
struct ns_common ns;
unsigned long   flags;
+   /* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP
+* in its effective capability set at the child ns creation time. */
+   bool parent_could_setfcap;
 
 #ifdef CONFIG_KEYS
/* List of joinable keyrings in this namespace.  Modification access of
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index c6ca33034147..2ddb4226cd23 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -335,7 +335,8 @@ struct vfs_ns_cap_data {
 
 #define CAP_AUDIT_CONTROL 30
 
-/* Set or remove capabilities on files */
+/* Set or remove capabilities on files.
+   Map uid=0 into a child user namespace. */
 
 #define CAP_SETFCAP 31
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index af612945a4d0..8c75028a9aae 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -106,6 +106,7 @@ int create_user_ns(struct cred *new)
if (!ns)
goto fail_dec;
 
+   ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
ret = ns_alloc_inum(&ns->ns);
if (ret)
goto fail_free;
@@ -841,6 +842,56 @@ static int sort_idmaps(struct uid_gid_map *map)
return 0;
 }
 
+/*
+ * If mapping uid 0, then file capabilities created by the new namespace will
+ * be effective in the parent namespace.  Adding file capabilities requires
+ * CAP_SETFCAP, which the child namespace will have, so creating such a
+ * mapping requires CAP_SETFCAP in the parent namespace.
+ */
+static bool disallowed_0_mapping(const struct file *file,
+struct user_namespace *map_ns,
+struct uid_gid_map *new_map)
+{
+   int idx;
+   bool zeromapping = false;
+   const struct user_namespace *file_ns = file->f_cred->user_ns;
+
+

Re: [PATCH] security: commoncap: clean up kernel-doc comments

2021-04-12 Thread Serge E. Hallyn
On Sun, Apr 11, 2021 at 05:55:28PM -0700, Randy Dunlap wrote:
> Fix kernel-doc notation in commoncap.c.
> 
> Use correct (matching) function name in comments as in code.
> Use correct function argument names in kernel-doc comments.
> Use kernel-doc's "Return:" format for function return values.
> 
> Fixes these kernel-doc warnings:
> 
> ../security/commoncap.c:1206: warning: expecting prototype for 
> cap_task_ioprio(). Prototype was for cap_task_setioprio() instead
> ../security/commoncap.c:1219: warning: expecting prototype for 
> cap_task_ioprio(). Prototype was for cap_task_setnice() instead
> 
> Signed-off-by: Randy Dunlap 
> Cc: Serge Hallyn 

Reviewed-by: Serge Hallyn 

thanks,
-serge

> Cc: James Morris 
> Cc: linux-security-mod...@vger.kernel.org
> ---
>  security/commoncap.c |   50 +++--
>  1 file changed, 33 insertions(+), 17 deletions(-)
> 
> --- linux-next-20210409.orig/security/commoncap.c
> +++ linux-next-20210409/security/commoncap.c
> @@ -50,7 +50,7 @@ static void warn_setuid_and_fcaps_mixed(
>  /**
>   * cap_capable - Determine whether a task has a particular effective 
> capability
>   * @cred: The credentials to use
> - * @ns:  The user namespace in which we need the capability
> + * @targ_ns:  The user namespace in which we need the capability
>   * @cap: The capability to check for
>   * @opts: Bitmask of options defined in include/linux/security.h
>   *
> @@ -289,7 +289,7 @@ int cap_capset(struct cred *new,
>   * affects the security markings on that inode, and if it is, should
>   * inode_killpriv() be invoked or the change rejected.
>   *
> - * Returns 1 if security.capability has a value, meaning inode_killpriv()
> + * Return: 1 if security.capability has a value, meaning inode_killpriv()
>   * is required, 0 otherwise, meaning inode_killpriv() is not required.
>   */
>  int cap_inode_need_killpriv(struct dentry *dentry)
> @@ -315,7 +315,7 @@ int cap_inode_need_killpriv(struct dentr
>   * permissions. On non-idmapped mounts or if permission checking is to be
>   * performed on the raw inode simply passs init_user_ns.
>   *
> - * Returns 0 if successful, -ve on error.
> + * Return: 0 if successful, -ve on error.
>   */
>  int cap_inode_killpriv(struct user_namespace *mnt_userns, struct dentry 
> *dentry)
>  {
> @@ -532,7 +532,7 @@ static bool validheader(size_t size, con
>   * permissions. On non-idmapped mounts or if permission checking is to be
>   * performed on the raw inode simply passs init_user_ns.
>   *
> - * If all is ok, we return the new size, on error return < 0.
> + * Return: On success, return the new size; on error, return < 0.
>   */
>  int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry 
> *dentry,
> const void **ivalue, size_t size)
> @@ -881,7 +881,9 @@ static inline bool nonroot_raised_pE(str
>   *
>   * Set up the proposed credentials for a new execution context being
>   * constructed by execve().  The proposed creds in @bprm->cred is altered,
> - * which won't take effect immediately.  Returns 0 if successful, -ve on 
> error.
> + * which won't take effect immediately.
> + *
> + * Return: 0 if successful, -ve on error.
>   */
>  int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file)
>  {
> @@ -1117,7 +1119,9 @@ static inline void cap_emulate_setxuid(s
>   * @flags: Indications of what has changed
>   *
>   * Fix up the results of setuid() call before the credential changes are
> - * actually applied, returning 0 to grant the changes, -ve to deny them.
> + * actually applied.
> + *
> + * Return: 0 to grant the changes, -ve to deny them.
>   */
>  int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
>  {
> @@ -1187,7 +1191,9 @@ static int cap_safe_nice(struct task_str
>   * @p: The task to affect
>   *
>   * Detemine if the requested scheduler policy change is permitted for the
> - * specified task, returning 0 if permission is granted, -ve if denied.
> + * specified task.
> + *
> + * Return: 0 if permission is granted, -ve if denied.
>   */
>  int cap_task_setscheduler(struct task_struct *p)
>  {
> @@ -1195,12 +1201,14 @@ int cap_task_setscheduler(struct task_st
>  }
>  
>  /**
> - * cap_task_ioprio - Detemine if I/O priority change is permitted
> + * cap_task_setioprio - Detemine if I/O priority change is permitted
>   * @p: The task to affect
>   * @ioprio: The I/O priority to set
>   *
>   * Detemine if the requested I/O priority change is permitted for the 
> specified
> - * task, returning 0 if permission is granted, -ve if denied.
> + * task.
> + *
> + * Return: 0 if permission is granted, -ve if denied.
>   */
>  int cap_task_setioprio(struct task_struct *p, int ioprio)
>  {
> @@ -1208,12 +1216,14 @@ int cap_task_setioprio(struct task_struc
>  }
>  
>  /**
> - * cap_task_ioprio - Detemine if task priority change is permitted
> + * cap_task_setnice - Detemine if task priority change is permitted
>   * @p: The task to affect
>   * @nice: 

Re: [PATCH] kernel: automatically split user namespace extent

2021-04-02 Thread Serge E. Hallyn
On Wed, Dec 02, 2020 at 05:12:27PM +0100, Giuseppe Scrivano wrote:
> Hi Eric,
> 
> ebied...@xmission.com (Eric W. Biederman) writes:
> 
> > Nit: The tag should have been "userns:" rather than kernel.
> >
> > Giuseppe Scrivano  writes:
> >
> >> writing to the id map fails when an extent overlaps multiple mappings
> >> in the parent user namespace, e.g.:
> >>
> >> $ cat /proc/self/uid_map
> >>  0   1000  1
> >>  1 10  65536
> >> $ unshare -U sleep 100 &
> >> [1] 1029703
> >> $ printf "0 0 100\n" | tee /proc/$!/uid_map
> >> 0 0 100
> >> tee: /proc/1029703/uid_map: Operation not permitted
> >>
> >> To prevent it from happening, automatically split an extent so that
> >> each portion fits in one extent in the parent user namespace.
> >
> > I don't see anything fundamentally wrong with relaxing this
> > restriction, but more code does have more room for bugs to hide.
> >
> > What is the advantage of relaxing this restriction?
> 
> we are running rootless containers in a namespace created with
> newuidmap/newgidmap where the mappings look like:
> 
> $ cat /proc/self/uid_map
> 0   1000  1
> 1 11  65536
> 
> users are allowed to create child user namespaces and specify the
> mappings to use.  Doing so, they often hit the issue that the mappings
> cannot overlap multiple extents in the parent user namespace.
> 
> The issue could be completely addressed in user space, but to me it
> looks like an implementation detail that user space should not know
> about.
> In addition, it would also be slower (additional read of the current
> uid_map and gid_map files) and must be implemented separately in each
> container runtime.
> 
> >> $ cat /proc/self/uid_map
> >>  0   1000  1
> >>  1 11  65536
> >> $ unshare -U sleep 100 &
> >> [1] 1552
> >> $ printf "0 0 100\n" | tee /proc/$!/uid_map
> >> 0 0 100
> >> $ cat /proc/$!/uid_map
> >>  0  0  1
> >>  1  1 99
> >>
> >> Signed-off-by: Giuseppe Scrivano 
> >> ---
> >>  kernel/user_namespace.c | 62 ++---
> >>  1 file changed, 52 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> >> index 87804e0371fe..b5542be2bd0a 100644
> >> --- a/kernel/user_namespace.c
> >> +++ b/kernel/user_namespace.c
> >> @@ -706,6 +706,41 @@ const struct seq_operations 
> >> proc_projid_seq_operations = {
> >>.show = projid_m_show,
> >>  };
> >>  
> >> +static void split_overlapping_mappings(struct uid_gid_map *parent_map,
> >> + struct uid_gid_extent *extent,
> >> + struct uid_gid_extent *overflow_extent)
> >> +{
> >> +  unsigned int idx;
> >> +
> >> +  overflow_extent->first = (u32) -1;
> >> +
> >> +  /* Split extent if it not fully contained in an extent from parent_map. 
> >>  */
> >> +  for (idx = 0; idx < parent_map->nr_extents; idx++) {
> >
> > Ouch!
> >
> > For the larger tree we perform binary searches typically and
> > here you are walking every entry unconditionally.
> >
> > It looks like this makes the write O(N^2) from O(NlogN)
> > which for a user facing function is not desirable.
> >
> > I think something like insert_and_split_extent may be ok.
> > Incorporating your loop and the part that inserts an element.
> >
> > As written this almost doubles the complexity of the code,
> > as well as making it perform much worse.  Which is a problem.
> 
> I've attempted to implement the new functionality at input validation
> time to not touch the existing security checks.
> 
> I've thought the pattern for iterating the extents was fine as I've
> taken it from mappings_overlap (even if it is used differently on an
> unsorted array).
> 
> Thanks for the hint, I'll move the new logic when map_id_range_down() is
> used and I'll send a v2.

Hi,

sorry if I missed it.  Did you ever send a v2?


Re: [PATCH v1 3/3] KEYS: trusted: Introduce support for NXP CAAM-based trusted keys

2021-04-01 Thread Serge E. Hallyn
On Wed, Mar 24, 2021 at 09:14:02AM -0700, James Bottomley wrote:
> On Tue, 2021-03-23 at 14:07 -0400, Mimi Zohar wrote:
> > On Tue, 2021-03-23 at 17:35 +0100, Ahmad Fatoum wrote:
> > > Hello Horia,
> > > 
> > > On 21.03.21 21:48, Horia Geantă wrote:
> > > > On 3/16/2021 7:02 PM, Ahmad Fatoum wrote:
> > > > [...]
> > > > > +struct trusted_key_ops caam_trusted_key_ops = {
> > > > > + .migratable = 0, /* non-migratable */
> > > > > + .init = trusted_caam_init,
> > > > > + .seal = trusted_caam_seal,
> > > > > + .unseal = trusted_caam_unseal,
> > > > > + .exit = trusted_caam_exit,
> > > > > +};
> > > > caam has random number generation capabilities, so it's worth
> > > > using that
> > > > by implementing .get_random.
> > > 
> > > If the CAAM HWRNG is already seeding the kernel RNG, why not use
> > > the kernel's?
> > > 
> > > Makes for less code duplication IMO.
> > 
> > Using kernel RNG, in general, for trusted keys has been discussed
> > before.   Please refer to Dave Safford's detailed explanation for not
> > using it [1].
> > 
> > thanks,
> > 
> > Mimi
> > 
> > [1] 
> > https://lore.kernel.org/linux-integrity/bca04d5d9a3b764c9b7405bba4d4a3c035f2a...@alpmbapa12.e2k.ad.ge.com/
> 
> I still don't think relying on one source of randomness to be
> cryptographically secure is a good idea.  The fear of bugs in the
> kernel entropy pool is reasonable, but since it's widely used they're
> unlikely to persist very long.

I'm not sure I agree - remember
https://www.schneier.com/blog/archives/2008/05/random_number_b.html ?  You'd
surely expect that to have been found quickly.

>   Studies have shown that some TPMs
> (notably the chinese manufactured ones) have suspicious failures in
> their RNGs:
> 
> https://www.researchgate.net/publication/45934562_Benchmarking_the_True_Random_Number_Generator_of_TPM_Chips
> 
> And most cryptographers recommend using a TPM for entropy mixing rather
> than directly:
> 
> https://blog.cryptographyengineering.com/category/rngs/
> 
> The TPMFail paper also shows that in spite of NIST certification
> things can go wrong with a TPM:
> 
> https://tpm.fail/

In this thread I've seen argument over "which is better" and "which is user 
api",
but no one's mentioned fips.  Unfortunately, so long as kernel rng refuses to be
fips-friendly (cf https://lkml.org/lkml/2020/9/21/157), making CAAM based 
trusted
keys depend on kernel rng would make them impossible to use in fips certified
applications without a forked kernel.

So I definitely am in favor of a config or kernel command line option to drive
which rng to use.


Re: [PATCH v28 07/12] landlock: Support filesystem access-control

2021-02-19 Thread Serge E. Hallyn
On Wed, Feb 10, 2021 at 09:17:25PM +0100, Mickaël Salaün wrote:
> 
> On 10/02/2021 20:36, Serge E. Hallyn wrote:
> > On Tue, Feb 02, 2021 at 05:27:05PM +0100, Mickaël Salaün wrote:
> >> From: Mickaël Salaün 
> >>
> >> Thanks to the Landlock objects and ruleset, it is possible to identify
> >> inodes according to a process's domain.  To enable an unprivileged
> > 
> > This throws me off a bit.  "identify inodes according to a process's 
> > domain".
> > What exactly does it mean?  "identify" how ?
> 
> A domain is a set of rules (i.e. layers of rulesets) enforced on a set
> of threads. Inodes are tagged per domain (i.e. not system-wide) and
> actions are restricted thanks to these tags, which form rules. It means
> that the created access-controls are scoped to a set of threads.

Thanks, that's helpful.  To me it would be much clearer if you used the word
'tagged' :

  Using the Landlock objects and ruleset, it is possible to tag inodes
  according to a process's domain.

> >> process to express a file hierarchy, it first needs to open a directory
> >> (or a file) and pass this file descriptor to the kernel through
> >> landlock_add_rule(2).  When checking if a file access request is
> >> allowed, we walk from the requested dentry to the real root, following
> >> the different mount layers.  The access to each "tagged" inodes are
> >> collected according to their rule layer level, and ANDed to create
> >> access to the requested file hierarchy.  This makes possible to identify
> >> a lot of files without tagging every inodes nor modifying the
> >> filesystem, while still following the view and understanding the user
> >> has from the filesystem.
> >>
> >> Add a new ARCH_EPHEMERAL_INODES for UML because it currently does not
> >> keep the same struct inodes for the same inodes whereas these inodes are
> >> in use.
> > 
> > -serge
> > 


Re: [PATCH v28 07/12] landlock: Support filesystem access-control

2021-02-10 Thread Serge E. Hallyn
On Tue, Feb 02, 2021 at 05:27:05PM +0100, Mickaël Salaün wrote:
> From: Mickaël Salaün 
> 
> Thanks to the Landlock objects and ruleset, it is possible to identify
> inodes according to a process's domain.  To enable an unprivileged

This throws me off a bit.  "identify inodes according to a process's domain".
What exactly does it mean?  "identify" how ?

> process to express a file hierarchy, it first needs to open a directory
> (or a file) and pass this file descriptor to the kernel through
> landlock_add_rule(2).  When checking if a file access request is
> allowed, we walk from the requested dentry to the real root, following
> the different mount layers.  The access to each "tagged" inodes are
> collected according to their rule layer level, and ANDed to create
> access to the requested file hierarchy.  This makes possible to identify
> a lot of files without tagging every inodes nor modifying the
> filesystem, while still following the view and understanding the user
> has from the filesystem.
> 
> Add a new ARCH_EPHEMERAL_INODES for UML because it currently does not
> keep the same struct inodes for the same inodes whereas these inodes are
> in use.

-serge


Re: [PATCH v28 06/12] fs,security: Add sb_delete hook

2021-02-06 Thread Serge E. Hallyn
On Fri, Feb 05, 2021 at 03:57:37PM +0100, Mickaël Salaün wrote:
> 
> On 05/02/2021 15:21, Serge E. Hallyn wrote:
> > On Tue, Feb 02, 2021 at 05:27:04PM +0100, Mickaël Salaün wrote:
> >> From: Mickaël Salaün 
> >>
> >> The sb_delete security hook is called when shutting down a superblock,
> >> which may be useful to release kernel objects tied to the superblock's
> >> lifetime (e.g. inodes).
> >>
> >> This new hook is needed by Landlock to release (ephemerally) tagged
> >> struct inodes.  This comes from the unprivileged nature of Landlock
> >> described in the next commit.
> >>
> >> Cc: Al Viro 
> >> Cc: James Morris 
> >> Cc: Kees Cook 
> >> Cc: Serge E. Hallyn 
> > 
> > One note below, but
> > 
> > Acked-by: Serge Hallyn 
> > 
> >> Signed-off-by: Mickaël Salaün 
> >> Reviewed-by: Jann Horn 
> >> ---
> >>
> >> Changes since v22:
> >> * Add Reviewed-by: Jann Horn 
> >>
> >> Changes since v17:
> >> * Initial patch to replace the direct call to landlock_release_inodes()
> >>   (requested by James Morris).
> >>   
> >> https://lore.kernel.org/lkml/alpine.lrh.2.21.2005150536440.7...@namei.org/
> >> ---
> >>  fs/super.c| 1 +
> >>  include/linux/lsm_hook_defs.h | 1 +
> >>  include/linux/lsm_hooks.h | 2 ++
> >>  include/linux/security.h  | 4 
> >>  security/security.c   | 5 +
> >>  5 files changed, 13 insertions(+)
> >>
> >> diff --git a/fs/super.c b/fs/super.c
> >> index 2c6cdea2ab2d..c3c5178cde65 100644
> >> --- a/fs/super.c
> >> +++ b/fs/super.c
> >> @@ -454,6 +454,7 @@ void generic_shutdown_super(struct super_block *sb)
> >>evict_inodes(sb);
> >>/* only nonzero refcount inodes can have marks */
> >>fsnotify_sb_delete(sb);
> >> +  security_sb_delete(sb);
> >>  
> >>if (sb->s_dio_done_wq) {
> >>destroy_workqueue(sb->s_dio_done_wq);
> >> diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
> >> index 7aaa753b8608..32472b3849bc 100644
> >> --- a/include/linux/lsm_hook_defs.h
> >> +++ b/include/linux/lsm_hook_defs.h
> >> @@ -59,6 +59,7 @@ LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc,
> >>  LSM_HOOK(int, -ENOPARAM, fs_context_parse_param, struct fs_context *fc,
> >> struct fs_parameter *param)
> >>  LSM_HOOK(int, 0, sb_alloc_security, struct super_block *sb)
> >> +LSM_HOOK(void, LSM_RET_VOID, sb_delete, struct super_block *sb)
> >>  LSM_HOOK(void, LSM_RET_VOID, sb_free_security, struct super_block *sb)
> >>  LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts)
> >>  LSM_HOOK(int, 0, sb_eat_lsm_opts, char *orig, void **mnt_opts)
> >> diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
> >> index 970106d98306..e339b201f79b 100644
> >> --- a/include/linux/lsm_hooks.h
> >> +++ b/include/linux/lsm_hooks.h
> >> @@ -108,6 +108,8 @@
> >>   *allocated.
> >>   *@sb contains the super_block structure to be modified.
> >>   *Return 0 if operation was successful.
> >> + * @sb_delete:
> >> + *Release objects tied to a superblock (e.g. inodes).
> > 
> > It's customary here to add the line detailing the @sb argument.
> 
> What about "@sb contains the super_block structure being released."?

That's good.  Thanks.

> > 
> >>   * @sb_free_security:
> >>   *Deallocate and clear the sb->s_security field.
> >>   *@sb contains the super_block structure to be modified.
> >> diff --git a/include/linux/security.h b/include/linux/security.h
> >> index c35ea0ffccd9..c41a94e29b62 100644
> >> --- a/include/linux/security.h
> >> +++ b/include/linux/security.h
> >> @@ -288,6 +288,7 @@ void security_bprm_committed_creds(struct linux_binprm 
> >> *bprm);
> >>  int security_fs_context_dup(struct fs_context *fc, struct fs_context 
> >> *src_fc);
> >>  int security_fs_context_parse_param(struct fs_context *fc, struct 
> >> fs_parameter *param);
> >>  int security_sb_alloc(struct super_block *sb);
> >> +void security_sb_delete(struct super_block *sb);
> >>  void security_sb_free(struct super_block *sb);
> >>  void security_free_mnt_opts(void **mnt_opts);
> >>  int security_sb_eat_lsm_opts(char *options, void **mnt_opts);
> >> @@ -620,6 +621,9 @@ static inline int security_sb_alloc(struct super_block 
> >> *sb)
> >>return 0;
> >>  }
> >>  
> >> +static inline void security_sb_delete(struct super_block *sb)
> >> +{ }
> >> +
> >>  static inline void security_sb_free(struct super_block *sb)
> >>  { }
> >>  
> >> diff --git a/security/security.c b/security/security.c
> >> index 9f979d4afe6c..1b4a73b2549a 100644
> >> --- a/security/security.c
> >> +++ b/security/security.c
> >> @@ -900,6 +900,11 @@ int security_sb_alloc(struct super_block *sb)
> >>return rc;
> >>  }
> >>  
> >> +void security_sb_delete(struct super_block *sb)
> >> +{
> >> +  call_void_hook(sb_delete, sb);
> >> +}
> >> +
> >>  void security_sb_free(struct super_block *sb)
> >>  {
> >>call_void_hook(sb_free_security, sb);
> >> -- 
> >> 2.30.0


Re: [PATCH v28 05/12] LSM: Infrastructure management of the superblock

2021-02-05 Thread Serge E. Hallyn
On Tue, Feb 02, 2021 at 05:27:03PM +0100, Mickaël Salaün wrote:
> From: Casey Schaufler 
> 
> Move management of the superblock->sb_security blob out of the
> individual security modules and into the security infrastructure.
> Instead of allocating the blobs from within the modules, the modules
> tell the infrastructure how much space is required, and the space is
> allocated there.
> 
> Cc: Kees Cook 
> Cc: John Johansen 
> Signed-off-by: Casey Schaufler 
> Signed-off-by: Mickaël Salaün 
> Reviewed-by: Stephen Smalley 

Acked-by: Serge Hallyn 

I wonder how many out of tree modules this will impact :)  Actually
if some new incoming module does an rcu callback to free the
sb_security, then the security_sb_free will need an update, but
that seems unlikely.

> ---
> 
> Changes since v26:
> * Rebase on commit b159e86b5a2a ("selinux: drop super_block backpointer
>   from superblock_security_struct").  No change in the patch itself,
>   only a trivial conflict because of an updated nearby line in
>   selinux_set_mnt_opts() variable declarations.
> 
> Changes since v20:
> * Remove all Reviewed-by except Stephen Smalley:
>   
> https://lore.kernel.org/lkml/CAEjxPJ7ARJO57MBW66=xsBzMMRb=9ulgqock5eskhcaivmx...@mail.gmail.com/
> * Cosmetic fix in the commit message.
> 
> Changes since v17:
> * Rebase the original LSM stacking patch from v5.3 to v5.7: I fixed some
>   diff conflicts caused by code moves and function renames in
>   selinux/include/objsec.h and selinux/hooks.c .  I checked that it
>   builds but I didn't test the changes for SELinux nor SMACK.
>   https://lore.kernel.org/r/20190829232935.7099-2-ca...@schaufler-ca.com
> ---
>  include/linux/lsm_hooks.h |  1 +
>  security/security.c   | 46 
>  security/selinux/hooks.c  | 58 ---
>  security/selinux/include/objsec.h |  6 
>  security/selinux/ss/services.c|  3 +-
>  security/smack/smack.h|  6 
>  security/smack/smack_lsm.c| 35 +--
>  7 files changed, 85 insertions(+), 70 deletions(-)
> 
> diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
> index a19adef1f088..970106d98306 100644
> --- a/include/linux/lsm_hooks.h
> +++ b/include/linux/lsm_hooks.h
> @@ -1563,6 +1563,7 @@ struct lsm_blob_sizes {
>   int lbs_cred;
>   int lbs_file;
>   int lbs_inode;
> + int lbs_superblock;
>   int lbs_ipc;
>   int lbs_msg_msg;
>   int lbs_task;
> diff --git a/security/security.c b/security/security.c
> index 7b09cfbae94f..9f979d4afe6c 100644
> --- a/security/security.c
> +++ b/security/security.c
> @@ -203,6 +203,7 @@ static void __init lsm_set_blob_sizes(struct 
> lsm_blob_sizes *needed)
>   lsm_set_blob_size(&needed->lbs_inode, &blob_sizes.lbs_inode);
>   lsm_set_blob_size(&needed->lbs_ipc, &blob_sizes.lbs_ipc);
>   lsm_set_blob_size(&needed->lbs_msg_msg, &blob_sizes.lbs_msg_msg);
> + lsm_set_blob_size(&needed->lbs_superblock, &blob_sizes.lbs_superblock);
>   lsm_set_blob_size(&needed->lbs_task, &blob_sizes.lbs_task);
>  }
>  
> @@ -333,12 +334,13 @@ static void __init ordered_lsm_init(void)
>   for (lsm = ordered_lsms; *lsm; lsm++)
>   prepare_lsm(*lsm);
>  
> - init_debug("cred blob size = %d\n", blob_sizes.lbs_cred);
> - init_debug("file blob size = %d\n", blob_sizes.lbs_file);
> - init_debug("inode blob size= %d\n", blob_sizes.lbs_inode);
> - init_debug("ipc blob size  = %d\n", blob_sizes.lbs_ipc);
> - init_debug("msg_msg blob size  = %d\n", blob_sizes.lbs_msg_msg);
> - init_debug("task blob size = %d\n", blob_sizes.lbs_task);
> + init_debug("cred blob size   = %d\n", blob_sizes.lbs_cred);
> + init_debug("file blob size   = %d\n", blob_sizes.lbs_file);
> + init_debug("inode blob size  = %d\n", blob_sizes.lbs_inode);
> + init_debug("ipc blob size= %d\n", blob_sizes.lbs_ipc);
> + init_debug("msg_msg blob size= %d\n", blob_sizes.lbs_msg_msg);
> + init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock);
> + init_debug("task blob size   = %d\n", blob_sizes.lbs_task);
>  
>   /*
>* Create any kmem_caches needed for blobs
> @@ -670,6 +672,27 @@ static void __init lsm_early_task(struct task_struct 
> *task)
>   panic("%s: Early task alloc failed.\n", __func__);
>  }
>  
> +/**
> + * lsm_superblock_alloc - allocate a composite superblock blob
> + * @sb: the superblock that needs a blob
> + *
> + * Allocate the superblock blob for all the modules
> + *
> + * Returns 0, or -ENOMEM if memory can't be allocated.
> + */
> +static int lsm_superblock_alloc(struct super_block *sb)
> +{
> + if (blob_sizes.lbs_superblock == 0) {
> + sb->s_security = NULL;
> + return 0;
> + }
> +
> + sb->s_security = kzalloc(blob_sizes.lbs_superblock, GFP_KERNEL);
> + if (sb->s_security == NULL)
> + return -ENOMEM;
> + return 0;
> +}
> +
>  /*
>   * 

Re: [PATCH v28 06/12] fs,security: Add sb_delete hook

2021-02-05 Thread Serge E. Hallyn
On Tue, Feb 02, 2021 at 05:27:04PM +0100, Mickaël Salaün wrote:
> From: Mickaël Salaün 
> 
> The sb_delete security hook is called when shutting down a superblock,
> which may be useful to release kernel objects tied to the superblock's
> lifetime (e.g. inodes).
> 
> This new hook is needed by Landlock to release (ephemerally) tagged
> struct inodes.  This comes from the unprivileged nature of Landlock
> described in the next commit.
> 
> Cc: Al Viro 
> Cc: James Morris 
> Cc: Kees Cook 
> Cc: Serge E. Hallyn 

One note below, but

Acked-by: Serge Hallyn 

> Signed-off-by: Mickaël Salaün 
> Reviewed-by: Jann Horn 
> ---
> 
> Changes since v22:
> * Add Reviewed-by: Jann Horn 
> 
> Changes since v17:
> * Initial patch to replace the direct call to landlock_release_inodes()
>   (requested by James Morris).
>   https://lore.kernel.org/lkml/alpine.lrh.2.21.2005150536440.7...@namei.org/
> ---
>  fs/super.c| 1 +
>  include/linux/lsm_hook_defs.h | 1 +
>  include/linux/lsm_hooks.h | 2 ++
>  include/linux/security.h  | 4 
>  security/security.c   | 5 +
>  5 files changed, 13 insertions(+)
> 
> diff --git a/fs/super.c b/fs/super.c
> index 2c6cdea2ab2d..c3c5178cde65 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -454,6 +454,7 @@ void generic_shutdown_super(struct super_block *sb)
>   evict_inodes(sb);
>   /* only nonzero refcount inodes can have marks */
>   fsnotify_sb_delete(sb);
> + security_sb_delete(sb);
>  
>   if (sb->s_dio_done_wq) {
>   destroy_workqueue(sb->s_dio_done_wq);
> diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
> index 7aaa753b8608..32472b3849bc 100644
> --- a/include/linux/lsm_hook_defs.h
> +++ b/include/linux/lsm_hook_defs.h
> @@ -59,6 +59,7 @@ LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc,
>  LSM_HOOK(int, -ENOPARAM, fs_context_parse_param, struct fs_context *fc,
>struct fs_parameter *param)
>  LSM_HOOK(int, 0, sb_alloc_security, struct super_block *sb)
> +LSM_HOOK(void, LSM_RET_VOID, sb_delete, struct super_block *sb)
>  LSM_HOOK(void, LSM_RET_VOID, sb_free_security, struct super_block *sb)
>  LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts)
>  LSM_HOOK(int, 0, sb_eat_lsm_opts, char *orig, void **mnt_opts)
> diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
> index 970106d98306..e339b201f79b 100644
> --- a/include/linux/lsm_hooks.h
> +++ b/include/linux/lsm_hooks.h
> @@ -108,6 +108,8 @@
>   *   allocated.
>   *   @sb contains the super_block structure to be modified.
>   *   Return 0 if operation was successful.
> + * @sb_delete:
> + *   Release objects tied to a superblock (e.g. inodes).

It's customary here to add the line detailing the @sb argument.

>   * @sb_free_security:
>   *   Deallocate and clear the sb->s_security field.
>   *   @sb contains the super_block structure to be modified.
> diff --git a/include/linux/security.h b/include/linux/security.h
> index c35ea0ffccd9..c41a94e29b62 100644
> --- a/include/linux/security.h
> +++ b/include/linux/security.h
> @@ -288,6 +288,7 @@ void security_bprm_committed_creds(struct linux_binprm 
> *bprm);
>  int security_fs_context_dup(struct fs_context *fc, struct fs_context 
> *src_fc);
>  int security_fs_context_parse_param(struct fs_context *fc, struct 
> fs_parameter *param);
>  int security_sb_alloc(struct super_block *sb);
> +void security_sb_delete(struct super_block *sb);
>  void security_sb_free(struct super_block *sb);
>  void security_free_mnt_opts(void **mnt_opts);
>  int security_sb_eat_lsm_opts(char *options, void **mnt_opts);
> @@ -620,6 +621,9 @@ static inline int security_sb_alloc(struct super_block 
> *sb)
>   return 0;
>  }
>  
> +static inline void security_sb_delete(struct super_block *sb)
> +{ }
> +
>  static inline void security_sb_free(struct super_block *sb)
>  { }
>  
> diff --git a/security/security.c b/security/security.c
> index 9f979d4afe6c..1b4a73b2549a 100644
> --- a/security/security.c
> +++ b/security/security.c
> @@ -900,6 +900,11 @@ int security_sb_alloc(struct super_block *sb)
>   return rc;
>  }
>  
> +void security_sb_delete(struct super_block *sb)
> +{
> + call_void_hook(sb_delete, sb);
> +}
> +
>  void security_sb_free(struct super_block *sb)
>  {
>   call_void_hook(sb_free_security, sb);
> -- 
> 2.30.0


Re: [PATCH v28 04/12] landlock: Add ptrace restrictions

2021-02-05 Thread Serge E. Hallyn
On Tue, Feb 02, 2021 at 05:27:02PM +0100, Mickaël Salaün wrote:
> From: Mickaël Salaün 
> 
> Using ptrace(2) and related debug features on a target process can lead
> to a privilege escalation.  Indeed, ptrace(2) can be used by an attacker
> to impersonate another task and to remain undetected while performing
> malicious activities.  Thanks to ptrace_may_access(), various parts of
> the kernel can check if a tracer is more privileged than a tracee.
> 
> A landlocked process has fewer privileges than a non-landlocked process
> and must then be subject to additional restrictions when manipulating
> processes. To be allowed to use ptrace(2) and related syscalls on a
> target process, a landlocked process must have a subset of the target
> process's rules (i.e. the tracee must be in a sub-domain of the tracer).
> 
> Cc: James Morris 
> Cc: Kees Cook 
> Cc: Serge E. Hallyn 

Acked-by: Serge Hallyn 

Thanks, I appreciate that things are well named and easy to reason
about.

> Signed-off-by: Mickaël Salaün 
> Reviewed-by: Jann Horn 
> ---
> 
> Changes since v25:
> * Rename function to landlock_add_ptrace_hooks().
> 
> Changes since v22:
> * Add Reviewed-by: Jann Horn 
> 
> Changes since v21:
> * Fix copyright dates.
> 
> Changes since v14:
> * Constify variables.
> 
> Changes since v13:
> * Make the ptrace restriction mandatory, like in the v10.
> * Remove the eBPF dependency.
> 
> Previous changes:
> https://lore.kernel.org/lkml/20191104172146.30797-5-...@digikod.net/
> ---
>  security/landlock/Makefile |   2 +-
>  security/landlock/ptrace.c | 120 +
>  security/landlock/ptrace.h |  14 +
>  security/landlock/setup.c  |   2 +
>  4 files changed, 137 insertions(+), 1 deletion(-)
>  create mode 100644 security/landlock/ptrace.c
>  create mode 100644 security/landlock/ptrace.h
> 
> diff --git a/security/landlock/Makefile b/security/landlock/Makefile
> index 041ea242e627..f1d1eb72fa76 100644
> --- a/security/landlock/Makefile
> +++ b/security/landlock/Makefile
> @@ -1,4 +1,4 @@
>  obj-$(CONFIG_SECURITY_LANDLOCK) := landlock.o
>  
>  landlock-y := setup.o object.o ruleset.o \
> - cred.o
> + cred.o ptrace.o
> diff --git a/security/landlock/ptrace.c b/security/landlock/ptrace.c
> new file mode 100644
> index 000000000000..f55b82446de2
> --- /dev/null
> +++ b/security/landlock/ptrace.c
> @@ -0,0 +1,120 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Landlock LSM - Ptrace hooks
> + *
> + * Copyright © 2017-2020 Mickaël Salaün 
> + * Copyright © 2019-2020 ANSSI
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "common.h"
> +#include "cred.h"
> +#include "ptrace.h"
> +#include "ruleset.h"
> +#include "setup.h"
> +
> +/**
> + * domain_scope_le - Checks domain ordering for scoped ptrace
> + *
> + * @parent: Parent domain.
> + * @child: Potential child of @parent.
> + *
> + * Checks if the @parent domain is less or equal to (i.e. an ancestor, which
> + * means a subset of) the @child domain.
> + */
> +static bool domain_scope_le(const struct landlock_ruleset *const parent,
> + const struct landlock_ruleset *const child)
> +{
> + const struct landlock_hierarchy *walker;
> +
> + if (!parent)
> + return true;
> + if (!child)
> + return false;
> + for (walker = child->hierarchy; walker; walker = walker->parent) {
> + if (walker == parent->hierarchy)
> + /* @parent is in the scoped hierarchy of @child. */
> + return true;
> + }
> + /* There is no relationship between @parent and @child. */
> + return false;
> +}
> +
> +static bool task_is_scoped(const struct task_struct *const parent,
> + const struct task_struct *const child)
> +{
> + bool is_scoped;
> + const struct landlock_ruleset *dom_parent, *dom_child;
> +
> + rcu_read_lock();
> + dom_parent = landlock_get_task_domain(parent);
> + dom_child = landlock_get_task_domain(child);
> + is_scoped = domain_scope_le(dom_parent, dom_child);
> + rcu_read_unlock();
> + return is_scoped;
> +}
> +
> +static int task_ptrace(const struct task_struct *const parent,
> + const struct task_struct *const child)
> +{
> + /* Quick return for non-landlocked tasks. */
> + if (!landlocked(parent))
> + return 0;
> + if (task_is_scoped(parent, child))
> + return 0;
> + return -EPERM;
> +}
>

Re: [PATCH v28 03/12] landlock: Set up the security framework and manage credentials

2021-02-03 Thread Serge E. Hallyn
On Tue, Feb 02, 2021 at 05:27:01PM +0100, Mickaël Salaün wrote:
> From: Mickaël Salaün 
> 
> Process's credentials point to a Landlock domain, which is underneath
> implemented with a ruleset.  In the following commits, this domain is
> used to check and enforce the ptrace and filesystem security policies.
> A domain is inherited from a parent to its child the same way a thread
> inherits a seccomp policy.
> 
> Cc: James Morris 
> Cc: Kees Cook 
> Cc: Serge E. Hallyn 

Acked-by: Serge Hallyn 

> Signed-off-by: Mickaël Salaün 
> Reviewed-by: Jann Horn 
> ---
> 
> Changes since v25:
> * Rename function to landlock_add_cred_hooks().
> 
> Changes since v23:
> * Add an early check for the current domain in hook_cred_free() to avoid
>   superfluous call.
> * Cosmetic cleanup to make the code more readable.
> 
> Changes since v22:
> * Add Reviewed-by: Jann Horn 
> 
> Changes since v21:
> * Fix copyright dates.
> 
> Changes since v17:
> * Constify returned domain pointers from landlock_get_current_domain()
>   and landlock_get_task_domain() helpers.
> 
> Changes since v15:
> * Optimize landlocked() for current thread.
> * Display the greeting message when everything is initialized.
> 
> Changes since v14:
> * Uses pr_fmt from common.h .
> * Constify variables.
> * Remove useless NULL initialization.
> 
> Changes since v13:
> * totally get rid of the seccomp dependency
> * only keep credential management and LSM setup.
> 
> Previous changes:
> https://lore.kernel.org/lkml/20191104172146.30797-4-...@digikod.net/
> ---
>  security/Kconfig   | 10 +++
>  security/landlock/Makefile |  3 +-
>  security/landlock/common.h | 20 +
>  security/landlock/cred.c   | 46 ++
>  security/landlock/cred.h   | 58 ++
>  security/landlock/setup.c  | 31 
>  security/landlock/setup.h  | 16 +++
>  7 files changed, 178 insertions(+), 6 deletions(-)
>  create mode 100644 security/landlock/common.h
>  create mode 100644 security/landlock/cred.c
>  create mode 100644 security/landlock/cred.h
>  create mode 100644 security/landlock/setup.c
>  create mode 100644 security/landlock/setup.h
> 
> diff --git a/security/Kconfig b/security/Kconfig
> index 15a4342b5d01..0ced7fd33e4d 100644
> --- a/security/Kconfig
> +++ b/security/Kconfig
> @@ -278,11 +278,11 @@ endchoice
>  
>  config LSM
>   string "Ordered list of enabled LSMs"
> - default 
> "lockdown,yama,loadpin,safesetid,integrity,smack,selinux,tomoyo,apparmor,bpf" 
> if DEFAULT_SECURITY_SMACK
> - default 
> "lockdown,yama,loadpin,safesetid,integrity,apparmor,selinux,smack,tomoyo,bpf" 
> if DEFAULT_SECURITY_APPARMOR
> - default "lockdown,yama,loadpin,safesetid,integrity,tomoyo,bpf" if 
> DEFAULT_SECURITY_TOMOYO
> - default "lockdown,yama,loadpin,safesetid,integrity,bpf" if 
> DEFAULT_SECURITY_DAC
> - default 
> "lockdown,yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor,bpf"
> + default 
> "landlock,lockdown,yama,loadpin,safesetid,integrity,smack,selinux,tomoyo,apparmor,bpf"
>  if DEFAULT_SECURITY_SMACK
> + default 
> "landlock,lockdown,yama,loadpin,safesetid,integrity,apparmor,selinux,smack,tomoyo,bpf"
>  if DEFAULT_SECURITY_APPARMOR
> + default "landlock,lockdown,yama,loadpin,safesetid,integrity,tomoyo,bpf" 
> if DEFAULT_SECURITY_TOMOYO
> + default "landlock,lockdown,yama,loadpin,safesetid,integrity,bpf" if 
> DEFAULT_SECURITY_DAC
> + default 
> "landlock,lockdown,yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor,bpf"
>   help
> A comma-separated list of LSMs, in initialization order.
> Any LSMs left off this list will be ignored. This can be
> diff --git a/security/landlock/Makefile b/security/landlock/Makefile
> index d846eba445bb..041ea242e627 100644
> --- a/security/landlock/Makefile
> +++ b/security/landlock/Makefile
> @@ -1,3 +1,4 @@
>  obj-$(CONFIG_SECURITY_LANDLOCK) := landlock.o
>  
> -landlock-y := object.o ruleset.o
> +landlock-y := setup.o object.o ruleset.o \
> + cred.o
> diff --git a/security/landlock/common.h b/security/landlock/common.h
> new file mode 100644
> index 000000000000..5dc0fe15707d
> --- /dev/null
> +++ b/security/landlock/common.h
> @@ -0,0 +1,20 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Landlock LSM - Common constants and helpers
> + *
> + * Copyright © 2016-2020 Mickaël Salaün 
> + * Copyright © 2018-2020 ANSSI
> + */
> +
> +#ifndef _SECURITY_LANDLOCK_COMMON_H
> +#

Re: [PATCH v28 02/12] landlock: Add ruleset and domain management

2021-02-03 Thread Serge E. Hallyn
On Tue, Feb 02, 2021 at 05:27:00PM +0100, Mickaël Salaün wrote:
> From: Mickaël Salaün 
> 
> A Landlock ruleset is mainly a red-black tree with Landlock rules as
> nodes.  This enables quick update and lookup to match a requested
> access, e.g. to a file.  A ruleset is usable through a dedicated file
> descriptor (cf. following commit implementing syscalls) which enables a
> process to create and populate a ruleset with new rules.
> 
> A domain is a ruleset tied to a set of processes.  This group of rules
> defines the security policy enforced on these processes and their future
> children.  A domain can transition to a new domain which is the
> intersection of all its constraints and those of a ruleset provided by
> the current process.  This modification only impacts the current process.
> This means that a process can only gain more constraints (i.e. lose
> accesses) over time.
> 
> Cc: James Morris 
> Cc: Jann Horn 
> Cc: Kees Cook 
> Cc: Serge E. Hallyn 

Acked-by: Serge Hallyn 

> Signed-off-by: Mickaël Salaün 
> ---
> 
> Changes since v27:
> * Fix domains with layers of non-overlapping access rights.
> * Add stricter limit checks (same semantic).
> * Change the grow direction of a rule layer stack to make it the same as
>   the new ruleset fs_access_masks stack (cosmetic change).
> * Cosmetic fix for a comment block.
> 
> Changes since v26:
> * Fix spelling.
> 
> Changes since v25:
> * Add build-time checks for the num_layers and num_rules variables
>   according to LANDLOCK_MAX_NUM_LAYERS and LANDLOCK_MAX_NUM_RULES, and
>   move these limits to a dedicated file.
> * Cosmetic variable renames.
> 
> Changes since v24:
> * Update struct landlock_rule with a layer stack.  This reverts "Always
>   intersect access rights" from v24 and also adds the ability to tie
>   access rights with their policy layer.  As noted by Jann Horn, always
>   intersecting access rights made some use cases uselessly more
>   difficult to handle in user space.  Thanks to this new stack, we still
>   have a deterministic policy behavior whatever their level in the stack
>   of policies, while using a "union" of accesses when building a
>   ruleset.  The implementation uses a FAM to keep the access checks quick
>   and memory efficient (4 bytes per layer per inode).  Update
>   insert_rule() accordingly.
> 
> Changes since v23:
> * Always intersect access rights.  Following the filesystem change
>   logic, make ruleset updates more consistent by always intersecting
>   access rights (boolean AND) instead of combining them (boolean OR) for
>   the same layer.  This defensive approach could also help avoid user
>   space to inadvertently allow multiple access rights for the same
>   object (e.g.  write and execute access on a path hierarchy) instead of
>   dealing with such inconsistency.  This can happen when there is no
>   deduplication of objects (e.g. paths and underlying inodes) whereas
>   they get different access rights with landlock_add_rule(2).
> * Add extra checks to make sure that:
>   - there is always an (allocated) object in each used rules;
>   - when updating a ruleset with a new rule (i.e. not merging two
> rulesets), the ruleset doesn't contain multiple layers.
> * Hide merge parameter from the public landlock_insert_rule() API.  This
>   helps avoid misuse of this function.
> * Replace a remaining hardcoded 1 with SINGLE_DEPTH_NESTING.
> 
> Changes since v22:
> * Explicitly use RB_ROOT and SINGLE_DEPTH_NESTING (suggested by Jann
>   Horn).
> * Improve comments and fix spelling (suggested by Jann Horn).
> 
> Changes since v21:
> * Add and clean up comments.
> 
> Changes since v18:
> * Account rulesets to kmemcg.
> * Remove struct holes.
> * Cosmetic changes.
> 
> Changes since v17:
> * Move include/uapi/linux/landlock.h and _LANDLOCK_ACCESS_FS_* to a
>   following patch.
> 
> Changes since v16:
> * Allow enforcement of empty ruleset, which enables deny-all policies.
> 
> Changes since v15:
> * Replace layer_levels and layer_depth with a bitfield of layers, cf.
>   filesystem commit.
> * Rename the LANDLOCK_ACCESS_FS_{UNLINK,RMDIR} with
>   LANDLOCK_ACCESS_FS_REMOVE_{FILE,DIR} because it makes sense to use
>   them for the action of renaming a file or a directory, which may lead
>   to the removal of the source file or directory.  Removes the
>   LANDLOCK_ACCESS_FS_{LINK_TO,RENAME_FROM,RENAME_TO} which are now
>   replaced with LANDLOCK_ACCESS_FS_REMOVE_{FILE,DIR} and
>   LANDLOCK_ACCESS_FS_MAKE_* .
> * Update the documentation accordingly and highlight how the access
>   rights are taken into account.
> * Change nb_rules from atomic_t to u32 because it is not used anymore by
>   s

Re: [PATCH v28 08/12] landlock: Add syscall implementations

2021-02-03 Thread Serge E. Hallyn
On Tue, Feb 02, 2021 at 05:27:06PM +0100, Mickaël Salaün wrote:
> From: Mickaël Salaün 
> 
> These 3 system calls are designed to be used by unprivileged processes
> to sandbox themselves:
> * landlock_create_ruleset(2): Creates a ruleset and returns its file
>   descriptor.
> * landlock_add_rule(2): Adds a rule (e.g. file hierarchy access) to a
>   ruleset, identified by the dedicated file descriptor.
> * landlock_restrict_self(2): Enforces a ruleset on the calling thread
>   and its future children (similar to seccomp).  This syscall has the
>   same usage restrictions as seccomp(2): the caller must have the
>   no_new_privs attribute set or have CAP_SYS_ADMIN in the current user
>   namespace.

Was looking through the set for this :)  thanks.

> 
> All these syscalls have a "flags" argument (not currently used) to
> enable extensibility.
> 
> Here are the motivations for these new syscalls:
> * A sandboxed process may not have access to file systems, including
>   /dev, /sys or /proc, but it should still be able to add more
>   restrictions to itself.
> * Neither prctl(2) nor seccomp(2) (which was used in a previous version)
>   fit well with the current definition of a Landlock security policy.
> 
> All passed structs (attributes) are checked at build time to ensure that
> they don't contain holes and that they are aligned the same way for each
> architecture.
> 
> See the user and kernel documentation for more details (provided by a
> following commit):
> * Documentation/userspace-api/landlock.rst
> * Documentation/security/landlock.rst
> 
> Cc: Arnd Bergmann 
> Cc: James Morris 
> Cc: Jann Horn 
> Cc: Kees Cook 
> Cc: Serge E. Hallyn 

Acked-by: Serge Hallyn 

> Signed-off-by: Mickaël Salaün 
> ---
> 
> Changes since v27:
> * Forbid creation of rules with an empty allowed_access value because
>   they are now ignored (since v26) in path walks.
> * Rename landlock_enforce_ruleset_self(2) to landlock_restrict_self(2):
>   shorter and consistent with the two other syscalls (i.e. verb + direct
>   object).
> * Update ruleset access check according to the new access stack.
> * Improve landlock_add_rule(2) documentation.
> * Fix comment.
> * Remove Reviewed-by Jann Horn because of the above changes.
> 
> Changes since v26:
> * Rename landlock_enforce_ruleset_current(2) to
>   landlock_enforce_ruleset_self(2).  "current" makes sense for a kernel
>   developer, but much less from a user space developer stand point.
>   "self" is widely used to refer to the current task (e.g. /proc/self).
>   "current" may refer to temporal properties, which could be added later
>   to this syscall flags (cf. /proc/self/attr/{current,exec}).
> * Simplify build_check_abi().
> * Rename syscall.c to syscalls.c .
> * Use less ambiguous comments.
> * Fix spelling.
> 
> Changes since v25:
> * Revert build_check_abi() as non-inline to trigger a warning if it is
>   not called.
> * Use the new limit names.
> 
> Changes since v24:
> * Add Reviewed-by: Jann Horn 
> * Set build_check_abi() as inline.
> 
> Changes since v23:
> * Rewrite get_ruleset_from_fd() to please the 0-DAY CI Kernel Test
>   Service that reported an uninitialized variable (false positive):
>   
> https://lore.kernel.org/linux-security-module/202011101854.zgbwwusk-...@intel.com/
>   Anyway, it is cleaner like this.
> * Add a comment about E2BIG which can be returned by
>   landlock_enforce_ruleset_current(2) when there is no more room for
>   another stacked ruleset (i.e. domain).
> 
> Changes since v22:
> * Replace security_capable() with ns_capable_noaudit() (suggested by
>   Jann Horn) and explicitly return EPERM.
> * Fix landlock_enforce_ruleset_current(2)'s out_put_creds (spotted by
>   Jann Horn).
> * Add __always_inline to copy_min_struct_from_user() to make its
>   BUILD_BUG_ON() checks reliable (suggested by Jann Horn).
> * Simplify path assignation in get_path_from_fd() (suggested by Jann
>   Horn).
> * Fix spelling (spotted by Jann Horn).
> 
> Changes since v21:
> * Fix and improve comments.
> 
> Changes since v20:
> * Remove two arguments to landlock_enforce_ruleset(2) (requested by Arnd
>   Bergmann) and rename it to landlock_enforce_ruleset_current(2): remove
>   the enum landlock_target_type and the target file descriptor (not used
>   for now).  A ruleset can only be enforced on the current thread.
> * Remove the size argument in landlock_add_rule() (requested by Arnd
>   Bergmann).
> * Remove landlock_get_features(2) (suggested by Arnd Bergmann).
> * Simplify and rename copy_struct_if_any_from_user() to
>   copy_min_struct_from_user().
> * Rename "options" to "flags"

Re: [PATCH v28 01/12] landlock: Add object management

2021-02-03 Thread Serge E. Hallyn
On Tue, Feb 02, 2021 at 05:26:59PM +0100, Mickaël Salaün wrote:
> From: Mickaël Salaün 
> 
> A Landlock object enables to identify a kernel object (e.g. an inode).
> A Landlock rule is a set of access rights allowed on an object.  Rules
> are grouped in rulesets that may be tied to a set of processes (i.e.
> subjects) to enforce a scoped access-control (i.e. a domain).
> 
> Because Landlock's goal is to empower any process (especially
> unprivileged ones) to sandbox themselves, we cannot rely on a
> system-wide object identification such as file extended attributes.
> Indeed, we need innocuous, composable and modular access-controls.
> 
> The main challenge with these constraints is to identify kernel objects
> while this identification is useful (i.e. when a security policy makes
> use of this object).  But this identification data should be freed once
> no policy is using it.  This ephemeral tagging should not and may not be
> written in the filesystem.  We then need to manage the lifetime of a
> rule according to the lifetime of its objects.  To avoid a global lock,
> this implementation makes use of RCU and counters to safely reference
> objects.
> 
> A following commit uses this generic object management for inodes.
> 
> Cc: James Morris 
> Cc: Kees Cook 
> Cc: Serge E. Hallyn 

Acked-by: Serge Hallyn 

Just a few suggestions for the description below.

> Signed-off-by: Mickaël Salaün 
> Reviewed-by: Jann Horn 
> ---
> 
> Changes since v27:
> * Update Kconfig for landlock_restrict_self(2).
> * Cosmetic fixes: use 80 columns in Kconfig and align Makefile
>   declarations.
> 
> Changes since v26:
> * Update Kconfig for landlock_enforce_ruleset_self(2).
> * Fix spelling.
> 
> Changes since v24:
> * Fix typo in comment (spotted by Jann Horn).
> * Add Reviewed-by: Jann Horn 
> 
> Changes since v23:
> * Update landlock_create_object() to return error codes instead of NULL.
>   This helps error handling in callers.
> * When using make oldconfig with a previous configuration already
>   including the CONFIG_LSM variable, no question is asked to update its
>   content.  Update the Kconfig help to warn about LSM stacking
>   configuration.
> * Constify variable (spotted by Vincent Dagonneau).
> 
> Changes since v22:
> * Fix spelling (spotted by Jann Horn).
> 
> Changes since v21:
> * Update Kconfig help.
> * Clean up comments.
> 
> Changes since v18:
> * Account objects to kmemcg.
> 
> Changes since v14:
> * Simplify the object, rule and ruleset management at the expense of a
>   less aggressive memory freeing (contributed by Jann Horn, with
>   additional modifications):
>   - Remove object->list aggregating the rules tied to an object.
>   - Remove landlock_get_object(), landlock_drop_object(),
> {get,put}_object_cleaner() and landlock_rule_is_disabled().
>   - Rewrite landlock_put_object() to use a more simple mechanism
> (no tricky RCU).
>   - Replace enum landlock_object_type and landlock_release_object() with
> landlock_object_underops->release()
>   - Adjust unions and Sparse annotations.
>   Cf. 
> https://lore.kernel.org/lkml/cag48ez21ben0wl1bbmtiiu8j9jp5iewthowz4turuj+ki0y...@mail.gmail.com/
> * Merge struct landlock_rule into landlock_ruleset_elem to simplify the
>   rule management.
> * Constify variables.
> * Improve kernel documentation.
> * Cosmetic variable renames.
> * Remove the "default" in the Kconfig (suggested by Jann Horn).
> * Only use refcount_inc() through getter helpers.
> * Update Kconfig description.
> 
> Changes since v13:
> * New dedicated implementation, removing the need for eBPF.
> 
> Previous changes:
> https://lore.kernel.org/lkml/20190721213116.23476-6-...@digikod.net/
> ---
>  MAINTAINERS| 10 +
>  security/Kconfig   |  1 +
>  security/Makefile  |  2 +
>  security/landlock/Kconfig  | 21 +
>  security/landlock/Makefile |  3 ++
>  security/landlock/object.c | 67 
>  security/landlock/object.h | 91 ++
>  7 files changed, 195 insertions(+)
>  create mode 100644 security/landlock/Kconfig
>  create mode 100644 security/landlock/Makefile
>  create mode 100644 security/landlock/object.c
>  create mode 100644 security/landlock/object.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index d3e847f7f3dc..a0e57ade0524 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -9936,6 +9936,16 @@ F: net/core/sock_map.c
>  F:   net/ipv4/tcp_bpf.c
>  F:   net/ipv4/udp_bpf.c
>  
> +LANDLOCK SECURITY MODULE
> +M:   Mickaël Salaün 
> +L:   linux-security-mod...@vger.kernel.org
> +S:   Supported
> +W:   https://landlock

Re: [PATCH 2/2] security.capability: fix conversions on getxattr

2021-01-30 Thread Serge E. Hallyn
On Fri, Jan 29, 2021 at 04:55:29PM -0600, Eric W. Biederman wrote:
> "Serge E. Hallyn"  writes:
> 
> > On Thu, Jan 28, 2021 at 02:19:13PM -0600, Eric W. Biederman wrote:
> >> "Serge E. Hallyn"  writes:
> >> 
> >> > On Tue, Jan 19, 2021 at 07:34:49PM -0600, Eric W. Biederman wrote:
> >> >> Miklos Szeredi  writes:
> >> >> 
> >> >> > If a capability is stored on disk in v2 format 
> >> >> > cap_inode_getsecurity() will
> >> >> > currently return in v2 format unconditionally.
> >> >> >
> >> >> > This is wrong: v2 cap should be equivalent to a v3 cap with zero 
> >> >> > rootid,
> >> >> > and so the same conversions performed on it.
> >> >> >
> >> >> > If the rootid cannot be mapped v3 is returned unconverted.  Fix this 
> >> >> > so
> >> >> > that both v2 and v3 return -EOVERFLOW if the rootid (or the owner of 
> >> >> > the fs
> >> >> > user namespace in case of v2) cannot be mapped in the current user
> >> >> > namespace.
> >> >> 
> >> >> This looks like a good cleanup.
> >> >
> >> > Sorry, I'm not following.  Why is this a good cleanup?  Why should
> >> > the xattr be shown as faked v3 in this case?
> >> 
> >> If the reader is in &init_user_ns.  If the filesystem was mounted in a
> >> user namespace.   Then the reader loses the information that the
> >
> > Can you be more precise about "filesystem was mounted in a user namespace"?
> > Is this a FUSE thing, the fs is marked as being mounted in a non-init 
> > userns?
> > If that's a possible case, then yes that must be represented as v3.  Using
> > is_v2header() may be the simpler way to check for that, but the more 
> > accurate
> > check would be "is it v2 header and mounted by init_user_ns".
> 
> I think the filesystems currently relevant are fuse,overlayfs,ramfs,tmpfs.
> 
> > Basically yes, in as many cases as possible we want to just give a v2
> > cap because more userspace knows what to do with that, but a non-init-userns
> > mounted fs which provides a v2 fscap should have it represented as v3 cap
> > with rootid being the kuid that owns the userns.
> 
> That is the case that is being fixed in the patch.
> 
> > Or am I still thinking wrongly?  Wouldn't be entirely surprised :)
> 
> No you got it.

So then can we make faking a v3 gated on whether
sb->s_user_ns != &init_user_ns ?



Re: [PATCH 2/2] security.capability: fix conversions on getxattr

2021-01-29 Thread Serge E. Hallyn
On Fri, Jan 29, 2021 at 05:11:53PM -0600, Eric W. Biederman wrote:
> "Serge E. Hallyn"  writes:
> 
> > On Thu, Jan 28, 2021 at 08:44:26PM +0100, Miklos Szeredi wrote:
> >> On Thu, Jan 28, 2021 at 6:09 PM Serge E. Hallyn  wrote:
> >> >
> >> > On Tue, Jan 19, 2021 at 07:34:49PM -0600, Eric W. Biederman wrote:
> >> > > Miklos Szeredi  writes:
> >> > >
> >> > > > if (!rootid_owns_currentns(kroot)) {
> >> > > > -   kfree(tmpbuf);
> >> > > > -   return -EOPNOTSUPP;
> >> > > > +   size = -EOVERFLOW;
> >> >
> >> > Why this change?  Christian (cc:d) noticed that this is a user visible 
> >> > change.
> >> > Without this change, if you are in a userns which has different rootid, 
> >> > the
> >> > EOVERFLOW tells vfs_getxattr to fall back to __vfs_getxattr() and so you 
> >> > can
> >> > see the v3 capability with its rootid.
> >> >
> >> > With this change, you instead just get EOVERFLOW.
> >> 
> >> Why would the user want to see nonsense (in its own userns) rootid and
> >> what would it do with it?
> >
> > They would know that the data is there.
> 
> But an error of -EOVERFLOW still indicates data is there.
> You just don't get the data because it can not be represented.

Ok - and this happens *after* the check for whether the rootid maps
into the current ns.

That sounds reasonable, thanks.

> >> Please give an example where an untranslatable rootid would make any
> >> sense at all to the user.
> >
> > I may have accidentally, from init_user_ns, as uid 1000, set an
> > fscap with rootid 11 instead of 10, and wonder why the
> > cap is not working in the container where 10 is root.
> 
> Getting -EOVERFLOW when attempting to read the cap from inside
> the user namespace will immediately tell you what is wrong. The rootid
> does not map.
> 
> That is how all the non-mapping situations are handled.  Either
> -EOVERFLOW or returning INVALID_UID/the unmapped user id aka nobody.
> 
> The existing code is wrong because it returns a completely untranslated
> uid, which is completely non-sense.
> 
> An argument could be made for returning a rootid of 0xffffffff aka
> INVALID_UID in a v3 cap xattr when the rootid can not be mapped.  I
> think that is what we do with posix_acls that contain ids that don't
> map.  My sense is returning -EOVERFLOW inside the container and
> returning the v3 cap xattr outside the container will most quickly get
> the problem diagnosed, and will be the most likely to not cause
> problems.
> 
> If there is a good case for returning a v3 cap with rootid of 0xffffffff
> instead of -EOVERFLOW we can do that.  Right now I don't see anything
> that would be compelling in either direction.
> 
> Eric
> 
> 
> 


Re: [PATCH 2/2] security.capability: fix conversions on getxattr

2021-01-28 Thread Serge E. Hallyn
On Tue, Jan 19, 2021 at 07:34:49PM -0600, Eric W. Biederman wrote:
> Miklos Szeredi  writes:
> 
> > If a capability is stored on disk in v2 format cap_inode_getsecurity() will
> > currently return in v2 format unconditionally.
> >
> > This is wrong: v2 cap should be equivalent to a v3 cap with zero rootid,
> > and so the same conversions performed on it.
> >
> > If the rootid cannot be mapped v3 is returned unconverted.  Fix this so
> > that both v2 and v3 return -EOVERFLOW if the rootid (or the owner of the fs
> > user namespace in case of v2) cannot be mapped in the current user
> > namespace.
> 
> This looks like a good cleanup.

Sorry, I'm not following.  Why is this a good cleanup?  Why should
the xattr be shown as faked v3 in this case?

A separate question below.

> I do wonder how well this works with stacking.  In particular
> ovl_xattr_set appears to call vfs_getxattr without overriding the creds.
> What the purpose of that is I haven't quite figured out.  It looks like
> it is just a probe to see if an xattr is present so maybe it is ok.
> 
> Acked-by: "Eric W. Biederman" 
> 
> >
> > Signed-off-by: Miklos Szeredi 
> > ---
> >  security/commoncap.c | 67 
> >  1 file changed, 43 insertions(+), 24 deletions(-)
> >
> > diff --git a/security/commoncap.c b/security/commoncap.c
> > index baccd871..c9d99f8f4c82 100644
> > --- a/security/commoncap.c
> > +++ b/security/commoncap.c
> > @@ -371,10 +371,11 @@ int cap_inode_getsecurity(struct inode *inode, const 
> > char *name, void **buffer,
> >  {
> > int size, ret;
> > kuid_t kroot;
> > +   __le32 nsmagic, magic;
> > uid_t root, mappedroot;
> > char *tmpbuf = NULL;
> > struct vfs_cap_data *cap;
> > -   struct vfs_ns_cap_data *nscap;
> > +   struct vfs_ns_cap_data *nscap = NULL;
> > struct dentry *dentry;
> > struct user_namespace *fs_ns;
> >  
> > @@ -396,46 +397,61 @@ int cap_inode_getsecurity(struct inode *inode, const 
> > char *name, void **buffer,
> > fs_ns = inode->i_sb->s_user_ns;
> > cap = (struct vfs_cap_data *) tmpbuf;
> > if (is_v2header((size_t) ret, cap)) {
> > -   /* If this is sizeof(vfs_cap_data) then we're ok with the
> > -* on-disk value, so return that.  */
> > -   if (alloc)
> > -   *buffer = tmpbuf;
> > -   else
> > -   kfree(tmpbuf);
> > -   return ret;
> > -   } else if (!is_v3header((size_t) ret, cap)) {
> > -   kfree(tmpbuf);
> > -   return -EINVAL;
> > +   root = 0;
> > +   } else if (is_v3header((size_t) ret, cap)) {
> > +   nscap = (struct vfs_ns_cap_data *) tmpbuf;
> > +   root = le32_to_cpu(nscap->rootid);
> > +   } else {
> > +   size = -EINVAL;
> > +   goto out_free;
> > }
> >  
> > -   nscap = (struct vfs_ns_cap_data *) tmpbuf;
> > -   root = le32_to_cpu(nscap->rootid);
> > kroot = make_kuid(fs_ns, root);
> >  
> > /* If the root kuid maps to a valid uid in current ns, then return
> >  * this as a nscap. */
> > mappedroot = from_kuid(current_user_ns(), kroot);
> > if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
> > +   size = sizeof(struct vfs_ns_cap_data);
> > if (alloc) {
> > -   *buffer = tmpbuf;
> > +   if (!nscap) {
> > +   /* v2 -> v3 conversion */
> > +   nscap = kzalloc(size, GFP_ATOMIC);
> > +   if (!nscap) {
> > +   size = -ENOMEM;
> > +   goto out_free;
> > +   }
> > +   nsmagic = VFS_CAP_REVISION_3;
> > +   magic = le32_to_cpu(cap->magic_etc);
> > +   if (magic & VFS_CAP_FLAGS_EFFECTIVE)
> > +   nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
> > +   memcpy(&nscap->data, &cap->data, sizeof(__le32) 
> > * 2 * VFS_CAP_U32);
> > +   nscap->magic_etc = cpu_to_le32(nsmagic);
> > +   } else {
> > +   /* use allocated v3 buffer */
> > +   tmpbuf = NULL;
> > +   }
> > nscap->rootid = cpu_to_le32(mappedroot);
> > -   } else
> > -   kfree(tmpbuf);
> > -   return size;
> > +   *buffer = nscap;
> > +   }
> > +   goto out_free;
> > }
> >  
> > if (!rootid_owns_currentns(kroot)) {
> > -   kfree(tmpbuf);
> > -   return -EOPNOTSUPP;
> > +   size = -EOVERFLOW;

Why this change?  Christian (cc:d) noticed that this is a user visible change.
Without this change, if you are in a userns which has different rootid, the
EOVERFLOW tells vfs_getxattr to fall back to __vfs_getxattr() and so you can
see the v3 capability with its rootid.

With this change, you 

Re: [PATCH] fix namespaced fscaps when !CONFIG_SECURITY

2020-12-05 Thread Serge E. Hallyn
Oh, I see you'd changed it inline :)  Thanks

On Sat, Dec 05, 2020 at 11:40:00AM -0600, Serge E. Hallyn wrote:
> How odd - where did that come from?
> 
> James, I force-pushed that with corrected bugzilla link to
> 2020-11-29/fix-nscaps.  Sorry about that.
> 
> On Fri, Dec 04, 2020 at 07:58:14AM -0800, Andrew G. Morgan wrote:
> > The correct bug reference for this patch is:
> > 
> > https://bugzilla.kernel.org/show_bug.cgi?id=209689
> > 
> > Reviewed-by: Andrew G. Morgan 
> > 
> > On Mon, Nov 30, 2020 at 6:58 PM James Morris  wrote:
> > >
> > > On Sun, 29 Nov 2020, Serge E. Hallyn wrote:
> > >
> > > > Hi James,
> > > >
> > > > would you mind adding this to the security tree?  (You can cherrypick
> > > > from 
> > > > https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/commit/?h=2020-11-29/fix-nscaps
> > > >  )
> > >
> > > Sure.
> > >
> > > >
> > > > thanks,
> > > > -serge
> > > >
> > > > On Tue, Nov 17, 2020 at 08:09:59AM -0800, Andrew G. Morgan wrote:
> > > > > Signed-off-by: Andrew G. Morgan 
> > > > >
> > > > >
> > > > > On Tue, Nov 17, 2020 at 7:09 AM Serge E. Hallyn  
> > > > > wrote:
> > > > >
> > > > > > Namespaced file capabilities were introduced in 8db6c34f1dbc .
> > > > > > When userspace reads an xattr for a namespaced capability, a
> > > > > > virtualized representation of it is returned if the caller is
> > > > > > in a user namespace owned by the capability's owning rootid.
> > > > > > The function which performs this virtualization was not hooked
> > > > > > up if CONFIG_SECURITY=n.  Therefore in that case the original
> > > > > > xattr was shown instead of the virtualized one.
> > > > > >
> > > > > > To test this using libcap-bin (*1),
> > > > > >
> > > > > > $ v=$(mktemp)
> > > > > > $ unshare -Ur setcap cap_sys_admin-eip $v
> > > > > > $ unshare -Ur setcap -v cap_sys_admin-eip $v
> > > > > > /tmp/tmp.lSiIFRvt8Y: OK
> > > > > >
> > > > > > "setcap -v" verifies the values instead of setting them, and
> > > > > > will check whether the rootid value is set.  Therefore, with
> > > > > > this bug un-fixed, and with CONFIG_SECURITY=n, setcap -v will
> > > > > > fail:
> > > > > >
> > > > > > $ v=$(mktemp)
> > > > > > $ unshare -Ur setcap cap_sys_admin=eip $v
> > > > > > $ unshare -Ur setcap -v cap_sys_admin=eip $v
> > > > > > nsowner[got=1000, want=0],/tmp/tmp.HHDiOOl9fY differs in []
> > > > > >
> > > > > > Fix this bug by calling cap_inode_getsecurity() in
> > > > > > security_inode_getsecurity() instead of returning
> > > > > > -EOPNOTSUPP, when CONFIG_SECURITY=n.
> > > > > >
> > > > > > *1 - note, if libcap is too old for getcap to have the '-n'
> > > > > > option, then use verify-caps instead.
> > > > > >
> > > > > > Signed-off-by: Serge Hallyn 
> > > > > > Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1593431
> > > > > > Cc: Hervé Guillemet 
> > > > > > Cc: Andrew G. Morgan 
> > > > > > Cc: Casey Schaufler 
> > > > > > ---
> > > > > >  include/linux/security.h | 2 +-
> > > > > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > > > >
> > > > > > diff --git a/include/linux/security.h b/include/linux/security.h
> > > > > > index bc2725491560..39642626a707 100644
> > > > > > --- a/include/linux/security.h
> > > > > > +++ b/include/linux/security.h
> > > > > > @@ -869,7 +869,7 @@ static inline int security_inode_killpriv(struct
> > > > > > dentry *dentry)
> > > > > >
> > > > > >  static inline int security_inode_getsecurity(struct inode *inode, 
> > > > > > const
> > > > > > char *name, void **buffer, bool alloc)
> > > > > >  {
> > > > > > -   return -EOPNOTSUPP;
> > > > > > +   return cap_inode_getsecurity(inode, name, buffer, alloc);
> > > > > >  }
> > > > > >
> > > > > >  static inline int security_inode_setsecurity(struct inode *inode, 
> > > > > > const
> > > > > > char *name, const void *value, size_t size, int flags)
> > > > > > --
> > > > > > 2.25.1
> > > > > >
> > > > > >
> > > >
> > >
> > > --
> > > James Morris
> > > 


Re: [PATCH] fix namespaced fscaps when !CONFIG_SECURITY

2020-12-05 Thread Serge E. Hallyn
How odd - where did that come from?

James, I force-pushed that with corrected bugzilla link to
2020-11-29/fix-nscaps.  Sorry about that.

On Fri, Dec 04, 2020 at 07:58:14AM -0800, Andrew G. Morgan wrote:
> The correct bug reference for this patch is:
> 
> https://bugzilla.kernel.org/show_bug.cgi?id=209689
> 
> Reviewed-by: Andrew G. Morgan 
> 
> On Mon, Nov 30, 2020 at 6:58 PM James Morris  wrote:
> >
> > On Sun, 29 Nov 2020, Serge E. Hallyn wrote:
> >
> > > Hi James,
> > >
> > > would you mind adding this to the security tree?  (You can cherrypick
> > > from 
> > > https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/commit/?h=2020-11-29/fix-nscaps
> > >  )
> >
> > Sure.
> >
> > >
> > > thanks,
> > > -serge
> > >
> > > On Tue, Nov 17, 2020 at 08:09:59AM -0800, Andrew G. Morgan wrote:
> > > > Signed-off-by: Andrew G. Morgan 
> > > >
> > > >
> > > > On Tue, Nov 17, 2020 at 7:09 AM Serge E. Hallyn  
> > > > wrote:
> > > >
> > > > > Namespaced file capabilities were introduced in 8db6c34f1dbc .
> > > > > When userspace reads an xattr for a namespaced capability, a
> > > > > virtualized representation of it is returned if the caller is
> > > > > in a user namespace owned by the capability's owning rootid.
> > > > > The function which performs this virtualization was not hooked
> > > > > up if CONFIG_SECURITY=n.  Therefore in that case the original
> > > > > xattr was shown instead of the virtualized one.
> > > > >
> > > > > To test this using libcap-bin (*1),
> > > > >
> > > > > $ v=$(mktemp)
> > > > > $ unshare -Ur setcap cap_sys_admin-eip $v
> > > > > $ unshare -Ur setcap -v cap_sys_admin-eip $v
> > > > > /tmp/tmp.lSiIFRvt8Y: OK
> > > > >
> > > > > "setcap -v" verifies the values instead of setting them, and
> > > > > will check whether the rootid value is set.  Therefore, with
> > > > > this bug un-fixed, and with CONFIG_SECURITY=n, setcap -v will
> > > > > fail:
> > > > >
> > > > > $ v=$(mktemp)
> > > > > $ unshare -Ur setcap cap_sys_admin=eip $v
> > > > > $ unshare -Ur setcap -v cap_sys_admin=eip $v
> > > > > nsowner[got=1000, want=0],/tmp/tmp.HHDiOOl9fY differs in []
> > > > >
> > > > > Fix this bug by calling cap_inode_getsecurity() in
> > > > > security_inode_getsecurity() instead of returning
> > > > > -EOPNOTSUPP, when CONFIG_SECURITY=n.
> > > > >
> > > > > *1 - note, if libcap is too old for getcap to have the '-n'
> > > > > option, then use verify-caps instead.
> > > > >
> > > > > Signed-off-by: Serge Hallyn 
> > > > > Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1593431
> > > > > Cc: Hervé Guillemet 
> > > > > Cc: Andrew G. Morgan 
> > > > > Cc: Casey Schaufler 
> > > > > ---
> > > > >  include/linux/security.h | 2 +-
> > > > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/include/linux/security.h b/include/linux/security.h
> > > > > index bc2725491560..39642626a707 100644
> > > > > --- a/include/linux/security.h
> > > > > +++ b/include/linux/security.h
> > > > > @@ -869,7 +869,7 @@ static inline int security_inode_killpriv(struct
> > > > > dentry *dentry)
> > > > >
> > > > >  static inline int security_inode_getsecurity(struct inode *inode, 
> > > > > const
> > > > > char *name, void **buffer, bool alloc)
> > > > >  {
> > > > > -   return -EOPNOTSUPP;
> > > > > +   return cap_inode_getsecurity(inode, name, buffer, alloc);
> > > > >  }
> > > > >
> > > > >  static inline int security_inode_setsecurity(struct inode *inode, 
> > > > > const
> > > > > char *name, const void *value, size_t size, int flags)
> > > > > --
> > > > > 2.25.1
> > > > >
> > > > >
> > >
> >
> > --
> > James Morris
> > 


Re: [PATCH] fix namespaced fscaps when !CONFIG_SECURITY

2020-11-29 Thread Serge E. Hallyn
Hi James,

would you mind adding this to the security tree?  (You can cherrypick
from 
https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/commit/?h=2020-11-29/fix-nscaps
 )

thanks,
-serge

On Tue, Nov 17, 2020 at 08:09:59AM -0800, Andrew G. Morgan wrote:
> Signed-off-by: Andrew G. Morgan 
> 
> 
> On Tue, Nov 17, 2020 at 7:09 AM Serge E. Hallyn  wrote:
> 
> > Namespaced file capabilities were introduced in 8db6c34f1dbc .
> > When userspace reads an xattr for a namespaced capability, a
> > virtualized representation of it is returned if the caller is
> > in a user namespace owned by the capability's owning rootid.
> > The function which performs this virtualization was not hooked
> > up if CONFIG_SECURITY=n.  Therefore in that case the original
> > xattr was shown instead of the virtualized one.
> >
> > To test this using libcap-bin (*1),
> >
> > $ v=$(mktemp)
> > $ unshare -Ur setcap cap_sys_admin-eip $v
> > $ unshare -Ur setcap -v cap_sys_admin-eip $v
> > /tmp/tmp.lSiIFRvt8Y: OK
> >
> > "setcap -v" verifies the values instead of setting them, and
> > will check whether the rootid value is set.  Therefore, with
> > this bug un-fixed, and with CONFIG_SECURITY=n, setcap -v will
> > fail:
> >
> > $ v=$(mktemp)
> > $ unshare -Ur setcap cap_sys_admin=eip $v
> > $ unshare -Ur setcap -v cap_sys_admin=eip $v
> > nsowner[got=1000, want=0],/tmp/tmp.HHDiOOl9fY differs in []
> >
> > Fix this bug by calling cap_inode_getsecurity() in
> > security_inode_getsecurity() instead of returning
> > -EOPNOTSUPP, when CONFIG_SECURITY=n.
> >
> > *1 - note, if libcap is too old for getcap to have the '-n'
> > option, then use verify-caps instead.
> >
> > Signed-off-by: Serge Hallyn 
> > Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1593431
> > Cc: Hervé Guillemet 
> > Cc: Andrew G. Morgan 
> > Cc: Casey Schaufler 
> > ---
> >  include/linux/security.h | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/include/linux/security.h b/include/linux/security.h
> > index bc2725491560..39642626a707 100644
> > --- a/include/linux/security.h
> > +++ b/include/linux/security.h
> > @@ -869,7 +869,7 @@ static inline int security_inode_killpriv(struct
> > dentry *dentry)
> >
> >  static inline int security_inode_getsecurity(struct inode *inode, const
> > char *name, void **buffer, bool alloc)
> >  {
> > -   return -EOPNOTSUPP;
> > +   return cap_inode_getsecurity(inode, name, buffer, alloc);
> >  }
> >
> >  static inline int security_inode_setsecurity(struct inode *inode, const
> > char *name, const void *value, size_t size, int flags)
> > --
> > 2.25.1
> >
> >


[PATCH] fix namespaced fscaps when !CONFIG_SECURITY

2020-11-17 Thread Serge E. Hallyn
Namespaced file capabilities were introduced in 8db6c34f1dbc .
When userspace reads an xattr for a namespaced capability, a
virtualized representation of it is returned if the caller is
in a user namespace owned by the capability's owning rootid.
The function which performs this virtualization was not hooked
up if CONFIG_SECURITY=n.  Therefore in that case the original
xattr was shown instead of the virtualized one.

To test this using libcap-bin (*1),

$ v=$(mktemp)
$ unshare -Ur setcap cap_sys_admin-eip $v
$ unshare -Ur setcap -v cap_sys_admin-eip $v
/tmp/tmp.lSiIFRvt8Y: OK

"setcap -v" verifies the values instead of setting them, and
will check whether the rootid value is set.  Therefore, with
this bug un-fixed, and with CONFIG_SECURITY=n, setcap -v will
fail:

$ v=$(mktemp)
$ unshare -Ur setcap cap_sys_admin=eip $v
$ unshare -Ur setcap -v cap_sys_admin=eip $v
nsowner[got=1000, want=0],/tmp/tmp.HHDiOOl9fY differs in []

Fix this bug by calling cap_inode_getsecurity() in
security_inode_getsecurity() instead of returning
-EOPNOTSUPP, when CONFIG_SECURITY=n.

*1 - note, if libcap is too old for getcap to have the '-n'
option, then use verify-caps instead.

Signed-off-by: Serge Hallyn 
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1593431
Cc: Hervé Guillemet 
Cc: Andrew G. Morgan 
Cc: Casey Schaufler 
---
 include/linux/security.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/security.h b/include/linux/security.h
index bc2725491560..39642626a707 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -869,7 +869,7 @@ static inline int security_inode_killpriv(struct dentry 
*dentry)
 
 static inline int security_inode_getsecurity(struct inode *inode, const char 
*name, void **buffer, bool alloc)
 {
-   return -EOPNOTSUPP;
+   return cap_inode_getsecurity(inode, name, buffer, alloc);
 }
 
 static inline int security_inode_setsecurity(struct inode *inode, const char 
*name, const void *value, size_t size, int flags)
-- 
2.25.1



Re: [RFC PATCH 1/1] security: add fault injection capability

2020-10-15 Thread Serge E. Hallyn
On Thu, Oct 15, 2020 at 10:46:49AM +0000, Aleksandr Nogikh wrote:
> From: Aleksandr Nogikh 
> 
> Add a fault injection capability to call_int_hook macro. This will
> facilitate testing of fault tolerance of the code that invokes
> security hooks as well as the fault tolerance of the LSM
> implementations themselves.
> 
> Add a KConfig option (CONFIG_FAIL_LSM_HOOKS) that controls whether the
> capability is enabled. In order to enable configuration from the user
> space, add the standard debugfs entries for fault injection (if
> CONFIG_FAULT_INJECTION_DEBUG_FS is enabled).
> 
> Signed-off-by: Aleksandr Nogikh 
> ---
>  lib/Kconfig.debug   |  6 +
>  security/security.c | 53 ++---
>  2 files changed, 56 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index 08c82666cf3e..0c9913ebe1c1 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -1803,6 +1803,12 @@ config FAIL_MAKE_REQUEST
>   help
> Provide fault-injection capability for disk IO.
>  
> +config FAIL_LSM_HOOKS
> + bool "Fault-injection capability for LSM hooks"
> + depends on FAULT_INJECTION
> + help
> +   Provide fault-injection capability for LSM hooks.
> +
>  config FAIL_IO_TIMEOUT
>   bool "Fault-injection capability for faking disk interrupts"
>   depends on FAULT_INJECTION && BLOCK
> diff --git a/security/security.c b/security/security.c
> index 69ff6e2e2cd4..bd4dbe720098 100644
> --- a/security/security.c
> +++ b/security/security.c
> @@ -28,6 +28,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #define MAX_LSM_EVM_XATTR2
> @@ -669,6 +670,51 @@ static void __init lsm_early_task(struct task_struct 
> *task)
>   panic("%s: Early task alloc failed.\n", __func__);
>  }
>  
> +
> +#ifdef CONFIG_FAIL_LSM_HOOKS
> +
> +static struct {
> + struct fault_attr attr;
> + int retval;
> +} fail_lsm_hooks = {
> + .attr = FAULT_ATTR_INITIALIZER,
> + .retval = -EACCES
> +};
> +
> +static int __init setup_fail_lsm_hooks(char *str)
> +{
> + return setup_fault_attr(&fail_lsm_hooks.attr, str);
> +}
> +__setup("fail_lsm_hooks=", setup_fail_lsm_hooks);
> +
> +static int should_fail_lsm_hook(void)
> +{
> + return should_fail(&fail_lsm_hooks.attr, 1) ? fail_lsm_hooks.retval : 0;
> +}
> +
> +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
> +
> +static int __init fail_lsm_hooks_debugfs(void)
> +{
> + umode_t mode = S_IFREG | 0600;
> + struct dentry *dir;
> +
> + dir = fault_create_debugfs_attr("fail_lsm_hooks", NULL,
> + &fail_lsm_hooks.attr);
> + debugfs_create_u32("retval", mode, dir, &fail_lsm_hooks.retval);
> + return 0;
> +}
> +
> +late_initcall(fail_lsm_hooks_debugfs);
> +
> +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
> +
> +#else
> +
> +static inline int should_fail_lsm_hook(void) { return 0; }
> +
> +#endif /* CONFIG_FAIL_LSM_HOOKS */
> +
>  /*
>   * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and
>   * can be accessed with:
> @@ -707,16 +753,17 @@ static void __init lsm_early_task(struct task_struct 
> *task)
>   } while (0)
>  
>  #define call_int_hook(FUNC, IRC, ...) ({ \
> - int RC = IRC;   \
> - do {\
> + int RC = should_fail_lsm_hook();\

seeing 'should_fail' here, kind of out of context, would be confusing to
thousands of ppl reading the code and wondering why it should fail.  maybe
"inject_fail_lsm_hook()" ?

> + if (RC == 0) {  
> \
>   struct security_hook_list *P;   \
> + RC = IRC;   
> \
>   \
>   hlist_for_each_entry(P, &security_hook_heads.FUNC, list) { \
>   RC = P->hook.FUNC(__VA_ARGS__); \
>   if (RC != 0)\
>   break;  \
>   }   \
> - } while (0);\
> + }   \
>   RC; \
>  })
>  
> -- 
> 2.28.0.1011.ga647a8990f-goog


Re: LPC 2020 Hackroom Session: summary and next steps for isolated user namespaces

2020-10-15 Thread Serge E. Hallyn
On Tue, Oct 13, 2020 at 05:17:36PM +0200, Giuseppe Scrivano wrote:
> "Serge E. Hallyn"  writes:
> 
> > On Mon, Oct 12, 2020 at 07:05:10PM +0200, Giuseppe Scrivano wrote:
> >> Josh Triplett  writes:
> >> 
> >> > On Fri, Oct 09, 2020 at 11:26:06PM -0500, Serge E. Hallyn wrote:
> >> >> > 3. Find a way to allow setgroups() in a user namespace while keeping
> >> >> >in mind the case of groups used for negative access control.
> >> >> >This was suggested by Josh Triplett and Geoffrey Thomas. Their 
> >> >> > idea was to
> >> >> >investigate adding a prctl() to allow setgroups() to be called in 
> >> >> > a user
> >> >> >namespace at the cost of restricting paths to the most restrictive
> >> >> >permission. So if something is 0707 it needs to be treated as if 
> >> >> > it's 0000
> >> >> >even though the caller is not in its owning group which is used 
> >> >> > for negative
> >> >> >access control (how these new semantics will interact with ACLs 
> >> >> > will also
> >> >> >need to be looked into).
> >> >> 
> >> >> I should probably think this through more, but for this problem, would 
> >> >> it
> >> >> not suffice to add a new prevgroups grouplist to the struct cred, maybe
> >> >> struct group_info *locked_groups, and every time an unprivileged task 
> >> >> creates
> >> >> a new user namespace, add all its current groups to this list?
> >> >
> >> > So, effectively, you would be allowed to drop permissions, but
> >> > locked_groups would still be checked for restrictions?
> >> >
> >> > That seems like it'd introduce a new level of complexity (a new facet of
> >> > permission) to manage. Not opposed, but it does seem more complex than
> >> > just opting out of using groups for negative permissions.
> >> 
> >> I have played with something similar in the past.  At that time I've
> >> discussed it only privately with Eric and we agreed it wasn't worth the
> >> extra complexity:
> >> 
> >> https://github.com/giuseppe/linux/commit/7e0701b389c497472d11fab8570c153a414050af
> >
> > Hi, you linked the setgroups patch, but do you also have a link to the
> > attempt which you deemed was not worth it?
> 
> it was just part of a private discussion; but was 4 years ago so we can
> probably revisit and accept the additional complexity since setgroups()
> is still an issue with user namespaces.
> 
> 
> >> instead of a prctl, I've added a new mode to /proc/PID/setgroups that
> >> allows setgroups in a userns locking the current gids.
> >> 
> >> What do you think about using /proc/PID/setgroups instead of a new
> >> prctl()?
> >
> > It's better than not having it, but two concerns -
> >
> > 1. some userspace, especially testsuites, could become confused by the fact
> > that they can't drop groups no matter how hard they try, since these will 
> > all
> > still show up as regular groups.
> 
> I forgot to send a link to a second patch :-) that completes the feature:
> https://github.com/giuseppe/linux/commit/1c5fe726346b216293a527719e64f34e6297f0c2
> 
> When the new mode is used, the gids that are not known in the userns do
> not show up in userspace.

Ah, right - and of course those gids better not be mapped into the namespace :)

But so, this is the patch you said you agreed was not worth the extra
complexity?

> > 2. whereas in my lockgroups proposal, lock_groups would only be taken into 
> > account
> > for permission denial, this proposal would count for permission grants too. 
> >  This
> > means that if I have a group which is permitted to read /foo/topsecret, and 
> > I
> > start a program in a new user namespace expecting it to drop that 
> > permission,
> > I can't have that, right?  The new program, will always have that 
> > permission?
> 
> right.  The new mode I was working on cannot be used to drop granted 
> permissions.
> 
> Giuseppe


Re: LPC 2020 Hackroom Session: summary and next steps for isolated user namespaces

2020-10-15 Thread Serge E. Hallyn
On Wed, Oct 14, 2020 at 02:46:46PM -0500, Eric W. Biederman wrote:
> "Serge E. Hallyn"  writes:
> 
> > On Mon, Oct 12, 2020 at 12:01:09AM -0500, Eric W. Biederman wrote:
> >> Andy Lutomirski  writes:
> >> 
> >> > On Sun, Oct 11, 2020 at 1:53 PM Josh Triplett  
> >> > wrote:
> >> >>
> >> >> On Fri, Oct 09, 2020 at 11:26:06PM -0500, Serge E. Hallyn wrote:
> >> >> > > 3. Find a way to allow setgroups() in a user namespace while keeping
> >> >> > >in mind the case of groups used for negative access control.
> >> >> > >This was suggested by Josh Triplett and Geoffrey Thomas. Their 
> >> >> > > idea was to
> >> >> > >investigate adding a prctl() to allow setgroups() to be called 
> >> >> > > in a user
> >> >> > >namespace at the cost of restricting paths to the most 
> >> >> > > restrictive
> >> >> > >permission. So if something is 0707 it needs to be treated as if 
> >> >> > > it's 0000
> >> >> > >even though the caller is not in its owning group which is used 
> >> >> > > for negative
> >> >> > >access control (how these new semantics will interact with ACLs 
> >> >> > > will also
> >> >> > >need to be looked into).
> >> >> >
> >> >> > I should probably think this through more, but for this problem, 
> >> >> > would it
> >> >> > not suffice to add a new prevgroups grouplist to the struct cred, 
> >> >> > maybe
> >> >> > struct group_info *locked_groups, and every time an unprivileged task 
> >> >> > creates
> >> >> > a new user namespace, add all its current groups to this list?
> >> >>
> >> >> So, effectively, you would be allowed to drop permissions, but
> >> >> locked_groups would still be checked for restrictions?
> >> >>
> >> >> That seems like it'd introduce a new level of complexity (a new facet of
> >> >> permission) to manage. Not opposed, but it does seem more complex than
> >> >> just opting out of using groups for negative permissions.
> >
> > Yeah, it would, but I basically hoped that we could catch most of this at
> > e.g. generic_permission(), and/or we could introduce a helper which
> > automatically adds a check for permission denied from locked_groups, so
> > it shouldn't be too wide-spread.  If it does end up showing up all over
> > the place, then that's a good reason not to do this.
> >
> >> > Is there any context other than regular UNIX DAC in which groups can
> >> > act as negative permissions or is this literally just an issue for
> >> > files with a more restrictive group mode than other mode?
> >> 
> >> Just that.
> >> 
> >> The ideas kicked around in the conversation were some variant of having
> >> a sysctl that says "This system never uses groups for negative
> >> permissions".
> >> 
> >> It was also suggested that if the sysctl was set the permission
> >> checks would be altered such that even if someone tried to set a
> >> negative permission, the more liberal permissions of other would be used
> >> instead.
> >
> > So then this would touch all the same code points which the
> > locked_groups approach would have to touch?
> 
> No locked_groups would touch in_group_p and set_groups.  Especially what
> set_groups means in that context.  It would have to handle what happens
> when you start accumulating locked groups (because of multiple
> namespaces).  How you dedup locked groups etc.

Well since group_info is sorted, you should be able to do a pretty
simple and quick merge of current->locked_groups and
current->group_info.  I suppose we'd have to consider a nasty user who
is allocated 100k groups, sticks them all in groupinfo, then unshare
twice, locking the kernel up for awhile, but that user can already hurt
us.

> I was not able to convince myself that not being able to clear out
> groups that a user has when they create a user namespace won't cause
> other problems.  Especially as user namespaces had been in use for a
> while at that point.

The locked_groups would *only* be considered for negative acls, right?
You would not *grant* any perms based on them.  It seems like exactly
what you want.  If any user is denied perms on account of it, then that
was the 

Re: LPC 2020 Hackroom Session: summary and next steps for isolated user namespaces

2020-10-13 Thread Serge E. Hallyn
On Mon, Oct 12, 2020 at 07:05:10PM +0200, Giuseppe Scrivano wrote:
> Josh Triplett  writes:
> 
> > On Fri, Oct 09, 2020 at 11:26:06PM -0500, Serge E. Hallyn wrote:
> >> > 3. Find a way to allow setgroups() in a user namespace while keeping
> >> >in mind the case of groups used for negative access control.
> >> >This was suggested by Josh Triplett and Geoffrey Thomas. Their idea 
> >> > was to
> >> >investigate adding a prctl() to allow setgroups() to be called in a 
> >> > user
> >> >namespace at the cost of restricting paths to the most restrictive
> >> >permission. So if something is 0707 it needs to be treated as if it's 
> >> > 0000
> >> >even though the caller is not in its owning group which is used for 
> >> > negative
> >> >access control (how these new semantics will interact with ACLs will 
> >> > also
> >> >need to be looked into).
> >> 
> >> I should probably think this through more, but for this problem, would it
> >> not suffice to add a new prevgroups grouplist to the struct cred, maybe
> >> struct group_info *locked_groups, and every time an unprivileged task 
> >> creates
> >> a new user namespace, add all its current groups to this list?
> >
> > So, effectively, you would be allowed to drop permissions, but
> > locked_groups would still be checked for restrictions?
> >
> > That seems like it'd introduce a new level of complexity (a new facet of
> > permission) to manage. Not opposed, but it does seem more complex than
> > just opting out of using groups for negative permissions.
> 
> I have played with something similar in the past.  At that time I've
> discussed it only privately with Eric and we agreed it wasn't worth the
> extra complexity:
> 
> https://github.com/giuseppe/linux/commit/7e0701b389c497472d11fab8570c153a414050af

Hi, you linked the setgroups patch, but do you also have a link to the
attempt which you deemed was not worth it?

> instead of a prctl, I've added a new mode to /proc/PID/setgroups that
> allows setgroups in a userns locking the current gids.
> 
> What do you think about using /proc/PID/setgroups instead of a new
> prctl()?

It's better than not having it, but two concerns -

1. some userspace, especially testsuites, could become confused by the fact
that they can't drop groups no matter how hard they try, since these will all
still show up as regular groups.
2. whereas in my lockgroups proposal, lock_groups would only be taken into 
account
for permission denial, this proposal would count for permission grants too.  
This
means that if I have a group which is permitted to read /foo/topsecret, and I
start a program in a new user namespace expecting it to drop that permission,
I can't have that, right?  The new program, will always have that permission?


Re: LPC 2020 Hackroom Session: summary and next steps for isolated user namespaces

2020-10-12 Thread Serge E. Hallyn
On Mon, Oct 12, 2020 at 12:01:09AM -0500, Eric W. Biederman wrote:
> Andy Lutomirski  writes:
> 
> > On Sun, Oct 11, 2020 at 1:53 PM Josh Triplett  wrote:
> >>
> >> On Fri, Oct 09, 2020 at 11:26:06PM -0500, Serge E. Hallyn wrote:
> >> > > 3. Find a way to allow setgroups() in a user namespace while keeping
> >> > >in mind the case of groups used for negative access control.
> >> > >This was suggested by Josh Triplett and Geoffrey Thomas. Their idea 
> >> > > was to
> >> > >investigate adding a prctl() to allow setgroups() to be called in a 
> >> > > user
> >> > >namespace at the cost of restricting paths to the most restrictive
> >> > >permission. So if something is 0707 it needs to be treated as if 
> >> > > it's 0000
> >> > >even though the caller is not in its owning group which is used for 
> >> > > negative
> >> > >access control (how these new semantics will interact with ACLs 
> >> > > will also
> >> > >need to be looked into).
> >> >
> >> > I should probably think this through more, but for this problem, would it
> >> > not suffice to add a new prevgroups grouplist to the struct cred, maybe
> >> > struct group_info *locked_groups, and every time an unprivileged task 
> >> > creates
> >> > a new user namespace, add all its current groups to this list?
> >>
> >> So, effectively, you would be allowed to drop permissions, but
> >> locked_groups would still be checked for restrictions?
> >>
> >> That seems like it'd introduce a new level of complexity (a new facet of
> >> permission) to manage. Not opposed, but it does seem more complex than
> >> just opting out of using groups for negative permissions.

Yeah, it would, but I basically hoped that we could catch most of this at
e.g. generic_permission(), and/or we could introduce a helper which
automatically adds a check for permission denied from locked_groups, so
it shouldn't be too wide-spread.  If it does end up showing up all over
the place, then that's a good reason not to do this.

> > Is there any context other than regular UNIX DAC in which groups can
> > act as negative permissions or is this literally just an issue for
> > files with a more restrictive group mode than other mode?
> 
> Just that.
> 
> The ideas kicked around in the conversation were some variant of having
> a sysctl that says "This system never uses groups for negative
> permissions".
> 
> It was also suggested that if the sysctl was set the permission
> checks would be altered such that even if someone tried to set a
> negative permission, the more liberal permissions of other would be used
> instead.

So then this would touch all the same code points which the
locked_groups approach would have to touch?

> Given that creating /etc/subgid is effectively opting out of negative
> permissions already, having a sysctl that says that upfront feels like a
> very clean solution.
> 
> Eric

That feels like a cop-out to me.  If some young admin at Roxxon Corp decides
she needs to run a container, so installs subuid package and sets that sysctl,
how does she know whether or not some previous admin, who has since retired and
did not keep good docs, set things up so that a negative acl is keeping nginx
from reading some supersecret doc?

Now personally I'm not a great believer in the negative acls so I think the
above is a very unlikely scenario, but if we're going to worry about it, then
we should worry about it :)

"Click this button if no one has ever used feature X on this server"

-serge


Re: LPC 2020 Hackroom Session: summary and next steps for isolated user namespaces

2020-10-09 Thread Serge E. Hallyn
> 3. Find a way to allow setgroups() in a user namespace while keeping
>in mind the case of groups used for negative access control.
>This was suggested by Josh Triplett and Geoffrey Thomas. Their idea was to
>investigate adding a prctl() to allow setgroups() to be called in a user
>namespace at the cost of restricting paths to the most restrictive
>permission. So if something is 0707 it needs to be treated as if it's 0000
>even though the caller is not in its owning group which is used for 
> negative
>access control (how these new semantics will interact with ACLs will also
>need to be looked into).

I should probably think this through more, but for this problem, would it
not suffice to add a new prevgroups grouplist to the struct cred, maybe
struct group_info *locked_groups, and every time an unprivileged task creates
a new user namespace, add all its current groups to this list?


Re: [PATCH v2] block: grant IOPRIO_CLASS_RT to CAP_SYS_NICE

2020-08-24 Thread Serge E. Hallyn
On Mon, Aug 24, 2020 at 03:10:34PM -0700, Khazhismel Kumykov wrote:
> CAP_SYS_ADMIN is too broad, and ionice fits into CAP_SYS_NICE's grouping.
> 
> Retain CAP_SYS_ADMIN permission for backwards compatibility.
> 
> Signed-off-by: Khazhismel Kumykov 

Acked-by: Serge Hallyn 

> ---
>  block/ioprio.c  | 2 +-
>  include/uapi/linux/capability.h | 2 ++
>  2 files changed, 3 insertions(+), 1 deletion(-)
> 
> v2: fix embarrassing logic mistake
> diff --git a/block/ioprio.c b/block/ioprio.c
> index 77bcab11dce5..276496246fe9 100644
> --- a/block/ioprio.c
> +++ b/block/ioprio.c
> @@ -69,7 +69,7 @@ int ioprio_check_cap(int ioprio)
>  
>   switch (class) {
>   case IOPRIO_CLASS_RT:
> - if (!capable(CAP_SYS_ADMIN))
> + if (!capable(CAP_SYS_NICE) && !capable(CAP_SYS_ADMIN))
>   return -EPERM;
>   /* fall through */
>   /* rt has prio field too */
> diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
> index 395dd0df8d08..c6ca33034147 100644
> --- a/include/uapi/linux/capability.h
> +++ b/include/uapi/linux/capability.h
> @@ -288,6 +288,8 @@ struct vfs_ns_cap_data {
> processes and setting the scheduling algorithm used by another
> process. */
>  /* Allow setting cpu affinity on other processes */
> +/* Allow setting realtime ioprio class */
> +/* Allow setting ioprio class on other processes */
>  
>  #define CAP_SYS_NICE 23
>  
> -- 
> 2.28.0.297.g1956fa8f8d-goog


Re: [PATCH v6 6/7] prctl: exe link permission error changed from -EINVAL to -EPERM

2020-07-19 Thread Serge E. Hallyn
On Sun, Jul 19, 2020 at 12:04:16PM +0200, Adrian Reber wrote:
> From: Nicolas Viennot 
> 
> This brings consistency with the rest of the prctl() syscall where
> -EPERM is returned when failing a capability check.
> 
> Signed-off-by: Nicolas Viennot 
> Signed-off-by: Adrian Reber 

Ok, I see how EINVAL snuck its way in there through validate_prctl_map()'s
evolution :)

Reviewed-by: Serge Hallyn 

> ---
>  kernel/sys.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/kernel/sys.c b/kernel/sys.c
> index a3f4ef0bbda3..ca11af9d815d 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2015,7 +2015,7 @@ static int prctl_set_mm_map(int opt, const void __user 
> *addr, unsigned long data
>* This may have implications in the tomoyo subsystem.
>*/
>   if (!checkpoint_restore_ns_capable(current_user_ns()))
> - return -EINVAL;
> + return -EPERM;
>  
>   error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
>   if (error)
> -- 
> 2.26.2


Re: [PATCH v6 4/7] proc: allow access in init userns for map_files with CAP_CHECKPOINT_RESTORE

2020-07-19 Thread Serge E. Hallyn
On Sun, Jul 19, 2020 at 12:04:14PM +0200, Adrian Reber wrote:
> Opening files in /proc/pid/map_files when the current user is
> CAP_CHECKPOINT_RESTORE capable in the root namespace is useful for
> checkpointing and restoring to recover files that are unreachable via
> the file system such as deleted files, or memfd files.
> 
> Signed-off-by: Adrian Reber 

Reviewed-by: Serge Hallyn 

> Signed-off-by: Nicolas Viennot 
> Reviewed-by: Cyrill Gorcunov 
> ---
>  fs/proc/base.c | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 65893686d1f1..b824a8c89011 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2194,16 +2194,16 @@ struct map_files_info {
>  };
>  
>  /*
> - * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how 
> the
> - * symlinks may be used to bypass permissions on ancestor directories in the
> - * path to the file in question.
> + * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, 
> due
> + * to concerns about how the symlinks may be used to bypass permissions on
> + * ancestor directories in the path to the file in question.
>   */
>  static const char *
>  proc_map_files_get_link(struct dentry *dentry,
>   struct inode *inode,
>   struct delayed_call *done)
>  {
> - if (!capable(CAP_SYS_ADMIN))
> + if (!checkpoint_restore_ns_capable(&init_user_ns))
>   return ERR_PTR(-EPERM);
>  
>   return proc_pid_get_link(dentry, inode, done);
> -- 
> 2.26.2


Re: [PATCH v5 0/6] capabilities: Introduce CAP_CHECKPOINT_RESTORE

2020-07-17 Thread Serge E. Hallyn
On Wed, Jul 15, 2020 at 04:49:48PM +0200, Adrian Reber wrote:
> This is v5 of the 'Introduce CAP_CHECKPOINT_RESTORE' patchset. The
> changes to v4 are:
> 
>  * split into more patches to have the introduction of
>CAP_CHECKPOINT_RESTORE and the actual usage in different
>patches
>  * reduce the /proc/self/exe patch to only be about
>CAP_CHECKPOINT_RESTORE
> 
> Adrian Reber (5):
>   capabilities: Introduce CAP_CHECKPOINT_RESTORE
>   pid: use checkpoint_restore_ns_capable() for set_tid
>   pid_namespace: use checkpoint_restore_ns_capable() for ns_last_pid
>   proc: allow access in init userns for map_files with CAP_CHECKPOINT_RESTORE
>   selftests: add clone3() CAP_CHECKPOINT_RESTORE test
> 
> Nicolas Viennot (1):
>   prctl: Allow checkpoint/restore capable processes to change exe link

(This is probably bad form, but)  All

Reviewed-by: Serge Hallyn 

Assuming you change patches 4 and 6 per Christian's suggestions,
I'd like to re-review those then.

> 
>  fs/proc/base.c|   8 +-
>  include/linux/capability.h|   6 +
>  include/uapi/linux/capability.h   |   9 +-
>  kernel/pid.c  |   2 +-
>  kernel/pid_namespace.c|   2 +-
>  kernel/sys.c  |  12 +-
>  security/selinux/include/classmap.h   |   5 +-
>  tools/testing/selftests/clone3/Makefile   |   4 +-
>  .../clone3/clone3_cap_checkpoint_restore.c| 203 ++
>  9 files changed, 236 insertions(+), 15 deletions(-)
>  create mode 100644 
> tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c
> 
> 
> base-commit: d31958b30ea3b7b6e522d6bf449427748ad45822
> -- 
> 2.26.2


Re: [PATCH] LSM: drop duplicated words in header file comments

2020-07-17 Thread Serge E. Hallyn
On Fri, Jul 17, 2020 at 04:36:40PM -0700, Randy Dunlap wrote:
> From: Randy Dunlap 
> 
> Drop the doubled words "the" and "and" in comments.
> 
> Signed-off-by: Randy Dunlap 
> Cc: James Morris 
> Cc: "Serge E. Hallyn" 

Acked-by: Serge Hallyn 

> Cc: linux-security-mod...@vger.kernel.org
> ---
>  include/linux/lsm_hook_defs.h |2 +-
>  include/linux/lsm_hooks.h |2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> --- linux-next-20200714.orig/include/linux/lsm_hook_defs.h
> +++ linux-next-20200714/include/linux/lsm_hook_defs.h
> @@ -15,7 +15,7 @@
>   */
>  
>  /*
> - * The macro LSM_HOOK is used to define the data structures required by the
> + * The macro LSM_HOOK is used to define the data structures required by
>   * the LSM framework using the pattern:
>   *
>   *   LSM_HOOK(<return_type>, <default_value>, <hook_name>, args...)
> --- linux-next-20200714.orig/include/linux/lsm_hooks.h
> +++ linux-next-20200714/include/linux/lsm_hooks.h
> @@ -822,7 +822,7 @@
>   *   structure. Note that the security field was not added directly to the
>   *   socket structure, but rather, the socket security information is stored
>   *   in the associated inode.  Typically, the inode alloc_security hook will
> - *   allocate and and attach security information to
> + *   allocate and attach security information to
>   *   SOCK_INODE(sock)->i_security.  This hook may be used to update the
>   *   SOCK_INODE(sock)->i_security field with additional information that
>   *   wasn't available when the inode was allocated.
> 


Re: [PATCH] capabilities: Replace HTTP links with HTTPS ones

2020-07-17 Thread Serge E. Hallyn
On Mon, Jul 13, 2020 at 12:34:28PM +0200, Alexander A. Klimov wrote:
> Rationale:
> Reduces attack surface on kernel devs opening the links for MITM
> as HTTPS traffic is much harder to manipulate.
> 
> Deterministic algorithm:
> For each file:
>   If not .svg:
> For each line:
>   If doesn't contain `\bxmlns\b`:
> For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`:
> If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`:
> If both the HTTP and HTTPS versions
> return 200 OK and serve the same content:
>   Replace HTTP with HTTPS.
> 
> Signed-off-by: Alexander A. Klimov 

Reviewed-by: Serge Hallyn 

> ---
>  Continuing my work started at 93431e0607e5.
>  See also: git log --oneline '--author=Alexander A. Klimov 
> ' v5.7..master
>  (Actually letting a shell for loop submit all this stuff for me.)
> 
>  If there are any URLs to be removed completely or at least not just 
> HTTPSified:
>  Just clearly say so and I'll *undo my change*.
>  See also: https://lkml.org/lkml/2020/6/27/64
> 
>  If there are any valid, but yet not changed URLs:
>  See: https://lkml.org/lkml/2020/6/26/837
> 
>  If you apply the patch, please let me know.
> 
>  Sorry again to all maintainers who complained about subject lines.
>  Now I realized that you want an actually perfect prefixes,
>  not just subsystem ones.
>  I tried my best...
>  And yes, *I could* (at least half-)automate it.
>  Impossible is nothing! :)
> 
> 
>  kernel/capability.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/kernel/capability.c b/kernel/capability.c
> index 1444f3954d75..a8a20ebc43ee 100644
> --- a/kernel/capability.c
> +++ b/kernel/capability.c
> @@ -40,7 +40,7 @@ __setup("no_file_caps", file_caps_disable);
>  /*
>   * More recent versions of libcap are available from:
>   *
> - *   http://www.kernel.org/pub/linux/libs/security/linux-privs/
> + *   https://www.kernel.org/pub/linux/libs/security/linux-privs/
>   */
>  
>  static void warn_legacy_capability_use(void)
> -- 
> 2.27.0


Re: [PATCH v4 2/3] selftests: add clone3() CAP_CHECKPOINT_RESTORE test

2020-07-03 Thread Serge E. Hallyn
On Fri, Jul 03, 2020 at 01:18:07PM +0200, Adrian Reber wrote:
> On Thu, Jul 02, 2020 at 03:53:05PM -0500, Serge E. Hallyn wrote:
> > On Wed, Jul 01, 2020 at 08:49:05AM +0200, Adrian Reber wrote:
> > > This adds a test that changes its UID, uses capabilities to
> > > get CAP_CHECKPOINT_RESTORE and uses clone3() with set_tid to
> > > create a process with a given PID as non-root.
> > 
> > Seems worth also verifying that it fails if you have no capabilities.
> > I don't see that in the existing clone3/ test dir.
> 
> Bit confused about what you mean. This test does:
> 
>  * switch UID to 1000
>  * run clone3() with set_tid set and expect EPERM
>  * set CAP_CHECKPOINT_RESTORE capability
>  * run clone3() with set_tid set and expect success
> 
> So it already does what I think you are asking for. Did I misunderstand
> your comment?

Ah, no, I missed that line doing the call with -EPERM.  Thanks!

Acked-by: Serge Hallyn 


>   Adrian
> 
> > > Signed-off-by: Adrian Reber 
> > > ---
> > >  tools/testing/selftests/clone3/Makefile   |   4 +-
> > >  .../clone3/clone3_cap_checkpoint_restore.c| 203 ++
> > >  2 files changed, 206 insertions(+), 1 deletion(-)
> > >  create mode 100644 
> > > tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c
> > > 
> > > diff --git a/tools/testing/selftests/clone3/Makefile 
> > > b/tools/testing/selftests/clone3/Makefile
> > > index cf976c732906..ef7564cb7abe 100644
> > > --- a/tools/testing/selftests/clone3/Makefile
> > > +++ b/tools/testing/selftests/clone3/Makefile
> > > @@ -1,6 +1,8 @@
> > >  # SPDX-License-Identifier: GPL-2.0
> > >  CFLAGS += -g -I../../../../usr/include/
> > > +LDLIBS += -lcap
> > >  
> > > -TEST_GEN_PROGS := clone3 clone3_clear_sighand clone3_set_tid
> > > +TEST_GEN_PROGS := clone3 clone3_clear_sighand clone3_set_tid \
> > > + clone3_cap_checkpoint_restore
> > >  
> > >  include ../lib.mk
> > > diff --git 
> > > a/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c 
> > > b/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c
> > > new file mode 100644
> > > index ..2cc3d57b91f2
> > > --- /dev/null
> > > +++ b/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c
> > > @@ -0,0 +1,203 @@
> > > +// SPDX-License-Identifier: GPL-2.0
> > > +
> > > +/*
> > > + * Based on Christian Brauner's clone3() example.
> > > + * These tests are assuming to be running in the host's
> > > + * PID namespace.
> > > + */
> > > +
> > > +/* capabilities related code based on selftests/bpf/test_verifier.c */
> > > +
> > > +#define _GNU_SOURCE
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +
> > > +#include "../kselftest.h"
> > > +#include "clone3_selftests.h"
> > > +
> > > +#ifndef MAX_PID_NS_LEVEL
> > > +#define MAX_PID_NS_LEVEL 32
> > > +#endif
> > > +
> > > +static void child_exit(int ret)
> > > +{
> > > + fflush(stdout);
> > > + fflush(stderr);
> > > + _exit(ret);
> > > +}
> > > +
> > > +static int call_clone3_set_tid(pid_t * set_tid, size_t set_tid_size)
> > > +{
> > > + int status;
> > > + pid_t pid = -1;
> > > +
> > > + struct clone_args args = {
> > > + .exit_signal = SIGCHLD,
> > > + .set_tid = ptr_to_u64(set_tid),
> > > + .set_tid_size = set_tid_size,
> > > + };
> > > +
> > > > + pid = sys_clone3(&args, sizeof(struct clone_args));
> > > + if (pid < 0) {
> > > + ksft_print_msg("%s - Failed to create new process\n",
> > > +strerror(errno));
> > > + return -errno;
> > > + }
> > > +
> > > + if (pid == 0) {
> > > + int ret;
> > > + char tmp = 0;
> > > +
> > > + ksft_print_msg
> > > + ("I am the child, my PID is %d (expected %d)\n",
> > > +  getpid(), set_tid[0]);
> > > +
> > > + if (set_

Re: [PATCH v4 3/3] prctl: Allow ptrace capable processes to change /proc/self/exe

2020-07-02 Thread Serge E. Hallyn
On Wed, Jul 01, 2020 at 10:55:37AM +0200, Christian Brauner wrote:
> On Wed, Jul 01, 2020 at 08:49:06AM +0200, Adrian Reber wrote:
> > From: Nicolas Viennot 
> > 
> > Previously, the current process could only change the /proc/self/exe
> > link with local CAP_SYS_ADMIN.
> > This commit relaxes this restriction by permitting such change with
> > CAP_CHECKPOINT_RESTORE, and the ability to use ptrace.
> > 
> > With access to ptrace facilities, a process can do the following: fork a
> > child, execve() the target executable, and have the child use ptrace()
> > to replace the memory content of the current process. This technique
> > makes it possible to masquerade an arbitrary program as any executable,
> > even setuid ones.
> > 
> > Signed-off-by: Nicolas Viennot 
> > Signed-off-by: Adrian Reber 
> > ---
> >  include/linux/lsm_hook_defs.h |  1 +
> >  include/linux/security.h  |  6 ++
> >  kernel/sys.c  | 12 
> >  security/commoncap.c  | 26 ++
> >  security/security.c   |  5 +
> >  security/selinux/hooks.c  | 14 ++
> >  6 files changed, 56 insertions(+), 8 deletions(-)
> > 
> > diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
> > index 0098852bb56a..90e51d5e093b 100644
> > --- a/include/linux/lsm_hook_defs.h
> > +++ b/include/linux/lsm_hook_defs.h
> > @@ -211,6 +211,7 @@ LSM_HOOK(int, 0, task_kill, struct task_struct *p, 
> > struct kernel_siginfo *info,
> >  int sig, const struct cred *cred)
> >  LSM_HOOK(int, -ENOSYS, task_prctl, int option, unsigned long arg2,
> >  unsigned long arg3, unsigned long arg4, unsigned long arg5)
> > +LSM_HOOK(int, 0, prctl_set_mm_exe_file, struct file *exe_file)
> >  LSM_HOOK(void, LSM_RET_VOID, task_to_inode, struct task_struct *p,
> >  struct inode *inode)
> >  LSM_HOOK(int, 0, ipc_permission, struct kern_ipc_perm *ipcp, short flag)
> > diff --git a/include/linux/security.h b/include/linux/security.h
> > index 2797e7f6418e..0f594eb7e766 100644
> > --- a/include/linux/security.h
> > +++ b/include/linux/security.h
> > @@ -412,6 +412,7 @@ int security_task_kill(struct task_struct *p, struct 
> > kernel_siginfo *info,
> > int sig, const struct cred *cred);
> >  int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
> > unsigned long arg4, unsigned long arg5);
> > +int security_prctl_set_mm_exe_file(struct file *exe_file);
> >  void security_task_to_inode(struct task_struct *p, struct inode *inode);
> >  int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag);
> >  void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid);
> > @@ -1124,6 +1125,11 @@ static inline int security_task_prctl(int option, 
> > unsigned long arg2,
> > return cap_task_prctl(option, arg2, arg3, arg4, arg5);
> >  }
> >  
> > +static inline int security_prctl_set_mm_exe_file(struct file *exe_file)
> > +{
> > +   return cap_prctl_set_mm_exe_file(exe_file);
> > +}
> > +
> >  static inline void security_task_to_inode(struct task_struct *p, struct 
> > inode *inode)
> >  { }
> >  
> > diff --git a/kernel/sys.c b/kernel/sys.c
> > index 00a96746e28a..bb53e8408c63 100644
> > --- a/kernel/sys.c
> > +++ b/kernel/sys.c
> > @@ -1851,6 +1851,10 @@ static int prctl_set_mm_exe_file(struct mm_struct 
> > *mm, unsigned int fd)
> > if (err)
> > goto exit;
> >  
> > +   err = security_prctl_set_mm_exe_file(exe.file);
> > +   if (err)
> > +   goto exit;
> > +
> > /*
> >  * Forbid mm->exe_file change if old file still mapped.
> >  */
> > @@ -2006,14 +2010,6 @@ static int prctl_set_mm_map(int opt, const void 
> > __user *addr, unsigned long data
> > }
> >  
> > if (prctl_map.exe_fd != (u32)-1) {
> > -   /*
> > -* Make sure the caller has the rights to
> > -* change /proc/pid/exe link: only local sys admin should
> > -* be allowed to.
> > -*/
> > -   if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> > -   return -EINVAL;
> > -
> > error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
> > if (error)
> > return error;
> > diff --git a/security/commoncap.c b/security/commoncap.c
> > index 59bf3c1674c8..663d00fe2ecc 100644
> > --- a/security/commoncap.c
> > +++ b/security/commoncap.c
> > @@ -1291,6 +1291,31 @@ int cap_task_prctl(int option, unsigned long arg2, 
> > unsigned long arg3,
> > }
> >  }
> >  
> > +/**
> > + * cap_prctl_set_mm_exe_file - Determine whether /proc/self/exe can be 
> > changed
> > + * by the current process.
> > + * @exe_file: The new exe file
> > + * Returns 0 if permission is granted, -ve if denied.
> > + *
> > + * The current process is permitted to change its /proc/self/exe link via 
> > two policies:
> > + * 1) The current user can do checkpoint/restore. At the time of this 
> > writing,
> > + *this means 

Re: [PATCH v4 3/3] prctl: Allow ptrace capable processes to change /proc/self/exe

2020-07-02 Thread Serge E. Hallyn
On Wed, Jul 01, 2020 at 08:49:06AM +0200, Adrian Reber wrote:
> From: Nicolas Viennot 
> 
> Previously, the current process could only change the /proc/self/exe
> link with local CAP_SYS_ADMIN.
> This commit relaxes this restriction by permitting such change with
> CAP_CHECKPOINT_RESTORE, and the ability to use ptrace.
> 
> With access to ptrace facilities, a process can do the following: fork a
> child, execve() the target executable, and have the child use ptrace()
> to replace the memory content of the current process. This technique
> makes it possible to masquerade an arbitrary program as any executable,
> even setuid ones.
> 
> Signed-off-by: Nicolas Viennot 
> Signed-off-by: Adrian Reber 

This is scary.  But I believe it is safe.

Reviewed-by: Serge Hallyn 

I am a bit curious about the implications of the selinux patch.
IIUC you are using the permission of the tracing process to
execute the file without transition, so this is a way to work
around the policy which might prevent the tracee from doing so.
Given that SELinux wants to be MAC, I'm not *quite* sure that's
considered kosher.  You also are skipping the PROCESS__PTRACE
to SECCLASS_PROCESS check which selinux_bprm_set_creds does later
on.  Again I'm just not quite sure what's considered normal there
these days.

Paul, do you have input there?

> ---
>  include/linux/lsm_hook_defs.h |  1 +
>  include/linux/security.h  |  6 ++
>  kernel/sys.c  | 12 
>  security/commoncap.c  | 26 ++
>  security/security.c   |  5 +
>  security/selinux/hooks.c  | 14 ++
>  6 files changed, 56 insertions(+), 8 deletions(-)
> 
> diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
> index 0098852bb56a..90e51d5e093b 100644
> --- a/include/linux/lsm_hook_defs.h
> +++ b/include/linux/lsm_hook_defs.h
> @@ -211,6 +211,7 @@ LSM_HOOK(int, 0, task_kill, struct task_struct *p, struct 
> kernel_siginfo *info,
>int sig, const struct cred *cred)
>  LSM_HOOK(int, -ENOSYS, task_prctl, int option, unsigned long arg2,
>unsigned long arg3, unsigned long arg4, unsigned long arg5)
> +LSM_HOOK(int, 0, prctl_set_mm_exe_file, struct file *exe_file)
>  LSM_HOOK(void, LSM_RET_VOID, task_to_inode, struct task_struct *p,
>struct inode *inode)
>  LSM_HOOK(int, 0, ipc_permission, struct kern_ipc_perm *ipcp, short flag)
> diff --git a/include/linux/security.h b/include/linux/security.h
> index 2797e7f6418e..0f594eb7e766 100644
> --- a/include/linux/security.h
> +++ b/include/linux/security.h
> @@ -412,6 +412,7 @@ int security_task_kill(struct task_struct *p, struct 
> kernel_siginfo *info,
>   int sig, const struct cred *cred);
>  int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
>   unsigned long arg4, unsigned long arg5);
> +int security_prctl_set_mm_exe_file(struct file *exe_file);
>  void security_task_to_inode(struct task_struct *p, struct inode *inode);
>  int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag);
>  void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid);
> @@ -1124,6 +1125,11 @@ static inline int security_task_prctl(int option, 
> unsigned long arg2,
>   return cap_task_prctl(option, arg2, arg3, arg4, arg5);
>  }
>  
> +static inline int security_prctl_set_mm_exe_file(struct file *exe_file)
> +{
> + return cap_prctl_set_mm_exe_file(exe_file);
> +}
> +
>  static inline void security_task_to_inode(struct task_struct *p, struct 
> inode *inode)
>  { }
>  
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 00a96746e28a..bb53e8408c63 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -1851,6 +1851,10 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, 
> unsigned int fd)
>   if (err)
>   goto exit;
>  
> + err = security_prctl_set_mm_exe_file(exe.file);
> + if (err)
> + goto exit;
> +
>   /*
>* Forbid mm->exe_file change if old file still mapped.
>*/
> @@ -2006,14 +2010,6 @@ static int prctl_set_mm_map(int opt, const void __user 
> *addr, unsigned long data
>   }
>  
>   if (prctl_map.exe_fd != (u32)-1) {
> - /*
> -  * Make sure the caller has the rights to
> -  * change /proc/pid/exe link: only local sys admin should
> -  * be allowed to.
> -  */
> - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> - return -EINVAL;
> -
>   error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
>   if (error)
>   return error;
> diff --git a/security/commoncap.c b/security/commoncap.c
> index 59bf3c1674c8..663d00fe2ecc 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -1291,6 +1291,31 @@ int cap_task_prctl(int option, unsigned long arg2, 
> unsigned long arg3,
>   }
>  }
>  
> +/**
> + * cap_prctl_set_mm_exe_file - Determine 

Re: [PATCH v4 2/3] selftests: add clone3() CAP_CHECKPOINT_RESTORE test

2020-07-02 Thread Serge E. Hallyn
On Wed, Jul 01, 2020 at 08:49:05AM +0200, Adrian Reber wrote:
> This adds a test that changes its UID, uses capabilities to
> get CAP_CHECKPOINT_RESTORE and uses clone3() with set_tid to
> create a process with a given PID as non-root.

Seems worth also verifying that it fails if you have no capabilities.
I don't see that in the existing clone3/ test dir.


> Signed-off-by: Adrian Reber 
> ---
>  tools/testing/selftests/clone3/Makefile   |   4 +-
>  .../clone3/clone3_cap_checkpoint_restore.c| 203 ++
>  2 files changed, 206 insertions(+), 1 deletion(-)
>  create mode 100644 
> tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c
> 
> diff --git a/tools/testing/selftests/clone3/Makefile 
> b/tools/testing/selftests/clone3/Makefile
> index cf976c732906..ef7564cb7abe 100644
> --- a/tools/testing/selftests/clone3/Makefile
> +++ b/tools/testing/selftests/clone3/Makefile
> @@ -1,6 +1,8 @@
>  # SPDX-License-Identifier: GPL-2.0
>  CFLAGS += -g -I../../../../usr/include/
> +LDLIBS += -lcap
>  
> -TEST_GEN_PROGS := clone3 clone3_clear_sighand clone3_set_tid
> +TEST_GEN_PROGS := clone3 clone3_clear_sighand clone3_set_tid \
> + clone3_cap_checkpoint_restore
>  
>  include ../lib.mk
> diff --git a/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c 
> b/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c
> new file mode 100644
> index ..2cc3d57b91f2
> --- /dev/null
> +++ b/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c
> @@ -0,0 +1,203 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Based on Christian Brauner's clone3() example.
> + * These tests are assuming to be running in the host's
> + * PID namespace.
> + */
> +
> +/* capabilities related code based on selftests/bpf/test_verifier.c */
> +
> +#define _GNU_SOURCE
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "../kselftest.h"
> +#include "clone3_selftests.h"
> +
> +#ifndef MAX_PID_NS_LEVEL
> +#define MAX_PID_NS_LEVEL 32
> +#endif
> +
> +static void child_exit(int ret)
> +{
> + fflush(stdout);
> + fflush(stderr);
> + _exit(ret);
> +}
> +
> +static int call_clone3_set_tid(pid_t * set_tid, size_t set_tid_size)
> +{
> + int status;
> + pid_t pid = -1;
> +
> + struct clone_args args = {
> + .exit_signal = SIGCHLD,
> + .set_tid = ptr_to_u64(set_tid),
> + .set_tid_size = set_tid_size,
> + };
> +
> + pid = sys_clone3(&args, sizeof(struct clone_args));
> + if (pid < 0) {
> + ksft_print_msg("%s - Failed to create new process\n",
> +strerror(errno));
> + return -errno;
> + }
> +
> + if (pid == 0) {
> + int ret;
> + char tmp = 0;
> +
> + ksft_print_msg
> + ("I am the child, my PID is %d (expected %d)\n",
> +  getpid(), set_tid[0]);
> +
> + if (set_tid[0] != getpid())
> + child_exit(EXIT_FAILURE);
> + child_exit(EXIT_SUCCESS);
> + }
> +
> + ksft_print_msg("I am the parent (%d). My child's pid is %d\n",
> +getpid(), pid);
> +
> + if (waitpid(pid, &status, 0) < 0) {
> + ksft_print_msg("Child returned %s\n", strerror(errno));
> + return -errno;
> + }
> +
> + if (!WIFEXITED(status))
> + return -1;
> +
> + return WEXITSTATUS(status);
> +}
> +
> +static int test_clone3_set_tid(pid_t * set_tid,
> +size_t set_tid_size, int expected)
> +{
> + int ret;
> +
> + ksft_print_msg("[%d] Trying clone3() with CLONE_SET_TID to %d\n",
> +getpid(), set_tid[0]);
> + ret = call_clone3_set_tid(set_tid, set_tid_size);
> +
> + ksft_print_msg
> + ("[%d] clone3() with CLONE_SET_TID %d says :%d - expected %d\n",
> +  getpid(), set_tid[0], ret, expected);
> + if (ret != expected) {
> + ksft_test_result_fail
> + ("[%d] Result (%d) is different than expected (%d)\n",
> +  getpid(), ret, expected);
> + return -1;
> + }
> + ksft_test_result_pass
> + ("[%d] Result (%d) matches expectation (%d)\n", getpid(), ret,
> +  expected);
> +
> + return 0;
> +}
> +
> +struct libcap {
> + struct __user_cap_header_struct hdr;
> + struct __user_cap_data_struct data[2];
> +};
> +
> +static int set_capability()
> +{
> + cap_value_t cap_values[] = { CAP_SETUID, CAP_SETGID };
> + struct libcap *cap;
> + int ret = -1;
> + cap_t caps;
> +
> + caps = cap_get_proc();
> + if (!caps) {
> + perror("cap_get_proc");
> + return -1;
> + }
> +
> + /* Drop all capabilities */
> + if (cap_clear(caps)) {
> + perror("cap_clear");
> + goto 

Re: [PATCH RESEND] device_cgroup: Fix RCU list debugging warning

2020-06-07 Thread Serge E. Hallyn
On Sun, Jun 07, 2020 at 12:08:40PM -0700, Paul E. McKenney wrote:
> On Sun, Jun 07, 2020 at 06:23:40AM +1000, Stephen Rothwell wrote:
> > Hi all,
> > 
> > On Mon, 6 Apr 2020 16:29:50 +0530 Amol Grover  wrote:
> > >
> > > exceptions may be traversed using list_for_each_entry_rcu()
> > > outside of an RCU read side critical section BUT under the
> > > > protection of devcgroup_mutex. Hence add the corresponding
> > > lockdep expression to fix the following false-positive
> > > warning:
> > > 
> > > [2.304417] =
> > > [2.304418] WARNING: suspicious RCU usage
> > > [2.304420] 5.5.4-stable #17 Tainted: GE
> > > [2.304422] -
> > > [2.304424] security/device_cgroup.c:355 RCU-list traversed in 
> > > non-reader section!!
> > > 
> > > Signed-off-by: Amol Grover 
> > > ---
> > >  security/device_cgroup.c | 3 ++-
> > >  1 file changed, 2 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/security/device_cgroup.c b/security/device_cgroup.c
> > > index 7d0f8f7431ff..b7da9e0970d9 100644
> > > --- a/security/device_cgroup.c
> > > +++ b/security/device_cgroup.c
> > > @@ -352,7 +352,8 @@ static bool match_exception_partial(struct list_head 
> > > *exceptions, short type,
> > >  {
> > >   struct dev_exception_item *ex;
> > >  
> > > - list_for_each_entry_rcu(ex, exceptions, list) {
> > > + list_for_each_entry_rcu(ex, exceptions, list,
> > > > + lockdep_is_held(&devcgroup_mutex)) {
> > >   if ((type & DEVCG_DEV_BLOCK) && !(ex->type & DEVCG_DEV_BLOCK))
> > >   continue;
> > >   if ((type & DEVCG_DEV_CHAR) && !(ex->type & DEVCG_DEV_CHAR))
> > > -- 
> > > 2.24.1
> > > 
> > 
> > I have been carrying the above patch in linux-next for some time now.
> > I have been carrying it because it fixes problems for syzbot (see the
> > third warning in
> > https://lore.kernel.org/linux-next/cact4y+ynjk+kq0pfb5fe-q1bqe2t1jq_mvkhf--z80z3wky...@mail.gmail.com/).
> > Is there some reason it has not been applied to some tree?
> 
> The RCU changes on which this patch depends have long since made it to
> mainline, so it can go up any tree.  I can take it if no one else will,
> but it might be better going in via the security tree.
> 
>   Thanx, Paul

James, do you mind pulling it in?


[PATCH 1/1] shiftfs: specify struct members

2020-05-17 Thread Serge E. Hallyn
struct path is declared as randomize_layout, so specify the
struct members when initializing to avoid build failure.

Signed-off-by: Serge Hallyn 
---

[ this is for
https://git.kernel.org/pub/scm/linux/kernel/git/jejb/binfmt_misc.git/commit/?h=shiftfs-v3 ,
which I was just building for an experiment ]

 fs/shiftfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/shiftfs.c b/fs/shiftfs.c
index 7984a93745d2..6028244c2f42 100644
--- a/fs/shiftfs.c
+++ b/fs/shiftfs.c
@@ -535,7 +535,7 @@ static int shiftfs_getattr(const struct path *path, struct 
kstat *stat,
struct dentry *real = path->dentry->d_fsdata;
struct inode *reali = real->d_inode;
const struct inode_operations *iop = reali->i_op;
-   struct path newpath = { path->dentry->d_sb->s_fs_info, real };
+   struct path newpath = { .mnt = path->dentry->d_sb->s_fs_info, .dentry = 
real };
int err = 0;
 
if (iop->getattr)
-- 
2.17.1



Re: [PATCH v4 2/3] nsproxy: attach to namespaces via pidfds

2020-05-07 Thread Serge E. Hallyn
On Tue, May 05, 2020 at 04:04:31PM +0200, Christian Brauner wrote:
> For quite a while we have been thinking about using pidfds to attach to
> namespaces. This patchset has existed for about a year already but we've
> wanted to wait to see how the general api would be received and adopted.
> Now that more and more programs in userspace have started using pidfds
> for process management it's time to send this one out.
> 
> This patch makes it possible to use pidfds to attach to the namespaces
> of another process, i.e. they can be passed as the first argument to the
> setns() syscall. When only a single namespace type is specified the
> semantics are equivalent to passing an nsfd. That means
> setns(nsfd, CLONE_NEWNET) equals setns(pidfd, CLONE_NEWNET). However,
> when a pidfd is passed, multiple namespace flags can be specified in the
> second setns() argument and setns() will attach the caller to all the
> specified namespaces all at once or to none of them. Specifying 0 is not
> valid together with a pidfd.
> 
> Here are just two obvious examples:
> setns(pidfd, CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET);
> setns(pidfd, CLONE_NEWUSER);
> Allowing to also attach subsets of namespaces supports various use-cases
> where callers setns to a subset of namespaces to retain privilege, perform
> an action and then re-attach another subset of namespaces.
> 
> If the need arises, as Eric suggested, we can extend this patchset to
> assume even more context than just attaching all namespaces. His suggestion
> specifically was about assuming the process' root directory when
> setns(pidfd, 0) or setns(pidfd, SETNS_PIDFD) is specified. For now, just
> keep it flexible in terms of supporting subsets of namespaces but let's
> wait until we have users asking for even more context to be assumed. At
> that point we can add an extension.
> 
> The obvious example where this is useful is a standard container
> manager interacting with a running container: pushing and pulling files
> or directories, injecting mounts, attaching/execing any kind of process,
> managing network devices all these operations require attaching to all
> or at least multiple namespaces at the same time. Given that nowadays
> most containers are spawned with all namespaces enabled we're currently
> looking at at least 14 syscalls, 7 to open the /proc/<pid>/ns/<ns>
> nsfds, another 7 to actually perform the namespace switch. With time
> namespaces we're looking at about 16 syscalls.
> (We could amortize the first 7 or 8 syscalls for opening the nsfds by
>  stashing them in each container's monitor process but that would mean
>  we need to send around those file descriptors through unix sockets
>  everytime we want to interact with the container or keep on-disk
>  state. Even in scenarios where a caller wants to join a particular
>  namespace in a particular order callers still profit from batching
>  other namespaces. That mostly applies to the user namespace but
>  all container runtimes I found join the user namespace first no matter
>  if it privileges or deprivileges the container similar to how unshare
>  behaves.)
> With pidfds this becomes a single syscall no matter how many namespaces
> are supposed to be attached to.
> 
> A decently designed, large-scale container manager usually isn't the
> parent of any of the containers it spawns so the containers don't die
> when it crashes or needs to update or reinitialize. This means that
> for the manager to interact with containers through pids is inherently
> racy especially on systems where the maximum pid number is not
> significantly bumped. This is even more problematic since we often spawn
> and manage thousands or ten-thousands of containers. Interacting with a
> container through a pid thus can become risky quite quickly. Especially
> since we allow for an administrator to enable advanced features such as
> syscall interception where we're performing syscalls in lieu of the
> container. In all of those cases we use pidfds if they are available and
> we pass them around as stable references. Using them to setns() to the
> target process' namespaces is as reliable as using nsfds. Either the
> target process is already dead and we get ESRCH or we manage to attach
> to its namespaces but we can't accidentally attach to another process'
> namespaces. So pidfds lend themselves to be used with this api.
> The other main advantage is that with this change the pidfd becomes the
> only relevant token for most container interactions and it's the only
> token we need to create and send around.
> 
> Apart from significantly reducing the number of syscalls from double
> digit to single digit which is a decent reason post-spectre/meltdown
> this also allows to switch to a set of namespaces atomically, i.e.
> either attaching to all the specified namespaces succeeds or we fail. If
> we fail we haven't changed a single namespace. There are currently three
> namespaces that can fail (other than for ENOMEM which really is 

Re: [PATCH v4 1/3] nsproxy: add struct nsset

2020-05-07 Thread Serge E. Hallyn
On Tue, May 05, 2020 at 04:04:30PM +0200, Christian Brauner wrote:
> Add a simple struct nsset. It holds all necessary pieces to switch to a new
> set of namespaces without leaving a task in a half-switched state which we
> will make use of in the next patch. This patch switches the existing setns
> logic over without causing a change in setns() behavior. This brings
> setns() closer to how unshare() works. The prepare_ns() function is
> responsible to prepare all necessary information. This has two reasons.
> First it minimizes dependencies between individual namespaces, i.e. all
> install handler can expect that all fields are properly initialized
> independent in what order they are called in. Second, this makes the code
> easier to maintain and easier to follow if it needs to be changed.
> 
> The prepare_ns() helper will only be switched over to use a flags argument
> in the next patch. Here it will still use nstype as a simple integer
> argument which was argued would be clearer. I'm not particularly
> opinionated about this if it really helps or not. The struct nsset itself
> already contains the flags field since its name already indicates that it
> can contain information required by different namespaces. None of this
> should have functional consequences.
> 
> Cc: Eric W. Biederman 
> Cc: Serge Hallyn 

Reviewed-by: Serge Hallyn 

Thanks, Christian.

> Cc: Jann Horn 
> Cc: Michael Kerrisk 
> Cc: Aleksa Sarai 
> Signed-off-by: Christian Brauner 
> ---
> /* v2 */
> patch introduced
> 
> /* v3 */
> - Eric W. Biederman :
>   - Remove the prior ns_capable_cred() patch and simplify the permission
> check from ns_capable_cred(nsset, nsset->cred->user_ns, CAP_SYS_ADMIN))
> to from ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)).
> 
> /* v4 */
> - Eric W. Biederman :
>   - Fix nstype == 0 case.
> ---
>  fs/namespace.c| 10 ++--
>  include/linux/mnt_namespace.h |  1 +
>  include/linux/nsproxy.h   | 24 ++
>  include/linux/proc_ns.h   |  4 +-
>  ipc/namespace.c   |  7 ++-
>  kernel/cgroup/namespace.c |  5 +-
>  kernel/nsproxy.c  | 90 ++-
>  kernel/pid_namespace.c|  5 +-
>  kernel/time/namespace.c   |  5 +-
>  kernel/user_namespace.c   |  8 ++--
>  kernel/utsname.c  |  5 +-
>  net/core/net_namespace.c  |  5 +-
>  12 files changed, 132 insertions(+), 37 deletions(-)
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index a28e4db075ed..62899fad4a04 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -3954,16 +3954,18 @@ static void mntns_put(struct ns_common *ns)
>   put_mnt_ns(to_mnt_ns(ns));
>  }
>  
> -static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
> +static int mntns_install(struct nsset *nsset, struct ns_common *ns)
>  {
> - struct fs_struct *fs = current->fs;
> + struct nsproxy *nsproxy = nsset->nsproxy;
> + struct fs_struct *fs = nsset->fs;
>   struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
> + struct user_namespace *user_ns = nsset->cred->user_ns;
>   struct path root;
>   int err;
>  
>   if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
> - !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
> - !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> + !ns_capable(user_ns, CAP_SYS_CHROOT) ||
> + !ns_capable(user_ns, CAP_SYS_ADMIN))
>   return -EPERM;
>  
>   if (is_anon_ns(mnt_ns))
> diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
> index 35942084cd40..007cfa52efb2 100644
> --- a/include/linux/mnt_namespace.h
> +++ b/include/linux/mnt_namespace.h
> @@ -6,6 +6,7 @@
>  struct mnt_namespace;
>  struct fs_struct;
>  struct user_namespace;
> +struct ns_common;
>  
>  extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace 
> *,
>   struct user_namespace *, struct fs_struct *);
> diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
> index 074f395b9ad2..cdb171efc7cb 100644
> --- a/include/linux/nsproxy.h
> +++ b/include/linux/nsproxy.h
> @@ -41,6 +41,30 @@ struct nsproxy {
>  };
>  extern struct nsproxy init_nsproxy;
>  
> +/*
> + * A structure to encompass all bits needed to install
> + * a partial or complete new set of namespaces.
> + *
> + * If a new user namespace is requested cred will
> + * point to a modifiable set of credentials. If a pointer
> + * to a modifiable set is needed nsset_cred() must be
> + * used and tested.
> + */
> +struct nsset {
> + unsigned flags;
> + struct nsproxy *nsproxy;
> + struct fs_struct *fs;
> + const struct cred *cred;
> +};
> +
> +static inline struct cred *nsset_cred(struct nsset *set)
> +{
> + if (set->flags & CLONE_NEWUSER)
> + return (struct cred *)set->cred;
> +
> + return NULL;
> +}
> +
>  /*
>   * the namespaces access rules are:
>   *
> diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
> index 

Re: [PATCH v3 1/2] fork: add clone3

2019-06-06 Thread Serge E. Hallyn
On Tue, Jun 04, 2019 at 06:09:43PM +0200, Christian Brauner wrote:
> This adds the clone3 system call.
> 
> As mentioned several times already (cf. [7], [8]) here's the promised
> patchset for clone3().
> 
> We recently merged the CLONE_PIDFD patchset (cf. [1]). It took the last
> free flag from clone().
> 
> Independent of the CLONE_PIDFD patchset a time namespace has been discussed
> at Linux Plumber Conference last year and has been sent out and reviewed
> (cf. [5]). It is expected that it will go upstream in the not too distant
> future. However, it relies on the addition of the CLONE_NEWTIME flag to
> clone(). The only other good candidate - CLONE_DETACHED - is currently not
> recyclable as we have identified at least two large or widely used
> codebases that currently pass this flag (cf. [2], [3], and [4]). Given that
> CLONE_PIDFD grabbed the last clone() flag the time namespace is effectively
> blocked. clone3() has the advantage that it will unblock this patchset
> again. In general, clone3() is extensible and allows for the implementation
> of new features.
> 
> The idea is to keep clone3() very simple and close to the original clone(),
> specifically, to keep on supporting old clone()-based workloads.
> We know there have been various creative proposals how a new process
> creation syscall or even api is supposed to look like. Some people even
> going so far as to argue that the traditional fork()+exec() split should be
> abandoned in favor of an in-kernel version of spawn(). Independent of
> whether or not we personally think spawn() is a good idea this patchset has
> and does not want to have anything to do with this.
> One stance we take is that there's no real good alternative to
> clone()+exec() and we need and want to support this model going forward;
> independent of spawn().
> The following requirements guided clone3():
> - bump the number of available flags
> - move arguments that are currently passed as separate arguments
>   in clone() into a dedicated struct clone_args
>   - choose a struct layout that is easy to handle on 32 and on 64 bit
>   - choose a struct layout that is extensible
>   - give new flags that currently need to abuse another flag's dedicated
> return argument in clone() their own dedicated return argument
> (e.g. CLONE_PIDFD)
>   - use a separate kernel internal struct kernel_clone_args that is
> properly typed according to current kernel conventions in fork.c and is
> different from  the uapi struct clone_args
> - port _do_fork() to use kernel_clone_args so that all process creation
>   syscalls such as fork(), vfork(), clone(), and clone3() behave identical
>   (Arnd suggested, that we can probably also port do_fork() itself in a
>separate patchset.)
> - ease of transition for userspace from clone() to clone3()
>   This very much means that we do *not* remove functionality that userspace
>   currently relies on as the latter is a good way of creating a syscall
>   that won't be adopted.
> - do not try to be clever or complex: keep clone3() as dumb as possible
> 
> In accordance with Linus suggestions (cf. [11]), clone3() has the following
> signature:
> 
> /* uapi */
> struct clone_args {
> __aligned_u64 flags;
> __aligned_u64 pidfd;
> __aligned_u64 child_tid;
> __aligned_u64 parent_tid;
> __aligned_u64 exit_signal;
> __aligned_u64 stack;
> __aligned_u64 stack_size;
> __aligned_u64 tls;
> };
> 
> /* kernel internal */
> struct kernel_clone_args {
> u64 flags;
> int __user *pidfd;
> int __user *child_tid;
> int __user *parent_tid;
> int exit_signal;
> unsigned long stack;
> unsigned long stack_size;
> unsigned long tls;
> };
> 
> long sys_clone3(struct clone_args __user *uargs, size_t size)
> 
> clone3() cleanly supports all of the supported flags from clone() and thus
> all legacy workloads.
> The advantage of sticking close to the old clone() is the low cost for
> userspace to switch to this new api. Quite a lot of userspace apis (e.g.
> pthreads) are based on the clone() syscall. With the new clone3() syscall
> supporting all of the old workloads and opening up the ability to add new
> features should make switching to it for userspace more appealing. In
> essence, glibc can just write a simple wrapper to switch from clone() to
> clone3().
> 
> There has been some interest in this patchset already. We have received a
> patch from the CRIU corner for clone3() that would set the PID/TID of a
> restored process without /proc/sys/kernel/ns_last_pid to eliminate a race.
> 
> /* User visible differences to legacy clone() */
> - CLONE_DETACHED will cause EINVAL with clone3()
> - CSIGNAL is deprecated
>   It is superseded by a dedicated "exit_signal" argument in struct
>   clone_args freeing up space for additional flags.
>   This is based on a suggestion from Andrei and Linus (cf. [9] and [10])
> 
> /* References */
> 

Re: RFC: on adding new CLONE_* flags [WAS Re: [PATCH 0/4] clone: add CLONE_PIDFD]

2019-05-04 Thread Serge E. Hallyn
On Mon, Apr 29, 2019 at 07:31:43PM +0200, Enrico Weigelt, metux IT consult 
wrote:

Argh.  Sorry, it seems your emails aren't making it into my inbox, only
my once-in-a-long-while-checked lkml folder.  Sorry again.

> On 29.04.19 17:49, Serge E. Hallyn wrote:
> 
> >> * all users are equal - no root at all. the only exception is the
> >>   initial process, which gets the kernel devices mounted into his
> >>   namespace.
> >
> > This does not match my understanding, but I'm most likely wrong.  (I thought
> > there was an actual 'host owner' uid, which mostly is only used for initial
> > process, but is basically root with a different name, and used far less.  No
> > uid transitions without factotum so that it *looked* like no root user).
>
> Not quite (IIRC). The hostowner is just the user who booted the machine,
> the initial process runs under this uname and gets the kernel devices
> bound into his namespace, so he can start fileservers on them.
> 
> Also the caphash device (the one you can create capabilities, eg. for
> user change, which then can be used via capuse device) can only be
> opened once - usually by the host factotum.
> 
> There really is no such thing like root user.
> 
> >> What I'd like to achieve on Linux:
> >>
> >> * unprivileged users can have their own mount namespace, where they
> >>   can mount at will (maybe just 9P).
> >
> > No problem, you can do that now.
>
> But only within separate userns, IMHO. (and, when I last tried, plain

"Only within a separate userns" - but why does that matter?  It's just
a different uid mapping.

> users couldn't directly create their userns).

Plain users can definitely create their own userns, directly.  On some
distros there is a kernel knob like

#cat /proc/sys/kernel/unprivileged_userns_clone
1

which when unset prevents unprivileged users creating a namespace.

> >> * but they still appear as the same normal users to the rest of the
> >>   system
> > 
> > No problem, you can do that now.
> 
> How exactly ? Did I miss something vital ?

By unsharing your namespace and writing the new uid mapping.  You can of
course only map your own uid without using any privileged helpers at all.
And it requires help from a second process, which does the writing to
the uid map file after the first process has unshared.  But you can do it.
For instance, using the nsexec.c at

https://github.com/fcicq/nsexec

You can:

Terminal 1:
shallyn@stp:~/src/nsexec$ ./nsexec -UWm
about to unshare with 1002
Press any key to exec (I am 31157)

Now in terminal 2:

Terminal 2:
shallyn@stp:~/src/nsexec$ echo "0 1000 1" > /proc/31157/uid_map
shallyn@stp:~/src/nsexec$ echo deny > /proc/31157/setgroups
shallyn@stp:~/src/nsexec$ echo "0 1000 1" > /proc/31157/gid_map

Then back in terminal 1:
# id
uid=0(root) gid=0(root) groups=0(root),65534(nogroup)
# mount --bind /etc /mnt
# echo $?
0
# ls /root
ls: cannot open directory '/root': Permission denied

To the rest of the system you look like uid 1000.  You could have
chosen uid 1000 in your new namespace, but then you couldn't mount.
Of course you can nest user namespaces so you could create another,
this time mapping uid 1000 so you look like 1000 to yourself as well.

> >> * 9p programs (compiled for Linux ABI) can run parallel to traditional
> >>   linux programs within the same user and sessions (eg. from a terminal,
> >>   i can call both the same way)
> >> * namespace modifications affect both equally (eg. I could run ff in
> >>   an own ns)
> > 
> > affect both of what equally?
> 
> mount / bind.
> 
> > That's exactly what user namespaces are for.  You can create a new
> > user namespace, using no privilege at all, with your current uid (i.e.
> > 1000) mapped to whatever uid you like; if you pick 0, then you can unshare 
> > all
> > the namespaces you like.  
> 
> But I don't like to appear as 'root' in here. I just wanna have my own
> filesystem namespace, nothing more.

Right.  As you know setuid makes that impossible, unfortunately.  That's
where nonewprivs shows promise.

> > Once you unshare mnt_ns, you can mount to your
> > heart's content.  To other processes on the host, your process is
> > uid 1000.
> 
> Is that the uid, I'm appearing to filesystems ?

Yes.

> > Regarding factotum, I agree that with the pidfd work going on etc, it's 
> > getting
> > more and more tempting to attempt a switch to that.  Looking back at my 
> > folder,
> > I see you posted a kernel patch for it.  I had done the same long ago.  
> > Happy to
> > work with you again on that, and put a simple daemon into shadow package, if
> > util-linux isn't deemed the far better place.
> 
> Yeah :)
> 
> 
> --mtx
> 
> -- 
> Enrico Weigelt, metux IT consult
> Free software and Linux embedded engineering
> i...@metux.net -- +49-151-27565287


Re: RFC: on adding new CLONE_* flags [WAS Re: [PATCH 0/4] clone: add CLONE_PIDFD]

2019-04-29 Thread Serge E. Hallyn
On Tue, Apr 16, 2019 at 08:32:50PM +0200, Enrico Weigelt, metux IT consult 
wrote:

(Sorry for the late reply, I had missed this one)

> On 15.04.19 17:50, Serge E. Hallyn wrote:
> 
> Hi,
> 
> >> I'm working on implementing plan9-like fs namespaces, where unprivileged
> >> processes can change their own namespace at will. For that, certain
> >
> > Is there any place where we can see previous discussion about this?
>
> Yes, lkml and containers list.
> It's stalled since few month, as I'm too busy w/ other things.
> 
> > If you have to disable suid anyway, then is there any reason why the
> > existing ability to do this in a private user namespace, with only
> > your own uid mapped (which you can do without any privilege) does
> > not suffice?  That was actually one of the main design goals of user
> > namespaces, to be able to clone(CLONE_NEWUSER), map your current uid,
> > then clone(CLONE_NEWNS) and bind mount at will.
>
> Well, it's not that easy ... maybe I should explain a bit more about how
> Plan9 works, and how I intend to map it into Linux:
> 
> * on plan9, anybody can alter his own fs namespace (bind and mount), as
>   well as spawning new ones
> * basically anything is coming from some fileserver - even devices
>   (eg. there is no such thing like device nodes)
> * access control is done by the individual fileservers, based on the
>   initial authentication (on connecting to the server, before mounting)

yes, so far I'm aware of this,

> * all users are equal - no root at all. the only exception is the
>   initial process, which gets the kernel devices mounted into his
>   namespace.

This does not match my understanding, but I'm most likely wrong.  (I thought
there was an actual 'host owner' uid, which mostly is only used for initial
process, but is basically root with a different name, and used far less.  No
uid transitions without factotum so that it *looked* like no root user).

> What I'd like to achieve on Linux:
> 
> * unprivileged users can have their own mount namespace, where they
>   can mount at will (maybe just 9P).

No problem, you can do that now.

> * but they still appear as the same normal users to the rest of the
>   system

No problem, you can do that now.

> * 9p programs (compiled for Linux ABI) can run parallel to traditional
>   linux programs within the same user and sessions (eg. from a terminal,
>   i can call both the same way)
> * namespace modifications affect both equally (eg. I could run ff in
>   an own ns)

affect both of what equally?

> * these namespaces exist as long as there's one process alive in here

That's sort of how it is now, except you can also pin the namespaces
with their fds.

> * creating a new ns can be done by unprivileged user

That's true now.

>  One of the things to make this work (w/o introducing a massive security
> hole) is disable suid for those processes (actually, one day i'd like to
> get rid of it completely, but that's another story).

That's exactly what user namespaces are for.  You can create a new
user namespace, using no privilege at all, with your current uid (i.e.
1000) mapped to whatever uid you like; if you pick 0, then you can unshare all
the namespaces you like.  Once you unshare mnt_ns, you can mount to your
heart's content.  To other processes on the host, your process is
uid 1000.  Host uid 0 is not mapped into your ns, so you cannot exploit
suid to host root.

Regarding factotum, I agree that with the pidfd work going on etc, it's getting
more and more tempting to attempt a switch to that.  Looking back at my folder,
I see you posted a kernel patch for it.  I had done the same long ago.  Happy to
work with you again on that, and put a simple daemon into shadow package, if
util-linux isn't deemed the far better place.

-serge


Re: RFC: on adding new CLONE_* flags [WAS Re: [PATCH 0/4] clone: add CLONE_PIDFD]

2019-04-15 Thread Serge E. Hallyn
On Mon, Apr 15, 2019 at 12:08:09PM +0200, Enrico Weigelt, metux IT consult 
wrote:
> On 14.04.19 22:14, Christian Brauner wrote:
> 
> Hi folks,
> 
> > This patchset makes it possible to retrieve pid file descriptors at
> > process creation time by introducing the new flag CLONE_PIDFD to the
> > clone() system call as previously discussed.
> 
> Sorry, for hijacking this thread, but I'm curious on what things to
> consider when introducing new CLONE_* flags.
> 
> The reason I'm asking is:
> 
> I'm working on implementing plan9-like fs namespaces, where unprivileged
> processes can change their own namespace at will. For that, certain

Is there any place where we can see previous discussion about this?

> traditional unix'ish things have to be disabled, most notably suid.

If you have to disable suid anyway, then is there any reason why the
existing ability to do this in a private user namespace, with only
your own uid mapped (which you can do without any privilege) does
not suffice?  That was actually one of the main design goals of user
namespaces, to be able to clone(CLONE_NEWUSER), map your current uid,
then clone(CLONE_NEWNS) and bind mount at will.

> As forbidding suid can be helpful in other scenarios, too, I thought
> about making this its own feature. Doing that switch on clone() seems
> a nice place for that, IMHO.
> 
> As there might be potentially even more CLONE_* flags in the future,
> and the bitmask size is limited, this raises the question on how to
> proceed with those flag additions in the future.
> 
> What's your thoughts on that ?
> 
> 
> --mtx
> 
> -- 
> Enrico Weigelt, metux IT consult
> Free software and Linux embedded engineering
> i...@metux.net -- +49-151-27565287


Re: Allowing mapping supplemental groups in user namespace?

2019-03-28 Thread Serge E. Hallyn
On Thu, Mar 28, 2019 at 11:30:52AM -0700, Dmitry Torokhov wrote:
> Hi Serge,
> 
> On Thu, Mar 28, 2019 at 11:05 AM Serge E. Hallyn  wrote:
> >
> > On Thu, Feb 28, 2019 at 11:27:38AM -0800, Dmitry Torokhov wrote:
> > > Hi Eric,
> > >
> > > Currently, unless caller has CAP_SETGID in parent namespace, we can
> > > only map effective group id in the new user namespace. Would it be
> > > possible to relax this rule to also allow mapping of supplemental
> > > groups (1:1) of the caller?
> > >
> > > Thanks.
> > >
> > > --
> > > Dmitry
> >
> > Hi,
> >
> > Is there a use case where adding those to /etc/subgid is onerous?
> > (There probably is, just would like to see yours)
> 
> We on Chrome OS limit number of suid binaries installed on the system,
> so newgidmap does not have necessary privileges to carry out this

 good goal in general so long as you don't take a few huge
monolithic suid binaries instead of more simpler ones :)

> operation. Also we are looking for a solution that we can use with our
> minijail package where spawning additional binary is challenging even
> if it was suid.

Ok.  So fwiw I think what you propose should be ok.  I think you should
post a patch to do it.  It's very possible that seeing that patch will
remind us of the reason why it *is* a bad idea, but seeing the patch may
be a required shock to elicit that memory.

-serge


Re: Allowing mapping supplemental groups in user namespace?

2019-03-28 Thread Serge E. Hallyn
On Thu, Feb 28, 2019 at 11:27:38AM -0800, Dmitry Torokhov wrote:
> Hi Eric,
> 
> Currently, unless caller has CAP_SETGID in parent namespace, we can
> only map effective group id in the new user namespace. Would it be
> possible to relax this rule to also allow mapping of supplemental
> groups (1:1) of the caller?
> 
> Thanks.
> 
> -- 
> Dmitry

Hi,

Is there a use case where adding those to /etc/subgid is onerous?
(There probably is, just would like to see yours)

thanks,
-serge


Re: pidfd design

2019-03-24 Thread Serge E. Hallyn
On Wed, Mar 20, 2019 at 12:29:31PM -0700, Daniel Colascione wrote:
> On Wed, Mar 20, 2019 at 11:52 AM Christian Brauner  
> wrote:
> > I really want to see Joel's pidfd_wait() patchset and have more people
> > review the actual code.
> 
> Sure. But it's also unpleasant to have people write code and then have
> to throw it away due to guessing incorrectly about unclear
> requirements.

No, it is not.  It is not unpleasant.  And it is useful.  It is the best way to
identify and resolve those incorrect guesses and unclear requirements.


Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android

2019-03-17 Thread Serge E. Hallyn
On Sun, Mar 17, 2019 at 10:11:10AM -0700, Daniel Colascione wrote:
> On Sun, Mar 17, 2019 at 9:35 AM Serge E. Hallyn  wrote:
> >
> > On Sun, Mar 17, 2019 at 12:42:40PM +0100, Christian Brauner wrote:
> > > On Sat, Mar 16, 2019 at 09:53:06PM -0400, Joel Fernandes wrote:
> > > > On Sat, Mar 16, 2019 at 12:37:18PM -0700, Suren Baghdasaryan wrote:
> > > > > On Sat, Mar 16, 2019 at 11:57 AM Christian Brauner 
> > > > >  wrote:
> > > > > >
> > > > > > On Sat, Mar 16, 2019 at 11:00:10AM -0700, Daniel Colascione wrote:
> > > > > > > On Sat, Mar 16, 2019 at 10:31 AM Suren Baghdasaryan 
> > > > > > >  wrote:
> > > > > > > >
> > > > > > > > On Fri, Mar 15, 2019 at 11:49 AM Joel Fernandes 
> > > > > > > >  wrote:
> > > > > > > > >
> > > > > > > > > On Fri, Mar 15, 2019 at 07:24:28PM +0100, Christian Brauner 
> > > > > > > > > wrote:
> > > > > > > > > [..]
> > > > > > > > > > > why do we want to add a new syscall (pidfd_wait) though? 
> > > > > > > > > > > Why not just use
> > > > > > > > > > > standard poll/epoll interface on the proc fd like Daniel 
> > > > > > > > > > > was suggesting.
> > > > > > > > > > > AFAIK, once the proc file is opened, the struct pid is 
> > > > > > > > > > > essentially pinned
> > > > > > > > > > > even though the proc number may be reused. Then the 
> > > > > > > > > > > caller can just poll.
> > > > > > > > > > > We can add a waitqueue to struct pid, and wake up any 
> > > > > > > > > > > waiters on process
> > > > > > > > > > > death (A quick look shows task_struct can be mapped to 
> > > > > > > > > > > its struct pid) and
> > > > > > > > > > > also possibly optimize it using Steve's TIF flag idea. No 
> > > > > > > > > > > new syscall is
> > > > > > > > > > > needed then, let me know if I missed something?
> > > > > > > > > >
> > > > > > > > > > Huh, I thought that Daniel was against the poll/epoll 
> > > > > > > > > > solution?
> > > > > > > > >
> > > > > > > > > Hmm, going through earlier threads, I believe so now. Here 
> > > > > > > > > was Daniel's
> > > > > > > > > reasoning about avoiding a notification about process death 
> > > > > > > > > through proc
> > > > > > > > > directory fd: 
> > > > > > > > > http://lkml.iu.edu/hypermail/linux/kernel/1811.0/00232.html
> > > > > > > > >
> > > > > > > > > May be a dedicated syscall for this would be cleaner after 
> > > > > > > > > all.
> > > > > > > >
> > > > > > > > Ah, I wish I've seen that discussion before...
> > > > > > > > syscall makes sense and it can be non-blocking and we can use
> > > > > > > > select/poll/epoll if we use eventfd.
> > > > > > >
> > > > > > > Thanks for taking a look.
> > > > > > >
> > > > > > > > I would strongly advocate for
> > > > > > > > non-blocking version or at least to have a non-blocking option.
> > > > > > >
> > > > > > > Waiting for FD readiness is *already* blocking or non-blocking
> > > > > > > according to the caller's desire --- users can pass options they 
> > > > > > > want
> > > > > > > to poll(2) or whatever. There's no need for any kind of special
> > > > > > > configuration knob or non-blocking option. We already *have* a
> > > > > > > non-blocking option that works universally for everything.
> > > > > > >
> > > > > > > As I mentioned in the linked thread, waiting for process exit 
> > > > > > > should
> > > > > > > work just like waiting for bytes to appear on a pipe. Process exit
> > > &

Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android

2019-03-17 Thread Serge E. Hallyn
On Sun, Mar 17, 2019 at 12:42:40PM +0100, Christian Brauner wrote:
> On Sat, Mar 16, 2019 at 09:53:06PM -0400, Joel Fernandes wrote:
> > On Sat, Mar 16, 2019 at 12:37:18PM -0700, Suren Baghdasaryan wrote:
> > > On Sat, Mar 16, 2019 at 11:57 AM Christian Brauner  
> > > wrote:
> > > >
> > > > On Sat, Mar 16, 2019 at 11:00:10AM -0700, Daniel Colascione wrote:
> > > > > On Sat, Mar 16, 2019 at 10:31 AM Suren Baghdasaryan 
> > > > >  wrote:
> > > > > >
> > > > > > On Fri, Mar 15, 2019 at 11:49 AM Joel Fernandes 
> > > > > >  wrote:
> > > > > > >
> > > > > > > On Fri, Mar 15, 2019 at 07:24:28PM +0100, Christian Brauner wrote:
> > > > > > > [..]
> > > > > > > > > why do we want to add a new syscall (pidfd_wait) though? Why 
> > > > > > > > > not just use
> > > > > > > > > standard poll/epoll interface on the proc fd like Daniel was 
> > > > > > > > > suggesting.
> > > > > > > > > AFAIK, once the proc file is opened, the struct pid is 
> > > > > > > > > essentially pinned
> > > > > > > > > even though the proc number may be reused. Then the caller 
> > > > > > > > > can just poll.
> > > > > > > > > We can add a waitqueue to struct pid, and wake up any waiters 
> > > > > > > > > on process
> > > > > > > > > death (A quick look shows task_struct can be mapped to its 
> > > > > > > > > struct pid) and
> > > > > > > > > also possibly optimize it using Steve's TIF flag idea. No new 
> > > > > > > > > syscall is
> > > > > > > > > needed then, let me know if I missed something?
> > > > > > > >
> > > > > > > > Huh, I thought that Daniel was against the poll/epoll solution?
> > > > > > >
> > > > > > > Hmm, going through earlier threads, I believe so now. Here was 
> > > > > > > Daniel's
> > > > > > > reasoning about avoiding a notification about process death 
> > > > > > > through proc
> > > > > > > directory fd: 
> > > > > > > http://lkml.iu.edu/hypermail/linux/kernel/1811.0/00232.html
> > > > > > >
> > > > > > > > Maybe a dedicated syscall for this would be cleaner after all.
> > > > > >
> > > > > > > Ah, I wish I'd seen that discussion before...
> > > > > > syscall makes sense and it can be non-blocking and we can use
> > > > > > select/poll/epoll if we use eventfd.
> > > > >
> > > > > Thanks for taking a look.
> > > > >
> > > > > > I would strongly advocate for
> > > > > > non-blocking version or at least to have a non-blocking option.
> > > > >
> > > > > Waiting for FD readiness is *already* blocking or non-blocking
> > > > > according to the caller's desire --- users can pass options they want
> > > > > to poll(2) or whatever. There's no need for any kind of special
> > > > > configuration knob or non-blocking option. We already *have* a
> > > > > non-blocking option that works universally for everything.
> > > > >
> > > > > As I mentioned in the linked thread, waiting for process exit should
> > > > > work just like waiting for bytes to appear on a pipe. Process exit
> > > > > status is just another blob of bytes that a process might receive. A
> > > > > process exit handle ought to be just another information source. The
> > > > > reason the unix process API is so awful is that for whatever reason
> > > > > the original designers treated processes as some kind of special kind
> > > > > of resource instead of fitting them into the otherwise general-purpose
> > > > > unix data-handling API. Let's not repeat that mistake.
> > > > >
> > > > > > Something like this:
> > > > > >
> > > > > > evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > > > > > // register eventfd to receive death notification
> > > > > > pidfd_wait(pid_to_kill, evfd);
> > > > > > // kill the process
> > > > > > pidfd_send_signal(pid_to_kill, ...)
> > > > > > // tend to other things
> > > > >
> > > > > Now you've lost me. pidfd_wait should return a *new* FD, not wire up
> > > > > an eventfd.
> > > > >
> > > 
> > > Ok, I probably misunderstood your post linked by Joel. I thought your
> > > original proposal was based on being able to poll a file under
> > > /proc/pid and then you changed your mind to have a separate syscall
> > > which I assumed would be a blocking one to wait for process exit.
> > > Maybe you can describe the new interface you are thinking about in
> > > terms of userspace usage like I did above? Several lines of code would
> > > explain more than paragraphs of text.
> > 
> > Hey, Thanks Suren for the eventfd idea. I agree with Daniel on this. The 
> > idea
> > from Daniel here is to wait for process death and exit events by just
> > referring to a stable fd, independent of whatever is going on in /proc.
> > 
> > What is needed is something like this (in highly pseudo-code form):
> > 
> > pidfd = opendir("/proc/<pid>",..);
> > wait_fd = pidfd_wait(pidfd);
> > read or poll wait_fd (non-blocking or blocking whichever)
> > 
> > wait_fd will block until the task has either died or been reaped. In both these
> > cases, it can return a suitable string such as "dead" or "reaped" although 
> > an
> > integer with some predefined meaning is also Ok.
> 

Re: [PATCH 4.20 282/352] fs/proc/base.c: use ns_capable instead of capable for timerslack_ns

2019-02-11 Thread Serge E. Hallyn
On Mon, Feb 11, 2019 at 07:02:06PM -0600, Eric W. Biederman wrote:
> Greg Kroah-Hartman  writes:
> 
> > 4.20-stable review patch.  If anyone has any objections, please let me
> > know.
> 
> No objection.  But I think of this as a feature addition rather than a
> fix for something.  As a feature that we now allow something we
> previously did not does this qualify for a backport to stable?

Hi,

I had the exact same thought when I saw this this morning, and was planning
on replying tonight.

> It is probably no more harmful in this instance than adding PCI IDs to a
> driver.  So I am not worried.  I am curious what the current guidelines
> are.
> 
> In most cases a small relaxation of permissions like this requires a lot
> of bug fixing as typically code protected by capable(CAP_XXX) has been
> written and tested assuming a trusted root user.  Those bug fixes are
> many times too large for a stable backport.
> 
> Eric
> 
> 
> > --
> >
> > [ Upstream commit 8da0b4f692c6d90b09c91f271517db746a22ff67 ]
> >
> > Access to timerslack_ns is controlled by a process having CAP_SYS_NICE
> > in its effective capability set, but the current check looks in the root
> > namespace instead of the process' user namespace.  Since a process is
> > allowed to do other activities controlled by CAP_SYS_NICE inside a
> > namespace, it should also be able to adjust timerslack_ns.
> >
> > Link: http://lkml.kernel.org/r/20181030180012.232896-1-bmgor...@google.com
> > Signed-off-by: Benjamin Gordon 
> > Acked-by: "Eric W. Biederman" 
> > Cc: John Stultz 
> > Cc: "Eric W. Biederman" 
> > Cc: Kees Cook 
> > Cc: "Serge E. Hallyn" 
> > Cc: Thomas Gleixner 
> > Cc: Arjan van de Ven 
> > Cc: Oren Laadan 
> > Cc: Ruchi Kandoi 
> > Cc: Rom Lemarchand 
> > Cc: Todd Kjos 
> > Cc: Colin Cross 
> > Cc: Nick Kralevich 
> > Cc: Dmitry Shmidt 
> > Cc: Elliott Hughes 
> > Cc: Alexey Dobriyan 
> > Signed-off-by: Andrew Morton 
> > Signed-off-by: Linus Torvalds 
> > Signed-off-by: Sasha Levin 
> > ---
> >  fs/proc/base.c | 12 +---
> >  1 file changed, 9 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/proc/base.c b/fs/proc/base.c
> > index ce3465479447..98525af0953e 100644
> > --- a/fs/proc/base.c
> > +++ b/fs/proc/base.c
> > @@ -2356,10 +2356,13 @@ static ssize_t timerslack_ns_write(struct file 
> > *file, const char __user *buf,
> > return -ESRCH;
> >  
> > if (p != current) {
> > -   if (!capable(CAP_SYS_NICE)) {
> > +   rcu_read_lock();
> > +   if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
> > +   rcu_read_unlock();
> > count = -EPERM;
> > goto out;
> > }
> > +   rcu_read_unlock();
> >  
> > err = security_task_setscheduler(p);
> > if (err) {
> > @@ -2392,11 +2395,14 @@ static int timerslack_ns_show(struct seq_file *m, 
> > void *v)
> > return -ESRCH;
> >  
> > if (p != current) {
> > -
> > -   if (!capable(CAP_SYS_NICE)) {
> > +   rcu_read_lock();
> > +   if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
> > +   rcu_read_unlock();
> > err = -EPERM;
> > goto out;
> > }
> > +   rcu_read_unlock();
> > +
> > err = security_task_getscheduler(p);
> > if (err)
> > goto out;


Re: [PATCH ghak103 V1] audit: add support for fcaps v3

2019-01-24 Thread Serge E. Hallyn
On Wed, Jan 23, 2019 at 09:36:25PM -0500, Richard Guy Briggs wrote:
> V3 namespaced file capabilities were introduced in
> commit 8db6c34f1dbc ("Introduce v3 namespaced file capabilities")
> 
> Add support for these by adding the "frootid" field to the existing
> fcaps fields in the NAME and BPRM_FCAPS records.
> 
> Please see github issue
> https://github.com/linux-audit/audit-kernel/issues/103
> 
> Signed-off-by: Richard Guy Briggs 

Looks like good info to have,

Acked-by: Serge Hallyn 

> ---
> Passes audit-testsuite.
> 
>  include/linux/capability.h | 5 +++--
>  kernel/audit.c | 6 --
>  kernel/audit.h | 1 +
>  kernel/auditsc.c   | 4 
>  security/commoncap.c   | 2 ++
>  5 files changed, 14 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index f640dcbc880c..f6bb691547fd 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -14,7 +14,7 @@
>  #define _LINUX_CAPABILITY_H
>  
>  #include 
> -
> +#include 
>  
>  #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
>  #define _KERNEL_CAPABILITY_U32S_LINUX_CAPABILITY_U32S_3
> @@ -25,11 +25,12 @@
>   __u32 cap[_KERNEL_CAPABILITY_U32S];
>  } kernel_cap_t;
>  
> -/* exact same as vfs_cap_data but in cpu endian and always filled completely 
> */
> +/* exact same as vfs_ns_cap_data but in cpu endian and always filled 
> completely */
>  struct cpu_vfs_cap_data {
>   __u32 magic_etc;
>   kernel_cap_t permitted;
>   kernel_cap_t inheritable;
> + kuid_t rootid;
>  };
>  
>  #define _USER_CAP_HEADER_SIZE  (sizeof(struct __user_cap_header_struct))
> diff --git a/kernel/audit.c b/kernel/audit.c
> index ca55ccb46b76..6f5eeb658ccb 100644
> --- a/kernel/audit.c
> +++ b/kernel/audit.c
> @@ -2083,8 +2083,9 @@ static void audit_log_fcaps(struct audit_buffer *ab, 
> struct audit_names *name)
>  {
>   audit_log_cap(ab, "cap_fp", >fcap.permitted);
>   audit_log_cap(ab, "cap_fi", >fcap.inheritable);
> - audit_log_format(ab, " cap_fe=%d cap_fver=%x",
> -  name->fcap.fE, name->fcap_ver);
> + audit_log_format(ab, " cap_fe=%d cap_fver=%x cap_frootid=%d",
> +  name->fcap.fE, name->fcap_ver,
> +  from_kuid(_user_ns, name->fcap.rootid));
>  }
>  
>  static inline int audit_copy_fcaps(struct audit_names *name,
> @@ -2103,6 +2104,7 @@ static inline int audit_copy_fcaps(struct audit_names 
> *name,
>   name->fcap.permitted = caps.permitted;
>   name->fcap.inheritable = caps.inheritable;
>   name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
> + name->fcap.rootid = caps.rootid;
>   name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
>   VFS_CAP_REVISION_SHIFT;
>  
> diff --git a/kernel/audit.h b/kernel/audit.h
> index 6ffb70575082..deefdbe61a47 100644
> --- a/kernel/audit.h
> +++ b/kernel/audit.h
> @@ -69,6 +69,7 @@ struct audit_cap_data {
>   kernel_cap_teffective;  /* effective set of process */
>   };
>   kernel_cap_tambient;
> + kuid_t  rootid;
>  };
>  
>  /* When fs/namei.c:getname() is called, we store the pointer in name and bump
> diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> index b585ceb2f7a2..461c52eff870 100644
> --- a/kernel/auditsc.c
> +++ b/kernel/auditsc.c
> @@ -1358,6 +1358,9 @@ static void audit_log_exit(void)
>   audit_log_cap(ab, "pi", >new_pcap.inheritable);
>   audit_log_cap(ab, "pe", >new_pcap.effective);
>   audit_log_cap(ab, "pa", >new_pcap.ambient);
> + audit_log_format(ab, " frootid=%d",
> +  from_kuid(_user_ns,
> +axs->fcap.rootid));
>   break; }
>  
>   }
> @@ -2355,6 +2358,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
>   ax->fcap.permitted = vcaps.permitted;
>   ax->fcap.inheritable = vcaps.inheritable;
>   ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
> + ax->fcap.rootid = vcaps.rootid;
>   ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> 
> VFS_CAP_REVISION_SHIFT;
>  
>   ax->old_pcap.permitted   = old->cap_permitted;
> diff --git a/security/commoncap.c b/security/commoncap.c
> index 232db019f051..c097f3568001 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -643,6 +643,8 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, 
> struct cpu_vfs_cap_data
>   cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
>   cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
>  
> + cpu_caps->rootid = rootkuid;
> +
>   return 0;
>  }
>  
> -- 
> 1.8.3.1


Re: [PATCH v7 2/2] selftests: add tests for pidfd_send_signal()

2019-01-08 Thread Serge E. Hallyn
On Tue, Jan 08, 2019 at 11:20:23AM -0700, Tycho Andersen wrote:
> On Tue, Jan 08, 2019 at 12:17:42PM -0600, Serge E. Hallyn wrote:
> > On Tue, Jan 08, 2019 at 10:58:43AM -0700, Tycho Andersen wrote:
> > > On Tue, Jan 08, 2019 at 11:54:15AM -0600, Serge E. Hallyn wrote:
> > > > On Tue, Jan 08, 2019 at 10:53:06AM -0700, Tycho Andersen wrote:
> > > > > On Wed, Jan 02, 2019 at 05:16:54PM +0100, Christian Brauner wrote:
> > > > > > +   /*
> > > > > > +* Stop the child so we can inspect whether we 
> > > > > > have
> > > > > > +* recycled pid PID_RECYCLE.
> > > > > > +*/
> > > > > > +   close(pipe_fds[0]);
> > > > > > +   ret = kill(recycled_pid, SIGSTOP);
> > > > > > +   close(pipe_fds[1]);
> > > > > > +   if (ret) {
> > > > > > +   (void)wait_for_pid(recycled_pid);
> > > > > > +   _exit(PIDFD_ERROR);
> > > > > > +   }
> > > > > 
> > > > > Sorry for being late to the party, but I wonder if this whole thing
> > > > > > couldn't be simplified with /proc/sys/kernel/ns_last_pid?
> > > > 
> > > > no, bc it's not namespaced :)
> > > 
> > > Huh? It looks like it is...
> > > 
> > > static int pid_ns_ctl_handler(struct ctl_table *table, int write,
> > > void __user *buffer, size_t *lenp, loff_t *ppos)
> > > {
> > > struct pid_namespace *pid_ns = task_active_pid_ns(current);
> > > struct ctl_table tmp = *table;
> > > int ret, next;
> > > 
> > > if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
> > > return -EPERM;
> > > 
> > > ...
> > 
> > Oh - hah, but that's ns_last_pid.  You'd want pid_max.  And that one
> > is not namespaced.
> 
> Perhaps I'm misunderstanding, but isn't the point of all this code to
> get the same pid again? So can't we just fork(), kill(), then set
> ns_last_pid to pid-1, and fork() again to re-use?

Oh yeah that would work :)

I was stuck on the idea of just limiting the range of pids.


Re: [PATCH v7 2/2] selftests: add tests for pidfd_send_signal()

2019-01-08 Thread Serge E. Hallyn
On Tue, Jan 08, 2019 at 10:58:43AM -0700, Tycho Andersen wrote:
> On Tue, Jan 08, 2019 at 11:54:15AM -0600, Serge E. Hallyn wrote:
> > On Tue, Jan 08, 2019 at 10:53:06AM -0700, Tycho Andersen wrote:
> > > On Wed, Jan 02, 2019 at 05:16:54PM +0100, Christian Brauner wrote:
> > > > +   /*
> > > > +* Stop the child so we can inspect whether we 
> > > > have
> > > > +* recycled pid PID_RECYCLE.
> > > > +*/
> > > > +   close(pipe_fds[0]);
> > > > +   ret = kill(recycled_pid, SIGSTOP);
> > > > +   close(pipe_fds[1]);
> > > > +   if (ret) {
> > > > +   (void)wait_for_pid(recycled_pid);
> > > > +   _exit(PIDFD_ERROR);
> > > > +   }
> > > 
> > > Sorry for being late to the party, but I wonder if this whole thing
> > > > couldn't be simplified with /proc/sys/kernel/ns_last_pid?
> > 
> > no, bc it's not namespaced :)
> 
> Huh? It looks like it is...
> 
> static int pid_ns_ctl_handler(struct ctl_table *table, int write,
> void __user *buffer, size_t *lenp, loff_t *ppos)
> {
> struct pid_namespace *pid_ns = task_active_pid_ns(current);
> struct ctl_table tmp = *table;
> int ret, next;
> 
> if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
> return -EPERM;
> 
> ...

Oh - hah, but that's ns_last_pid.  You'd want pid_max.  And that one
is not namespaced.


Re: [PATCH v7 2/2] selftests: add tests for pidfd_send_signal()

2019-01-08 Thread Serge E. Hallyn
On Tue, Jan 08, 2019 at 10:53:06AM -0700, Tycho Andersen wrote:
> On Wed, Jan 02, 2019 at 05:16:54PM +0100, Christian Brauner wrote:
> > +   /*
> > +* Stop the child so we can inspect whether we have
> > +* recycled pid PID_RECYCLE.
> > +*/
> > +   close(pipe_fds[0]);
> > +   ret = kill(recycled_pid, SIGSTOP);
> > +   close(pipe_fds[1]);
> > +   if (ret) {
> > +   (void)wait_for_pid(recycled_pid);
> > +   _exit(PIDFD_ERROR);
> > +   }
> 
> Sorry for being late to the party, but I wonder if this whole thing
> > couldn't be simplified with /proc/sys/kernel/ns_last_pid?

no, bc it's not namespaced :)


Re: [PATCH v6 2/2] selftests: add tests for pidfd_send_signal()

2019-01-01 Thread Serge E. Hallyn
On Tue, Jan 01, 2019 at 04:07:44PM +0100, Christian Brauner wrote:
> On Mon, Dec 31, 2018 at 12:27:13AM +0100, Christian Brauner wrote:
> > On Sun, Dec 30, 2018 at 03:02:45PM -0600, Serge Hallyn wrote:
> > > On Sat, Dec 29, 2018 at 11:27:56PM +0100, Christian Brauner wrote:
> > > > As suggested by Andrew Morton in [1] add selftests for the new
> > > > sys_pidfd_send_signal() syscall.
> > > > This tests whether we can send a signal to an existing process and 
> > > > whether
> > > > sending a signal to a process that has already exited fails with ESRCH.
> > > > 
> > > > [1]: 
> > > > https://lore.kernel.org/lkml/20181228152012.dbf0508c2508138efc5f2...@linux-foundation.org/
> > > > 
> > > > Cc: Arnd Bergmann 
> > > > Cc: "Eric W. Biederman" 
> > > > Cc: Kees Cook 
> > > > Cc: Serge Hallyn 
> > > 
> > > Acked-by: Serge Hallyn 
> > > 
> > > Not saying you need to do this, but it would be neat if you could test
> > > sending to a pid which has been recycled :)
> > 
> > Yeah, I thought about it but it's a little weird code. First of all, we
> > can't set /proc/sys/kernel/pid_max to a very low value since this is a
> > system wide setting. So we need to recycle a lot via fork(). Something
> > along the lines of:
> > - unshare pid namespace
> > - fork to create pid 1 in new pid namespace
> > - cycle with fork() until pid > 300 since pids lower than 300 are
> >   reserved by the kernel.
> >   (That means if we simply use the first fork() after we created pid 1 we
> >   would never be able to recycle the pid since we skip over it. :))
> > - get pidfd to the pid > 300 we just created
> > - wait on the pid > 300
> > - cycle via fork() until we have reached the same pid > 300 again
> > - send SIGSTOP to that recycled process
> > - test that we cannot send SIGCONT to this SIGSTOPed task via the pidfd we
> >   received before
> > - send SIGCONT to the SIGSTOPed recycled pid and exit
> 
> Ok, I have something like this in my tree now that tests for pid
> recycling. I'm going to send it out tomorrow since I reckon Andrew and
> others will be off today.
> But fwiw it sits in 
> https://github.com/brauner/linux/commits/2018-12-02/procfds

Thanks, that shows off the advantages of the new syscall :)

-serge


Re: [PATCH v6 2/2] selftests: add tests for pidfd_send_signal()

2018-12-30 Thread Serge E. Hallyn
On Sat, Dec 29, 2018 at 11:27:56PM +0100, Christian Brauner wrote:
> As suggested by Andrew Morton in [1] add selftests for the new
> sys_pidfd_send_signal() syscall.
> This tests whether we can send a signal to an existing process and whether
> sending a signal to a process that has already exited fails with ESRCH.
> 
> [1]: 
> https://lore.kernel.org/lkml/20181228152012.dbf0508c2508138efc5f2...@linux-foundation.org/
> 
> Cc: Arnd Bergmann 
> Cc: "Eric W. Biederman" 
> Cc: Kees Cook 
> Cc: Serge Hallyn 

Acked-by: Serge Hallyn 

Not saying you need to do this, but it would be neat if you could test
sending to a pid which has been recycled :)

> Cc: Jann Horn 
> Cc: Andy Lutomirsky 
> Cc: Andrew Morton 
> Cc: Oleg Nesterov 
> Cc: Aleksa Sarai 
> Cc: Al Viro 
> Cc: Florian Weimer 
> Signed-off-by: Christian Brauner 
> ---
> /* Changelog */
> v6:
> - patch introduced
> v5..v0:
> - patch not present
> ---
>  tools/testing/selftests/Makefile   |   1 +
>  tools/testing/selftests/pidfd/Makefile |   6 +
>  tools/testing/selftests/pidfd/pidfd_test.c | 130 +
>  3 files changed, 137 insertions(+)
>  create mode 100644 tools/testing/selftests/pidfd/Makefile
>  create mode 100644 tools/testing/selftests/pidfd/pidfd_test.c
> 
> diff --git a/tools/testing/selftests/Makefile 
> b/tools/testing/selftests/Makefile
> index 24b9934fb269..63b0d8a0ebf7 100644
> --- a/tools/testing/selftests/Makefile
> +++ b/tools/testing/selftests/Makefile
> @@ -27,6 +27,7 @@ TARGETS += net
>  TARGETS += netfilter
>  TARGETS += networking/timestamping
>  TARGETS += nsfs
> +TARGETS += pidfd
>  TARGETS += powerpc
>  TARGETS += proc
>  TARGETS += pstore
> diff --git a/tools/testing/selftests/pidfd/Makefile 
> b/tools/testing/selftests/pidfd/Makefile
> new file mode 100644
> index ..deaf8073bc06
> --- /dev/null
> +++ b/tools/testing/selftests/pidfd/Makefile
> @@ -0,0 +1,6 @@
> +CFLAGS += -g -I../../../../usr/include/
> +
> +TEST_GEN_PROGS := pidfd_test
> +
> +include ../lib.mk
> +
> diff --git a/tools/testing/selftests/pidfd/pidfd_test.c 
> b/tools/testing/selftests/pidfd/pidfd_test.c
> new file mode 100644
> index ..edcd59979b10
> --- /dev/null
> +++ b/tools/testing/selftests/pidfd/pidfd_test.c
> @@ -0,0 +1,130 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#define _GNU_SOURCE
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "../kselftest.h"
> +
> +static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
> + unsigned int flags)
> +{
> + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
> +}
> +
> +static int signal_received;
> +
> +static void do_exit_success(int sig)
> +{
> + signal_received = 1;
> +}
> +
> +/*
> + * Straightforward test to see whether pidfd_send_signal() works is to send
> + * a signal to ourselves.
> + */
> +static int test_pidfd_send_signal_simple_success(void)
> +{
> + int pidfd, ret;
> + const char *test_name = "pidfd_send_signal send SIGUSR1";
> +
> + pidfd = open("/proc/self", O_DIRECTORY | O_CLOEXEC);
> + if (pidfd < 0)
> + ksft_exit_fail_msg(
> + "%s test: Failed to open process file descriptor\n",
> + test_name);
> +
> + signal(SIGUSR1, do_exit_success);
> +
> + ret = sys_pidfd_send_signal(pidfd, SIGUSR1, NULL, 0);
> + close(pidfd);
> + if (ret < 0)
> + ksft_exit_fail_msg("%s test: Failed to send signal\n",
> +test_name);
> +
> + if (signal_received != 1)
> + ksft_exit_fail_msg("%s test: Failed to receive signal\n",
> +test_name);
> +
> + signal_received = 0;
> + ksft_test_result_pass("%s test: Sent signal\n", test_name);
> + return 0;
> +}
> +
> +static void wait_for_pid(pid_t pid)
> +{
> + int status, ret;
> +
> +again:
> + ret = waitpid(pid, , 0);
> + if (ret == -1) {
> + if (errno == EINTR)
> + goto again;
> +
> + return;
> + }
> +
> + if (ret != pid)
> + goto again;
> +}
> +
> +static int test_pidfd_send_signal_exited_fail(void)
> +{
> + int pidfd, ret, saved_errno;
> + char buf[256];
> + pid_t pid;
> + const char *test_name = "pidfd_send_signal signal exited process";
> +
> + pid = fork();
> + if (pid < 0)
> + ksft_exit_fail_msg("%s test: Failed to create new process\n",
> +test_name);
> +
> + if (pid == 0)
> + _exit(EXIT_SUCCESS);
> +
> + snprintf(buf, sizeof(buf), "/proc/%d", pid);
> +
> + pidfd = open(buf, O_DIRECTORY | O_CLOEXEC);
> +
> + wait_for_pid(pid);
> +
> + if (pidfd < 0)
> + ksft_exit_fail_msg(
> + "%s test: Failed to open process file descriptor\n",
> + 

Re: [PATCH v5 1/1] signal: add pidfd_send_signal() syscall

2018-12-13 Thread Serge E. Hallyn
On Sat, Dec 08, 2018 at 06:40:59AM +0100, Christian Brauner wrote:
> The kill() syscall operates on process identifiers (pid). After a process
> has exited its pid can be reused by another process. If a caller sends a
> signal to a reused pid it will end up signaling the wrong process. This
> issue has often surfaced and there has been a push to address this problem 
> [1].
> 
> This patch uses file descriptors (fd) from proc/ as stable handles on
> struct pid. Even if a pid is recycled the handle will not change. The fd
> can be used to send signals to the process it refers to.
> Thus, the new syscall pidfd_send_signal() is introduced to solve this
> problem. Instead of pids it operates on process fds (pidfd).
> 
> /* prototype and arguments */
> long pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int 
> flags);
> 
> In addition to the pidfd and signal argument it takes an additional
> siginfo_t and flags argument. If the siginfo_t argument is NULL then
> pidfd_send_signal() is equivalent to kill(<positive-pid>, <signal>). If it
> is not NULL pidfd_send_signal() is equivalent to rt_sigqueueinfo().
> The flags argument is added to allow for future extensions of this syscall.
> It currently needs to be passed as 0. Failing to do so will cause EINVAL.
> 
> /* pidfd_send_signal() replaces multiple pid-based syscalls */
> The pidfd_send_signal() syscall currently takes on the job of
> rt_sigqueueinfo(2) and parts of the functionality of kill(2), namely, when a
> positive pid is passed to kill(2). It will however be possible to also
> replace tgkill(2) and rt_tgsigqueueinfo(2) if this syscall is extended.
> 
> /* sending signals to threads (tid) and process groups (pgid) */
> Specifically, the pidfd_send_signal() syscall does currently not operate on
> process groups or threads. This is left for future extensions.
> In order to extend the syscall to allow sending signal to threads and
> process groups appropriately named flags (e.g. PIDFD_TYPE_PGID, and
> PIDFD_TYPE_TID) should be added. This implies that the flags argument will
> determine what is signaled and not the file descriptor itself. Put in other
> words, grouping in this api is a property of the flags argument not a
> property of the file descriptor (cf. [13]).
> When appropriate extensions through the flags argument are added then
> pidfd_send_signal() can additionally replace the part of kill(2) which
> operates on process groups as well as the tgkill(2) and
> rt_tgsigqueueinfo(2) syscalls.
> How such an extension could be implemented has been very roughly sketched
> in [14], [15], and [16]. However, this should not be taken as a commitment
> to a particular implementation. There might be better ways to do it.
> Right now this is intentionally left out to keep this patchset as simple as
> possible (cf. [4]). For example, if a pidfd for a tid from
> /proc/<pid>/task/<tid> is passed EOPNOTSUPP will be returned to give
> userspace a way to detect when I add support for signaling to threads (cf. 
> [10]).
> 
> /* naming */
> The syscall had various names throughout iterations of this patchset:
> - procfd_signal()
> - procfd_send_signal()
> - taskfd_send_signal()
> In the last round of reviews it was pointed out that given that if the
> flags argument decides the scope of the signal instead of different types
> of fds it might make sense to either settle for "procfd_" or "pidfd_" as
> prefix. The community was willing to accept either (cf. [17] and [18]).
> Given that one developer expressed strong preference for the "pidfd_"
> prefix (cf. [13]) and with other developers less opinionated about the name
> we should settle for "pidfd_" to avoid further bikeshedding.
> 
> The  "_send_signal" suffix was chosen to reflect the fact that the syscall
> takes on the job of multiple syscalls. It is therefore intentional that the
> name is reminiscent of neither kill(2) nor rt_sigqueueinfo(2). Not the
> former because it might imply that pidfd_send_signal() is a replacement for
> kill(2), and not the latter because it is a hassle to remember the correct
> spelling - especially for non-native speakers - and because it is not
> descriptive enough of what the syscall actually does. The name
> "pidfd_send_signal" makes it very clear that its job is to send signals.
> 
> /* O_PATH file descriptors */
> pidfds opened as O_PATH fds cannot be used to send signals to a process
> (cf. [2]). Signaling processes through pidfds is the equivalent of writing
> to a file. Thus, this is not an operation that operates "purely at the file
> descriptor level" as required by the open(2) manpage.
> 
> /* zombies */
> Zombies can be signaled just as any other process. No special error will be
> reported since a zombie state is an unreliable state (cf. [3]). However,
> this can be added as an extension through the @flags argument if the need
> ever arises.
> 
> /* cross-namespace signals */
> The patch currently enforces that the signaler and signalee either are in
> the same pid namespace or that the 

Re: [PATCH resend] eventfd: make eventfd files distinguishable in /proc/$PID/fd

2018-12-11 Thread Serge E. Hallyn
On Mon, Dec 10, 2018 at 03:35:46AM +0900, Masatake YAMATO wrote:
> Finding endpoints of an IPC channel is one of the essential tasks to
> understand how a user program works. Procfs and netlink socket provide
> enough hints to find endpoints for IPC channels like pipes, unix
> sockets, and pseudo terminals. However, there is no simple way to find
> endpoints for an eventfd file from userland. An inode number doesn't
> hint. Unlike pipe, all eventfd files share one inode object.
> 
> To provide the way to find endpoints of an eventfd file, this patch
> adds eventfd identifiers to the output of 'ls -l /proc/$pid/fd' like:
> 
>   ...
>   lrwx--. 1 qemu qemu 64 May 20 04:49 93 -> 'anon_inode:[eventfd:130]'
>   lrwx--. 1 qemu qemu 64 May 20 04:49 94 -> 'anon_inode:[eventfd:131]'
>   ...
> 
> Here "130" and "131" are the newly added identifiers.
> In the case that ida_simple_get returns an error, this change doesn't add
> an identifier; just use "[eventfd]" as before.
> 
> Signed-off-by: Masatake YAMATO 

I'm going to love this when I need it :)  Thanks.

Acked-by: Serge Hallyn 

> ---
>  fs/eventfd.c | 14 +-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/eventfd.c b/fs/eventfd.c
> index 08d3bd602f73..c18952948110 100644
> --- a/fs/eventfd.c
> +++ b/fs/eventfd.c
> @@ -21,6 +21,11 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +
> +/* Worst case buffer size needed for holding an integer. */
> +#define ITOA_MAX_LEN 12
> +DEFINE_IDA(eventfd_ida);
>  
>  struct eventfd_ctx {
>   struct kref kref;
> @@ -35,6 +40,7 @@ struct eventfd_ctx {
>*/
>   __u64 count;
>   unsigned int flags;
> + int id;
>  };
>  
>  /**
> @@ -69,6 +75,8 @@ EXPORT_SYMBOL_GPL(eventfd_signal);
>  
>  static void eventfd_free_ctx(struct eventfd_ctx *ctx)
>  {
> + if (ctx->id >= 0)
> + ida_simple_remove(_ida, ctx->id);
>   kfree(ctx);
>  }
>  
> @@ -384,6 +392,7 @@ static int do_eventfd(unsigned int count, int flags)
>  {
>   struct eventfd_ctx *ctx;
>   int fd;
> + char name[1 + 8 + ITOA_MAX_LEN + 1 + 1] = "[eventfd]";
>  
>   /* Check the EFD_* constants for consistency.  */
>   BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
> @@ -400,8 +409,11 @@ static int do_eventfd(unsigned int count, int flags)
>   init_waitqueue_head(>wqh);
>   ctx->count = count;
>   ctx->flags = flags;
> + ctx->id = ida_simple_get(_ida, 0, 0, GFP_KERNEL);
>  
> - fd = anon_inode_getfd("[eventfd]", _fops, ctx,
> + if (ctx->id >= 0)
> + snprintf(name, sizeof(name), "[eventfd:%d]", ctx->id);
> + fd = anon_inode_getfd(name, _fops, ctx,
> O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
>   if (fd < 0)
>   eventfd_free_ctx(ctx);
> -- 
> 2.17.0


Re: [PATCH v10 4/4] samples: add an example of seccomp user trap

2018-12-11 Thread Serge E. Hallyn
On Sun, Dec 09, 2018 at 11:24:14AM -0700, Tycho Andersen wrote:
> The idea here is just to give a demonstration of how one could safely use
> the SECCOMP_RET_USER_NOTIF feature to do mount policies. This particular
> policy is (as noted in the comment) not very interesting, but it serves to
> illustrate how one might apply a policy dodging the various TOCTOU issues.
> 
> Signed-off-by: Tycho Andersen 
> CC: Kees Cook 
> CC: Andy Lutomirski 
> CC: Oleg Nesterov 
> CC: Eric W. Biederman 
> CC: "Serge E. Hallyn" 
> CC: Christian Brauner 
> CC: Tyler Hicks 
> CC: Akihiro Suda 
> ---
> v5: new in v5
> v7: updates for v7 API changes
> v8: * add some more comments about what's happening in main() (Kees)
> * move from ptrace API to SECCOMP_FILTER_FLAG_NEW_LISTENER
> v9: * s/mknod/mount in error message
> * switch to the SECCOMP_GET_NOTIF_SIZES API
> * add a note about getting ENOENT from SECCOMP_IOCTL_NOTIF_SEND
> ---
>  samples/seccomp/.gitignore  |   1 +
>  samples/seccomp/Makefile|   7 +-
>  samples/seccomp/user-trap.c | 375 
>  3 files changed, 382 insertions(+), 1 deletion(-)
> 
> diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore
> index 78fb78184291..d1e2e817d556 100644
> --- a/samples/seccomp/.gitignore
> +++ b/samples/seccomp/.gitignore
> @@ -1,3 +1,4 @@
>  bpf-direct
>  bpf-fancy
>  dropper
> +user-trap
> diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile
> index cf34ff6b4065..4920903c8009 100644
> --- a/samples/seccomp/Makefile
> +++ b/samples/seccomp/Makefile
> @@ -1,6 +1,6 @@
>  # SPDX-License-Identifier: GPL-2.0
>  ifndef CROSS_COMPILE
> -hostprogs-$(CONFIG_SAMPLE_SECCOMP) := bpf-fancy dropper bpf-direct
> +hostprogs-$(CONFIG_SAMPLE_SECCOMP) := bpf-fancy dropper bpf-direct user-trap
>  
>  HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include
>  HOSTCFLAGS_bpf-fancy.o += -idirafter $(objtree)/include
> @@ -16,6 +16,10 @@ HOSTCFLAGS_bpf-direct.o += -I$(objtree)/usr/include
>  HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include
>  bpf-direct-objs := bpf-direct.o
>  
> +HOSTCFLAGS_user-trap.o += -I$(objtree)/usr/include
> +HOSTCFLAGS_user-trap.o += -idirafter $(objtree)/include
> +user-trap-objs := user-trap.o
> +
>  # Try to match the kernel target.
>  ifndef CONFIG_64BIT
>  
> @@ -33,6 +37,7 @@ HOSTCFLAGS_bpf-fancy.o += $(MFLAG)
>  HOSTLDLIBS_bpf-direct += $(MFLAG)
>  HOSTLDLIBS_bpf-fancy += $(MFLAG)
>  HOSTLDLIBS_dropper += $(MFLAG)
> +HOSTLDLIBS_user-trap += $(MFLAG)
>  endif
>  always := $(hostprogs-m)
>  endif
> diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c
> new file mode 100644
> index ..61267cb59c8e
> --- /dev/null
> +++ b/samples/seccomp/user-trap.c
> @@ -0,0 +1,375 @@
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
> +
> +static int seccomp(unsigned int op, unsigned int flags, void *args)
> +{
> + errno = 0;
> + return syscall(__NR_seccomp, op, flags, args);
> +}
> +
> +static int send_fd(int sock, int fd)
> +{
> + struct msghdr msg = {};
> + struct cmsghdr *cmsg;
> + char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
> + struct iovec io = {
> + .iov_base = &c,
> + .iov_len = 1,
> + };
> +
> + msg.msg_iov = &io;
> + msg.msg_iovlen = 1;
> + msg.msg_control = buf;
> + msg.msg_controllen = sizeof(buf);
> + cmsg = CMSG_FIRSTHDR(&msg);
> + cmsg->cmsg_level = SOL_SOCKET;
> + cmsg->cmsg_type = SCM_RIGHTS;
> + cmsg->cmsg_len = CMSG_LEN(sizeof(int));
> + *((int *)CMSG_DATA(cmsg)) = fd;
> + msg.msg_controllen = cmsg->cmsg_len;
> +
> + if (sendmsg(sock, &msg, 0) < 0) {
> + perror("sendmsg");
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> +static int recv_fd(int sock)
> +{
> + struct msghdr msg = {};
> + struct cmsghdr *cmsg;
> + char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
> + struct iovec io = {
> + .iov_base = &c,
> + .iov_len = 1,
> + };
> +
> + msg.msg_iov = &io;
> + msg.msg_iovlen = 1;
> + msg.msg_control = buf;
> + msg.msg_controllen = sizeof(buf);
> +
> + if (recvmsg(sock, &msg, 0) < 0) {
> + perror("recvmsg");

Re: [PATCH v4] signal: add taskfd_send_signal() syscall

2018-12-07 Thread Serge E. Hallyn
On Fri, Dec 07, 2018 at 02:54:25AM +0100, Christian Brauner wrote:
> On Thu, Dec 06, 2018 at 05:39:18PM -0800, Daniel Colascione wrote:
> > On Thu, Dec 6, 2018 at 4:59 PM Serge E. Hallyn  wrote:
> > >
> > > On Thu, Dec 06, 2018 at 04:34:54PM -0800, Daniel Colascione wrote:
> > > > On Thu, Dec 6, 2018 at 4:31 PM Serge E. Hallyn  wrote:
> > > > >
> > > > > On Fri, Dec 07, 2018 at 12:17:45AM +0100, Christian Brauner wrote:
> > > > > > On Thu, Dec 06, 2018 at 11:39:48PM +0100, Christian Brauner wrote:
> > > > > > > On Thu, Dec 06, 2018 at 03:46:53PM -0600, Eric W. Biederman wrote:
> > > > > > > > Christian Brauner  writes:
> > > > > > > >
> > > > > > > > >> Your intention is to add the thread case to support pthreads 
> > > > > > > > >> once the
> > > > > > > > >> process case is sorted out.  So this is something that needs 
> > > > > > > > >> to be made
> > > > > > > > >> clear.  Did I miss how you plan to handle threads?
> > > > > > > > >
> > > > > > > > > Yeah, maybe you missed it in the commit message [2] which is 
> > > > > > > > > based on a
> > > > > > > > > discussion with Andy [3] and Arnd [4]:
> > > > > > > >
> > > > > > > > Looking at your references I haven't missed it.  You are not 
> > > > > > > > deciding
> > > > > > > > anything as of yet to keep it simple.  Except you are returning
> > > > > > > > EOPNOTSUPP.  You are very much intending to do something.
> > > > > > >
> > > > > > > > That was clear all along and was pointed at every occasion in the
> > > > > > > > threads. I even went through the hassle to give you all of the
> > > > > > > references when there's lore.kernel.org.
> > > > > > >
> > > > > > > >
> > > > > > > > Decide.  Do you use the flags parameter or is the width of the
> > > > > > > > target depending on the flags.
> > > > > >
> > > > > > Ok, let's try to be constructive. I understand the general concern 
> > > > > > for
> > > > > > the future so let's put a contract into the commit message stating 
> > > > > > that
> > > > > > the width of the target aka *what is signaled* will be based on a 
> > > > > > flag
> > > > > > parameter if we ever extend it:
> > > > > >
> > > > > > taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_PGID);
> > > > > > taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_TID);
> > > > > >
> > > > > > with the current default being
> > > > > >
> > > > > > taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_PID);
> > > > > >
> > > > > > This seems to me the cleanest solution as we only use one type of 
> > > > > > file
> > > > > > descriptor. Can everyone be on board with this? If so I'm going to 
> > > > > > send
> > > > > > out a new version of the patch.
> > > > > >
> > > > > > Christian
> > > > >
> > > > > I'm on board with this, but I think you need to also clarify what 
> > > > > exactly
> > > > > the fd stands for.  I think that (a) userspace should not have to care
> > > > > about the struct pid implementation, and so (b) the procfd should 
> > > > > stand
> > > > > for all the pids.  So when taskfd_send_signal(fd, SIGSTOP, NULL, 
> > > > > TASKFD_PGID)
> > > > > becomes implemented, then open(/proc/5) will pin all three pids, as 
> > > > > will
> > > > > open(/proc/5/task/6).
> > > >
> > > > This change doesn't "pin" any PID, and it makes no sense to make a
> > > > process FD stand for all its threads. What does that even mean?
> > >
> > > Currently the patch relies on the procfd inode saving a copy to the 
> > > PIDTYPE_PID
> > > pid.
> > 
> > struct pid doesn't have a type field. The interpretation depends on
> > the caller's use of the struct pid, and in the current path, that's
> > PID

Re: [PATCH v4] signal: add taskfd_send_signal() syscall

2018-12-07 Thread Serge E. Hallyn
On Fri, Dec 07, 2018 at 02:54:25AM +0100, Christian Brauner wrote:
> On Thu, Dec 06, 2018 at 05:39:18PM -0800, Daniel Colascione wrote:
> > On Thu, Dec 6, 2018 at 4:59 PM Serge E. Hallyn  wrote:
> > >
> > > On Thu, Dec 06, 2018 at 04:34:54PM -0800, Daniel Colascione wrote:
> > > > On Thu, Dec 6, 2018 at 4:31 PM Serge E. Hallyn  wrote:
> > > > >
> > > > > On Fri, Dec 07, 2018 at 12:17:45AM +0100, Christian Brauner wrote:
> > > > > > On Thu, Dec 06, 2018 at 11:39:48PM +0100, Christian Brauner wrote:
> > > > > > > On Thu, Dec 06, 2018 at 03:46:53PM -0600, Eric W. Biederman wrote:
> > > > > > > > Christian Brauner  writes:
> > > > > > > >
> > > > > > > > >> Your intention is to add the thread case to support pthreads 
> > > > > > > > >> once the
> > > > > > > > >> process case is sorted out.  So this is something that needs 
> > > > > > > > >> to be made
> > > > > > > > >> clear.  Did I miss how you plan to handle threads?
> > > > > > > > >
> > > > > > > > > Yeah, maybe you missed it in the commit message [2] which is 
> > > > > > > > > based on a
> > > > > > > > > discussion with Andy [3] and Arnd [4]:
> > > > > > > >
> > > > > > > > Looking at your references I haven't missed it.  You are not 
> > > > > > > > deciding
> > > > > > > > anything as of yet to keep it simple.  Except you are returning
> > > > > > > > EOPNOTSUPP.  You are very much intending to do something.
> > > > > > >
> > > > > > > > That was clear all along and was pointed at every occasion in the
> > > > > > > > threads. I even went through the hassle to give you all of the
> > > > > > > references when there's lore.kernel.org.
> > > > > > >
> > > > > > > >
> > > > > > > > Decide.  Do you use the flags parameter or is the width of the
> > > > > > > > target depending on the flags.
> > > > > >
> > > > > > Ok, let's try to be constructive. I understand the general concern 
> > > > > > for
> > > > > > the future so let's put a contract into the commit message stating 
> > > > > > that
> > > > > > the width of the target aka *what is signaled* will be based on a 
> > > > > > flag
> > > > > > parameter if we ever extend it:
> > > > > >
> > > > > > taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_PGID);
> > > > > > taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_TID);
> > > > > >
> > > > > > with the current default being
> > > > > >
> > > > > > taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_PID);
> > > > > >
> > > > > > This seems to me the cleanest solution as we only use one type of 
> > > > > > file
> > > > > > descriptor. Can everyone be on board with this? If so I'm going to 
> > > > > > send
> > > > > > out a new version of the patch.
> > > > > >
> > > > > > Christian
> > > > >
> > > > > I'm on board with this, but I think you need to also clarify what 
> > > > > exactly
> > > > > the fd stands for.  I think that (a) userspace should not have to care
> > > > > about the struct pid implementation, and so (b) the procfd should 
> > > > > stand
> > > > > for all the pids.  So when taskfd_send_signal(fd, SIGSTOP, NULL, 
> > > > > TASKFD_PGID)
> > > > > becomes implemented, then open(/proc/5) will pin all three pids, as 
> > > > > will
> > > > > open(/proc/5/task/6).
> > > >
> > > > This change doesn't "pin" any PID, and it makes no sense to make a
> > > > process FD stand for all its threads. What does that even mean?
> > >
> > > Currently the patch relies on the procfd inode saving a copy to the 
> > > PIDTYPE_PID
> > > pid.
> > 
> > struct pid doesn't have a type field. The interpretation depends on
> > the caller's use of the struct pid, and in the current path, that's
> > PID

Re: [PATCH v4] signal: add taskfd_send_signal() syscall

2018-12-07 Thread Serge E. Hallyn
On Thu, Dec 06, 2018 at 01:18:58PM +0100, Christian Brauner wrote:
> The kill() syscall operates on process identifiers (pid). After a process
> has exited its pid can be reused by another process. If a caller sends a
> signal to a reused pid it will end up signaling the wrong process. This
> issue has often surfaced and there has been a push [1] to address this
> problem.
> 
> This patch uses file descriptors (fd) from proc/<pid> as stable handles on
> struct pid. Even if a pid is recycled the handle will not change. The fd
> can be used to send signals to the process it refers to.
> Thus, the new syscall taskfd_send_signal() is introduced to solve this
> problem. Instead of pids it operates on process fds (taskfd).
> 
> /* prototype and argument */
> long taskfd_send_signal(int taskfd, int sig, siginfo_t *info, unsigned int 
> flags);
> 
> In addition to the taskfd and signal argument it takes an additional
> siginfo_t and flags argument. If the siginfo_t argument is NULL then
> taskfd_send_signal() behaves like kill(). If it is not NULL
> taskfd_send_signal() behaves like rt_sigqueueinfo().
> The flags argument is added to allow for future extensions of this syscall.
> It currently needs to be passed as 0. Failing to do so will cause EINVAL.
> 
> /* taskfd_send_signal() replaces multiple pid-based syscalls */
> The taskfd_send_signal() syscall currently takes on the job of the
> following syscalls that operate on pids:
> - kill(2)
> - rt_sigqueueinfo(2)
> The syscall is defined in such a way that it can also operate on thread fds
> instead of process fds. In a future patchset I will extend it to operate on
> taskfds from /proc/<pid>/task/<tid> at which point it will additionally
> take on the job of:
> - tgkill(2)
> - rt_tgsigqueueinfo(2)
> Right now this is intentionally left out to keep this patchset as simple as
> possible (cf. [4]). If a taskfd of /proc/<pid>/task/<tid> is passed
> EOPNOTSUPP will be returned to give userspace a way to detect when I add
> support for such taskfds (cf. [10]).
> 
> /* naming */
> The original prefix of the syscall was "procfd_". However, it has been
> pointed out that since this syscall will eventually operate on both
> processes and threads the name should reflect this (cf. [12]). The best
> possible candidate even from a userspace perspective seems to be "task".
> Although "task" is used internally we are already deviating from POSIX by
> using file descriptors to processes in the first place so it seems fine to
> use the "taskfd_" prefix.
> 
> The name taskfd_send_signal() was also chosen to reflect the fact that it
> takes on the job of multiple syscalls. It is intentional that the name is
> reminiscent of neither kill(2) nor rt_sigqueueinfo(2). Not the former
> because it might imply that taskfd_send_signal() is only a replacement for
> kill(2) and not the latter because it is a hassle to remember the correct
> spelling (especially for non-native speakers) and because it is not
> descriptive enough of what the syscall actually does. The name
> "taskfd_send_signal" makes it very clear that its job is to send signals.
> 
> /* O_PATH file descriptors */
> taskfds opened as O_PATH fds cannot be used to send signals to a process
> (cf. [2]). Signaling processes through taskfds is the equivalent of writing
> to a file. Thus, this is not an operation that operates "purely at the
> file descriptor level" as required by the open(2) manpage.
> 
> /* zombies */
> Zombies can be signaled just as any other process. No special error will be
> reported since a zombie state is an unreliable state (cf. [3]).
> 
> /* cross-namespace signals */
> The patch currently enforces that the signaler and signalee either are in
> the same pid namespace or that the signaler's pid namespace is an ancestor
> of the signalee's pid namespace. This is done for the sake of simplicity
> and because it is unclear to what values certain members of struct
> siginfo_t would need to be set to (cf. [5], [6]).
> 
> /* compat syscalls */
> It became clear that we would like to avoid adding compat syscalls (cf.
> [7]). The compat syscall handling is now done in kernel/signal.c itself by
> adding __copy_siginfo_from_user_any() which lets us avoid compat
> syscalls (cf. [8]). It should be noted that the addition of
> __copy_siginfo_from_user_any() is caused by a bug in the original
> implementation of rt_sigqueueinfo(2) (cf. [12]).
> With upcoming rework for syscall handling things might improve
> significantly (cf. [11]) and __copy_siginfo_from_user_any() will not gain
> any additional callers.
> 
> /* testing */
> This patch was tested on x64 and x86.
> 
> /* userspace usage */
> An asciinema recording for the basic functionality can be found under [9].
> With this patch a process can be killed via:
> 
>  #define _GNU_SOURCE
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
> 
>  static inline int do_taskfd_send_signal(int taskfd, int sig, 

Re: [PATCH v4] signal: add taskfd_send_signal() syscall

2018-12-07 Thread Serge E. Hallyn
On Thu, Dec 06, 2018 at 01:18:58PM +0100, Christian Brauner wrote:
> The kill() syscall operates on process identifiers (pid). After a process
> has exited its pid can be reused by another process. If a caller sends a
> signal to a reused pid it will end up signaling the wrong process. This
> issue has often surfaced and there has been a push [1] to address this
> problem.
> 
> This patch uses file descriptors (fd) from proc/<pid> as stable handles on
> struct pid. Even if a pid is recycled the handle will not change. The fd
> can be used to send signals to the process it refers to.
> Thus, the new syscall taskfd_send_signal() is introduced to solve this
> problem. Instead of pids it operates on process fds (taskfd).
> 
> /* prototype and argument */
> long taskfd_send_signal(int taskfd, int sig, siginfo_t *info, unsigned int 
> flags);
> 
> In addition to the taskfd and signal argument it takes an additional
> siginfo_t and flags argument. If the siginfo_t argument is NULL then
> taskfd_send_signal() behaves like kill(). If it is not NULL
> taskfd_send_signal() behaves like rt_sigqueueinfo().
> The flags argument is added to allow for future extensions of this syscall.
> It currently needs to be passed as 0. Failing to do so will cause EINVAL.
> 
> /* taskfd_send_signal() replaces multiple pid-based syscalls */
> The taskfd_send_signal() syscall currently takes on the job of the
> following syscalls that operate on pids:
> - kill(2)
> - rt_sigqueueinfo(2)
> The syscall is defined in such a way that it can also operate on thread fds
> instead of process fds. In a future patchset I will extend it to operate on
> taskfds from /proc/<pid>/task/<tid> at which point it will additionally
> take on the job of:
> - tgkill(2)
> - rt_tgsigqueueinfo(2)
> Right now this is intentionally left out to keep this patchset as simple as
> possible (cf. [4]). If a taskfd of /proc/<pid>/task/<tid> is passed
> EOPNOTSUPP will be returned to give userspace a way to detect when I add
> support for such taskfds (cf. [10]).
> 
> /* naming */
> The original prefix of the syscall was "procfd_". However, it has been
> pointed out that since this syscall will eventually operate on both
> processes and threads the name should reflect this (cf. [12]). The best
> possible candidate even from a userspace perspective seems to be "task".
> Although "task" is used internally we are already deviating from POSIX by
> using file descriptors to processes in the first place so it seems fine to
> use the "taskfd_" prefix.
> 
> The name taskfd_send_signal() was also chosen to reflect the fact that it
> takes on the job of multiple syscalls. It is intentional that the name is
> reminiscent of neither kill(2) nor rt_sigqueueinfo(2). Not the former
> because it might imply that taskfd_send_signal() is only a replacement for
> kill(2) and not the latter because it is a hassle to remember the correct
> spelling (especially for non-native speakers) and because it is not
> descriptive enough of what the syscall actually does. The name
> "taskfd_send_signal" makes it very clear that its job is to send signals.
> 
> /* O_PATH file descriptors */
> taskfds opened as O_PATH fds cannot be used to send signals to a process
> (cf. [2]). Signaling processes through taskfds is the equivalent of writing
> to a file. Thus, this is not an operation that operates "purely at the
> file descriptor level" as required by the open(2) manpage.
> 
> /* zombies */
> Zombies can be signaled just as any other process. No special error will be
> reported since a zombie state is an unreliable state (cf. [3]).
> 
> /* cross-namespace signals */
> The patch currently enforces that the signaler and signalee either are in
> the same pid namespace or that the signaler's pid namespace is an ancestor
> of the signalee's pid namespace. This is done for the sake of simplicity
> and because it is unclear to what values certain members of struct
> siginfo_t would need to be set to (cf. [5], [6]).
> 
> /* compat syscalls */
> It became clear that we would like to avoid adding compat syscalls (cf.
> [7]). The compat syscall handling is now done in kernel/signal.c itself by
> adding __copy_siginfo_from_user_any() which lets us avoid compat
> syscalls (cf. [8]). It should be noted that the addition of
> __copy_siginfo_from_user_any() is caused by a bug in the original
> implementation of rt_sigqueueinfo(2) (cf. [12]).
> With upcoming rework for syscall handling things might improve
> significantly (cf. [11]) and __copy_siginfo_from_user_any() will not gain
> any additional callers.
> 
> /* testing */
> This patch was tested on x64 and x86.
> 
> /* userspace usage */
> An asciinema recording for the basic functionality can be found under [9].
> With this patch a process can be killed via:
> 
>  #define _GNU_SOURCE
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
> 
>  static inline int do_taskfd_send_signal(int taskfd, int sig, 

Re: [PATCH v4] signal: add taskfd_send_signal() syscall

2018-12-06 Thread Serge E. Hallyn
On Thu, Dec 06, 2018 at 04:34:54PM -0800, Daniel Colascione wrote:
> On Thu, Dec 6, 2018 at 4:31 PM Serge E. Hallyn  wrote:
> >
> > On Fri, Dec 07, 2018 at 12:17:45AM +0100, Christian Brauner wrote:
> > > On Thu, Dec 06, 2018 at 11:39:48PM +0100, Christian Brauner wrote:
> > > > On Thu, Dec 06, 2018 at 03:46:53PM -0600, Eric W. Biederman wrote:
> > > > > Christian Brauner  writes:
> > > > >
> > > > > >> Your intention is to add the thread case to support pthreads once 
> > > > > >> the
> > > > > >> process case is sorted out.  So this is something that needs to be 
> > > > > >> made
> > > > > >> clear.  Did I miss how you plan to handle threads?
> > > > > >
> > > > > > Yeah, maybe you missed it in the commit message [2] which is based 
> > > > > > on a
> > > > > > discussion with Andy [3] and Arnd [4]:
> > > > >
> > > > > Looking at your references I haven't missed it.  You are not deciding
> > > > > anything as of yet to keep it simple.  Except you are returning
> > > > > EOPNOTSUPP.  You are very much intending to do something.
> > > >
> > > > > That was clear all along and was pointed at every occasion in the
> > > > > threads. I even went through the hassle to give you all of the
> > > > references when there's lore.kernel.org.
> > > >
> > > > >
> > > > > Decide.  Do you use the flags parameter or is the width of the
> > > > > target depending on the flags.
> > >
> > > Ok, let's try to be constructive. I understand the general concern for
> > > the future so let's put a contract into the commit message stating that
> > > the width of the target aka *what is signaled* will be based on a flag
> > > parameter if we ever extend it:
> > >
> > > taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_PGID);
> > > taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_TID);
> > >
> > > with the current default being
> > >
> > > taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_PID);
> > >
> > > This seems to me the cleanest solution as we only use one type of file
> > > descriptor. Can everyone be on board with this? If so I'm going to send
> > > out a new version of the patch.
> > >
> > > Christian
> >
> > I'm on board with this, but I think you need to also clarify what exactly
> > the fd stands for.  I think that (a) userspace should not have to care
> > about the struct pid implementation, and so (b) the procfd should stand
> > for all the pids.  So when taskfd_send_signal(fd, SIGSTOP, NULL, 
> > TASKFD_PGID)
> > becomes implemented, then open(/proc/5) will pin all three pids, as will
> > open(/proc/5/task/6).
> 
> This change doesn't "pin" any PID, and it makes no sense to make a
> process FD stand for all its threads. What does that even mean?

Currently the patch relies on the procfd inode saving a copy to the PIDTYPE_PID
pid.  I'm not sure offhand, can it go to the PIDTYPE_PGID from that after the
task has died, or not?   I didn't think so.  If it can then great.

The point is (a) these are details which should not have to bother userspace,
and (b) how to decide who we're sending the signal to (tid/pid/pgid) should
be specified in precisely one way.  So either a flag, or coming from the type
of fd that was opened.

-serge


Re: [PATCH v4] signal: add taskfd_send_signal() syscall

2018-12-06 Thread Serge E. Hallyn
On Thu, Dec 06, 2018 at 04:34:54PM -0800, Daniel Colascione wrote:
> On Thu, Dec 6, 2018 at 4:31 PM Serge E. Hallyn  wrote:
> >
> > On Fri, Dec 07, 2018 at 12:17:45AM +0100, Christian Brauner wrote:
> > > On Thu, Dec 06, 2018 at 11:39:48PM +0100, Christian Brauner wrote:
> > > > On Thu, Dec 06, 2018 at 03:46:53PM -0600, Eric W. Biederman wrote:
> > > > > Christian Brauner  writes:
> > > > >
> > > > > >> Your intention is to add the thread case to support pthreads once 
> > > > > >> the
> > > > > >> process case is sorted out.  So this is something that needs to be 
> > > > > >> made
> > > > > >> clear.  Did I miss how you plan to handle threads?
> > > > > >
> > > > > > Yeah, maybe you missed it in the commit message [2] which is based 
> > > > > > on a
> > > > > > discussion with Andy [3] and Arnd [4]:
> > > > >
> > > > > Looking at your references I haven't missed it.  You are not deciding
> > > > > anything as of yet to keep it simple.  Except you are returning
> > > > > EOPNOTSUPP.  You are very much intending to do something.
> > > >
> > > > > That was clear all along and was pointed at every occasion in the
> > > > > threads. I even went through the hassle to give you all of the
> > > > references when there's lore.kernel.org.
> > > >
> > > > >
> > > > > Decide.  Do you use the flags parameter or is the width of the
> > > > > target depending on the flags.
> > >
> > > Ok, let's try to be constructive. I understand the general concern for
> > > the future so let's put a contract into the commit message stating that
> > > the width of the target aka *what is signaled* will be based on a flag
> > > parameter if we ever extend it:
> > >
> > > taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_PGID);
> > > taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_TID);
> > >
> > > with the current default being
> > >
> > > taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_PID);
> > >
> > > This seems to me the cleanest solution as we only use one type of file
> > > descriptor. Can everyone be on board with this? If so I'm going to send
> > > out a new version of the patch.
> > >
> > > Christian
> >
> > I'm on board with this, but I think you need to also clarify what exactly
> > the fd stands for.  I think that (a) userspace should not have to care
> > about the struct pid implementation, and so (b) the procfd should stand
> > for all the pids.  So when taskfd_send_signal(fd, SIGSTOP, NULL, 
> > TASKFD_PGID)
> > becomes implemented, then open(/proc/5) will pin all three pids, as will
> > open(/proc/5/task/6).
> 
> This change doesn't "pin" any PID, and it makes no sense to make a
> process FD stand for all its threads. What does that even mean?

Currently the patch relies on the procfd inode saving a copy to the PIDTYPE_PID
pid.  I'm not sure offhand, can it go to the PIDTYPE_PGID from that after the
task has died, or not?   I didn't think so.  If it can then great.

The point is (a) these are details which should not have to bother userspace,
and (b) how to decide who we're sending the signal to (tid/pid/pgid) should
be specified in precisely one way.  So either a flag, or coming from the type
of fd that was opened.

-serge


Re: [PATCH v4] signal: add taskfd_send_signal() syscall

2018-12-06 Thread Serge E. Hallyn
On Fri, Dec 07, 2018 at 12:17:45AM +0100, Christian Brauner wrote:
> On Thu, Dec 06, 2018 at 11:39:48PM +0100, Christian Brauner wrote:
> > On Thu, Dec 06, 2018 at 03:46:53PM -0600, Eric W. Biederman wrote:
> > > Christian Brauner  writes:
> > > 
> > > >> Your intention is to add the thread case to support pthreads once the
> > > >> process case is sorted out.  So this is something that needs to be made
> > > >> clear.  Did I miss how you plan to handle threads?
> > > >
> > > > Yeah, maybe you missed it in the commit message [2] which is based on a
> > > > discussion with Andy [3] and Arnd [4]:
> > > 
> > > Looking at your references I haven't missed it.  You are not deciding
> > > anything as of yet to keep it simple.  Except you are returning
> > > EOPNOTSUPP.  You are very much intending to do something.
> > 
> > > That was clear all along and was pointed at every occasion in the
> > > threads. I even went through the hassle to give you all of the
> > references when there's lore.kernel.org.
> > 
> > > 
> > > Decide.  Do you use the flags parameter or is the width of the
> > > target depending on the flags.
> 
> Ok, let's try to be constructive. I understand the general concern for
> the future so let's put a contract into the commit message stating that
> the width of the target aka *what is signaled* will be based on a flag
> parameter if we ever extend it:
> 
> taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_PGID);
> taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_TID);
> 
> with the current default being
> 
> taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_PID);
> 
> This seems to me the cleanest solution as we only use one type of file
> descriptor. Can everyone be on board with this? If so I'm going to send
> out a new version of the patch.
> 
> Christian

I'm on board with this, but I think you need to also clarify what exactly
the fd stands for.  I think that (a) userspace should not have to care
about the struct pid implementation, and so (b) the procfd should stand
for all the pids.  So when taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_PGID)
becomes implemented, then open(/proc/5) will pin all three pids, as will
open(/proc/5/task/6).

-serge


Re: [PATCH v4] signal: add taskfd_send_signal() syscall

2018-12-06 Thread Serge E. Hallyn
On Fri, Dec 07, 2018 at 12:17:45AM +0100, Christian Brauner wrote:
> On Thu, Dec 06, 2018 at 11:39:48PM +0100, Christian Brauner wrote:
> > On Thu, Dec 06, 2018 at 03:46:53PM -0600, Eric W. Biederman wrote:
> > > Christian Brauner  writes:
> > > 
> > > >> Your intention is to add the thread case to support pthreads once the
> > > >> process case is sorted out.  So this is something that needs to be made
> > > >> clear.  Did I miss how you plan to handle threads?
> > > >
> > > > Yeah, maybe you missed it in the commit message [2] which is based on a
> > > > discussion with Andy [3] and Arnd [4]:
> > > 
> > > Looking at your references I haven't missed it.  You are not deciding
> > > anything as of yet to keep it simple.  Except you are returning
> > > EOPNOTSUPP.  You are very much intending to do something.
> > 
> > > That was clear all along and was pointed at every occasion in the
> > > threads. I even went through the hassle to give you all of the
> > references when there's lore.kernel.org.
> > 
> > > 
> > > Decide.  Do you use the flags parameter or is the width of the
> > > target depending on the flags.
> 
> Ok, let's try to be constructive. I understand the general concern for
> the future so let's put a contract into the commit message stating that
> the width of the target aka *what is signaled* will be based on a flag
> parameter if we ever extend it:
> 
> taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_PGID);
> taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_TID);
> 
> with the current default being
> 
> taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_PID);
> 
> This seems to me the cleanest solution as we only use one type of file
> descriptor. Can everyone be on board with this? If so I'm going to send
> out a new version of the patch.
> 
> Christian

I'm on board with this, but I think you need to also clarify what exactly
the fd stands for.  I think that (a) userspace should not have to care
about the struct pid implementation, and so (b) the procfd should stand
for all the pids.  So when taskfd_send_signal(fd, SIGSTOP, NULL, TASKFD_PGID)
becomes implemented, then open(/proc/5) will pin all three pids, as will
open(/proc/5/task/6).

-serge


Re: [PATCH 7/7] ima: Support platform keyring for kernel appraisal

2018-12-06 Thread Serge E. Hallyn
On Sun, Nov 25, 2018 at 08:45:00PM +0530, Nayna Jain wrote:
> On secure boot enabled systems, the bootloader verifies the kernel
> image and possibly the initramfs signatures based on a set of keys. A
> soft reboot(kexec) of the system, with the same kernel image and
> initramfs, requires access to the original keys to verify the
> signatures.
> 
> This patch allows IMA-appraisal access to those original keys, now
> loaded on the platform keyring, needed for verifying the kernel image
> and initramfs signatures.
> 
> Signed-off-by: Nayna Jain 
> Reviewed-by: Mimi Zohar 

The overall set seems sensible to me, and I see no errors here,

Acked-by: Serge Hallyn 

I do think that replacing the 'rc' with xattr_len in the previous line might
help future readers save a few cycles.

> ---
>  security/integrity/ima/ima_appraise.c | 11 ++-
>  1 file changed, 10 insertions(+), 1 deletion(-)
> 
> diff --git a/security/integrity/ima/ima_appraise.c 
> b/security/integrity/ima/ima_appraise.c
> index deec1804a00a..9c13585e7d3e 100644
> --- a/security/integrity/ima/ima_appraise.c
> +++ b/security/integrity/ima/ima_appraise.c
> @@ -294,7 +294,16 @@ int ima_appraise_measurement(enum ima_hooks func,
>iint->ima_hash->length);
>   if (rc == -EOPNOTSUPP) {
>   status = INTEGRITY_UNKNOWN;
> - } else if (rc) {
> + break;
> + }
> + if (rc && func == KEXEC_KERNEL_CHECK)
> + rc = integrity_digsig_verify(
> + INTEGRITY_KEYRING_PLATFORM,
> + (const char *)xattr_value,
> + xattr_len,
> + iint->ima_hash->digest,
> + iint->ima_hash->length);
> + if (rc) {
>   cause = "invalid-signature";
>   status = INTEGRITY_FAIL;
>   } else {
> -- 
> 2.13.6
> 


Re: [PATCH v4] signal: add taskfd_send_signal() syscall

2018-12-06 Thread Serge E. Hallyn
On Thu, Dec 06, 2018 at 10:30:40AM -0800, Kees Cook wrote:
> On Thu, Dec 6, 2018 at 9:41 AM Christian Brauner  wrote:
> > I feel changing the name around by a single persons preferences is not
> > really a nice thing to do community-wise. So I'd like to hear other
> > people chime in first before I make that change.
> 
> I don't think the name is hugely critical (but it's always the hardest
> to settle on). My preference order would be:
> 
> taskfd_send_signal()
> pidfd_send_signal()
> procfd_send_signal()
> fd_send_signal()

imo, either procfd_send_signal() or taskfd_send_signal()

It seems to me that using flags later to specify sending to pgrp vs thread
is fine:  it's specifying how to interpret the 'fd' in 'procfd_send_signal()'.

> But, agreed, I think fdkill() should not be used.
> 
> -- 
> Kees Cook


Re: [PATCH v4] signal: add taskfd_send_signal() syscall

2018-12-06 Thread Serge E. Hallyn
On Thu, Dec 06, 2018 at 10:30:40AM -0800, Kees Cook wrote:
> On Thu, Dec 6, 2018 at 9:41 AM Christian Brauner  wrote:
> > I feel changing the name around by a single persons preferences is not
> > really a nice thing to do community-wise. So I'd like to hear other
> > people chime in first before I make that change.
> 
> I don't think the name is hugely critical (but it's always the hardest
> to settle on). My preference order would be:
> 
> taskfd_send_signal()
> pidfd_send_signal()
> procfd_send_signal()
> fd_send_signal()

imo, either procfd_send_signal() or taskfd_send_signal()

It seems to me that using flags later to specify sending to pgrp vs thread
is fine:  it's specifying how to interpret the 'fd' in 'procfd_send_signal()'.

> But, agreed, I think fdkill() should not be used.
> 
> -- 
> Kees Cook


Re: [PATCH v9 3/4] seccomp: add a return code to trap to userspace

2018-12-03 Thread Serge E. Hallyn
On Mon, Dec 03, 2018 at 08:52:11AM -0700, Tycho Andersen wrote:
> On Sun, Dec 02, 2018 at 11:26:50PM -0600, Serge E. Hallyn wrote:
> > On Sun, Dec 02, 2018 at 08:28:26PM -0700, Tycho Andersen wrote:
> > > +struct seccomp_knotif {
> > > + /* The struct pid of the task whose filter triggered the notification */
> > > + struct task_struct *task;
> > > +
> > > + /* The "cookie" for this request; this is unique for this filter. */
> > > + u64 id;
> > > +
> > > + /*
> > > +  * The seccomp data. This pointer is valid the entire time this
> > > +  * notification is active, since it comes from __seccomp_filter which
> > 
> > define 'active' - is a notification in state REPLIED still active?
> 
> Yes,
> 
> > Actually while looking at that, I came to wondering - when are knotifs
> > freed?  Seems like only during seccomp_notify_release(), i.e. when the
> > tracing task stops polling for events?  Is that going to be a problem?
> > Or am I misreading this?
> 
> they're stack allocated in do_user_notification(). So "active" in this
> sense really means "somewhere in do_user_notification()".

D'oh!  I see, thanks :)

> > > + if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) {
> > > + ret = -EFAULT;
> > > +
> > > + /*
> > > +  * Userspace screwed up. To make sure that we keep this
> > > +  * notification alive, let's reset it back to INIT. It
> > 
> > Is keeping the notification alive the right thing to do?
> > 
> > If userspace has messed up in something this touchy, it seems unlikely
> > that it'll do better if we give it a do-over...  I'm not sure whether
> > killing the whole thing (victim and tracer) is the right thing or not.
> 
> I suppose we could do that too. I just didn't want to get into a
> situation where the notification is lost and the task is stuck because
> userspace screwed up here.

Yeah, that's probably best - I'm just trying to see whether there is
any way that this could be abused.  My underactive imagination hasn't
come up with anything yet.

-serge


Re: [PATCH v9 3/4] seccomp: add a return code to trap to userspace

2018-12-03 Thread Serge E. Hallyn
On Mon, Dec 03, 2018 at 08:52:11AM -0700, Tycho Andersen wrote:
> On Sun, Dec 02, 2018 at 11:26:50PM -0600, Serge E. Hallyn wrote:
> > On Sun, Dec 02, 2018 at 08:28:26PM -0700, Tycho Andersen wrote:
> > > +struct seccomp_knotif {
> > > + /* The struct pid of the task whose filter triggered the notification */
> > > + struct task_struct *task;
> > > +
> > > + /* The "cookie" for this request; this is unique for this filter. */
> > > + u64 id;
> > > +
> > > + /*
> > > +  * The seccomp data. This pointer is valid the entire time this
> > > +  * notification is active, since it comes from __seccomp_filter which
> > 
> > define 'active' - is a notification in state REPLIED still active?
> 
> Yes,
> 
> > Actually while looking at that, I came to wondering - when are knotifs
> > freed?  Seems like only during seccomp_notify_release(), i.e. when the
> > tracing task stops polling for events?  Is that going to be a problem?
> > Or am I misreading this?
> 
> they're stack allocated in do_user_notification(). So "active" in this
> sense really means "somewhere in do_user_notification()".

D'oh!  I see, thanks :)

> > > + if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) {
> > > + ret = -EFAULT;
> > > +
> > > + /*
> > > +  * Userspace screwed up. To make sure that we keep this
> > > +  * notification alive, let's reset it back to INIT. It
> > 
> > Is keeping the notification alive the right thing to do?
> > 
> > If userspace has messed up in something this touchy, it seems unlikely
> > that it'll do better if we give it a do-over...  I'm not sure whether
> > killing the whole thing (victim and tracer) is the right thing or not.
> 
> I suppose we could do that too. I just didn't want to get into a
> situation where the notification is lost and the task is stuck because
> userspace screwed up here.

Yeah, that's probably best - I'm just trying to see whether there is
any way that this could be abused.  My underactive imagination hasn't
come up with anything yet.

-serge


Re: [PATCH v9 3/4] seccomp: add a return code to trap to userspace

2018-12-02 Thread Serge E. Hallyn
On Sun, Dec 02, 2018 at 08:28:26PM -0700, Tycho Andersen wrote:
> This patch introduces a means for syscalls matched in seccomp to notify
> some other task that a particular filter has been triggered.
> 
> The motivation for this is primarily for use with containers. For example,
> if a container does an init_module(), we obviously don't want to load this
> untrusted code, which may be compiled for the wrong version of the kernel
> anyway. Instead, we could parse the module image, figure out which module
> the container is trying to load and load it on the host.
> 
> As another example, containers cannot mount() in general since various
> filesystems assume a trusted image. However, if an orchestrator knows that
> e.g. a particular block device has not been exposed to a container for
> writing, it may want to allow the container to mount that block device (that
> is, handle the mount for it).
> 
> This patch adds functionality that is already possible via at least two
> other means that I know about, both of which involve ptrace(): first, one
> could ptrace attach, and then iterate through syscalls via PTRACE_SYSCALL.
> Unfortunately this is slow, so a faster version would be to install a
> filter that does SECCOMP_RET_TRACE, which triggers a PTRACE_EVENT_SECCOMP.
> Since ptrace allows only one tracer, if the container runtime is that
> tracer, users inside the container (or outside) trying to debug it will not
> be able to use ptrace, which is annoying. It also means that older
> distributions based on Upstart cannot boot inside containers using ptrace,
> since upstart itself uses ptrace to monitor services while starting.
> 
> The actual implementation of this is fairly small, although getting the
> synchronization right was/is slightly complex.
> 
> Finally, it's worth noting that the classic seccomp TOCTOU of reading
> memory data from the task still applies here, but can be avoided with
> careful design of the userspace handler: if the userspace handler reads all
> of the task memory that is necessary before applying its security policy,
> the tracee's subsequent memory edits will not be read by the tracer.
> 
> Signed-off-by: Tycho Andersen 
> CC: Kees Cook 
> CC: Andy Lutomirski 
> CC: Oleg Nesterov 
> CC: Eric W. Biederman 
> CC: "Serge E. Hallyn" 

Acked-by: Serge Hallyn 

a few questions below, though, which I'm sure are just me reading too
late at night,

> CC: Christian Brauner 
> CC: Tyler Hicks 
> CC: Akihiro Suda 
> ---
> v2: * make id a u64; the idea here being that it will never overflow,
>   because 64 is huge (one syscall every nanosecond => wrap every 584
>   years) (Andy)
> * prevent nesting of user notifications: if someone is already attached
>   the tree in one place, nobody else can attach to the tree (Andy)
> * notify the listener of signals the tracee receives as well (Andy)
> * implement poll
> v3: * lockdep fix (Oleg)
> * drop unnecessary WARN()s (Christian)
> * rearrange error returns to be more pretty (Christian)
> * fix build in !CONFIG_SECCOMP_USER_NOTIFICATION case
> v4: * fix implementation of poll to use poll_wait() (Jann)
> * change listener's fd flags to be 0 (Jann)
> * hoist filter initialization out of ifdefs to its own function
>   init_user_notification()
> * add some more testing around poll() and closing the listener while a
>   syscall is in action
> * s/GET_LISTENER/NEW_LISTENER, since you can't _get_ a listener, but it
>   creates a new one (Matthew)
> * correctly handle pid namespaces, add some testcases (Matthew)
> * use EINPROGRESS instead of EINVAL when a notification response is
>   written twice (Matthew)
> * fix comment typo from older version (SEND vs READ) (Matthew)
> * whitespace and logic simplification (Tobin)
> * add some Documentation/ bits on userspace trapping
> v5: * fix documentation typos (Jann)
> * add signalled field to struct seccomp_notif (Jann)
> * switch to using ioctls instead of read()/write() for struct passing
>   (Jann)
> * add an ioctl to ensure an id is still valid
> v6: * docs typo fixes, update docs for ioctl() change (Christian)
> v7: * switch struct seccomp_knotif's id member to a u64 (derp :)
> * use notify_lock in IS_ID_VALID query to avoid racing
> * s/signalled/signaled (Tyler)
> * fix docs to reflect that ids are not globally unique (Tyler)
> * add a test to check -ERESTARTSYS behavior (Tyler)
> * drop CONFIG_SECCOMP_USER_NOTIFICATION (Tyler)
> * reorder USER_NOTIF in seccomp return codes list (Tyler)
> * return size instead of sizeof(struct user_notif) (Tyler)
> * ENOENT instead of EINVAL when invalid id is passed (Tyler)
&

Re: [PATCH v9 3/4] seccomp: add a return code to trap to userspace

2018-12-02 Thread Serge E. Hallyn
On Sun, Dec 02, 2018 at 08:28:26PM -0700, Tycho Andersen wrote:
> This patch introduces a means for syscalls matched in seccomp to notify
> some other task that a particular filter has been triggered.
> 
> The motivation for this is primarily for use with containers. For example,
> if a container does an init_module(), we obviously don't want to load this
> untrusted code, which may be compiled for the wrong version of the kernel
> anyway. Instead, we could parse the module image, figure out which module
> the container is trying to load and load it on the host.
> 
> As another example, containers cannot mount() in general since various
> filesystems assume a trusted image. However, if an orchestrator knows that
> e.g. a particular block device has not been exposed to a container for
> writing, it may want to allow the container to mount that block device (that
> is, handle the mount for it).
> 
> This patch adds functionality that is already possible via at least two
> other means that I know about, both of which involve ptrace(): first, one
> could ptrace attach, and then iterate through syscalls via PTRACE_SYSCALL.
> Unfortunately this is slow, so a faster version would be to install a
> filter that does SECCOMP_RET_TRACE, which triggers a PTRACE_EVENT_SECCOMP.
> Since ptrace allows only one tracer, if the container runtime is that
> tracer, users inside the container (or outside) trying to debug it will not
> be able to use ptrace, which is annoying. It also means that older
> distributions based on Upstart cannot boot inside containers using ptrace,
> since upstart itself uses ptrace to monitor services while starting.
> 
> The actual implementation of this is fairly small, although getting the
> synchronization right was/is slightly complex.
> 
> Finally, it's worth noting that the classic seccomp TOCTOU of reading
> memory data from the task still applies here, but can be avoided with
> careful design of the userspace handler: if the userspace handler reads all
> of the task memory that is necessary before applying its security policy,
> the tracee's subsequent memory edits will not be read by the tracer.
> 
> Signed-off-by: Tycho Andersen 
> CC: Kees Cook 
> CC: Andy Lutomirski 
> CC: Oleg Nesterov 
> CC: Eric W. Biederman 
> CC: "Serge E. Hallyn" 

Acked-by: Serge Hallyn 

a few questions below, though, which I'm sure are just me reading too
late at night,

> CC: Christian Brauner 
> CC: Tyler Hicks 
> CC: Akihiro Suda 
> ---
> v2: * make id a u64; the idea here being that it will never overflow,
>   because 64 is huge (one syscall every nanosecond => wrap every 584
>   years) (Andy)
> * prevent nesting of user notifications: if someone is already attached
>   the tree in one place, nobody else can attach to the tree (Andy)
> * notify the listener of signals the tracee receives as well (Andy)
> * implement poll
> v3: * lockdep fix (Oleg)
> * drop unnecessary WARN()s (Christian)
> * rearrange error returns to be more pretty (Christian)
> * fix build in !CONFIG_SECCOMP_USER_NOTIFICATION case
> v4: * fix implementation of poll to use poll_wait() (Jann)
> * change listener's fd flags to be 0 (Jann)
> * hoist filter initialization out of ifdefs to its own function
>   init_user_notification()
> * add some more testing around poll() and closing the listener while a
>   syscall is in action
> * s/GET_LISTENER/NEW_LISTENER, since you can't _get_ a listener, but it
>   creates a new one (Matthew)
> * correctly handle pid namespaces, add some testcases (Matthew)
> * use EINPROGRESS instead of EINVAL when a notification response is
>   written twice (Matthew)
> * fix comment typo from older version (SEND vs READ) (Matthew)
> * whitespace and logic simplification (Tobin)
> * add some Documentation/ bits on userspace trapping
> v5: * fix documentation typos (Jann)
> * add signalled field to struct seccomp_notif (Jann)
> * switch to using ioctls instead of read()/write() for struct passing
>   (Jann)
> * add an ioctl to ensure an id is still valid
> v6: * docs typo fixes, update docs for ioctl() change (Christian)
> v7: * switch struct seccomp_knotif's id member to a u64 (derp :)
> * use notify_lock in IS_ID_VALID query to avoid racing
> * s/signalled/signaled (Tyler)
> * fix docs to reflect that ids are not globally unique (Tyler)
> * add a test to check -ERESTARTSYS behavior (Tyler)
> * drop CONFIG_SECCOMP_USER_NOTIFICATION (Tyler)
> * reorder USER_NOTIF in seccomp return codes list (Tyler)
> * return size instead of sizeof(struct user_notif) (Tyler)
> * ENOENT instead of EINVAL when invalid id is passed (Tyler)
&

Re: [PATCH v9 2/4] seccomp: switch system call argument type to void *

2018-12-02 Thread Serge E. Hallyn
On Sun, Dec 02, 2018 at 08:28:25PM -0700, Tycho Andersen wrote:
> The const qualifier causes problems for any code that wants to write to the
> third argument of the seccomp syscall, as we will do in a future patch in
> this series.
> 
> The third argument to the seccomp syscall is documented as void *, so
> rather than just dropping the const, let's switch everything to use void *
> as well.
> 
> I believe this is safe because of 1. the documentation above, 2. there's no
> real type information exported about syscalls anywhere besides the man
> pages.
> 
> Signed-off-by: Tycho Andersen 
> CC: Kees Cook 
> CC: Andy Lutomirski 
> CC: Oleg Nesterov 
> CC: Eric W. Biederman 
> CC: "Serge E. Hallyn" 

Acked-by: Serge Hallyn 

Though I'm not entirely convinced there will be no ill effects of changing
the argument type.  I'll feel comfortable when Michael and Paul say it's
fine :)

> CC: Christian Brauner 
> CC: Tyler Hicks 
> CC: Akihiro Suda 
> ---
>  include/linux/seccomp.h | 2 +-
>  kernel/seccomp.c| 8 
>  2 files changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index e5320f6c8654..b5103c019cf4 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -43,7 +43,7 @@ extern void secure_computing_strict(int this_syscall);
>  #endif
>  
>  extern long prctl_get_seccomp(void);
> -extern long prctl_set_seccomp(unsigned long, char __user *);
> +extern long prctl_set_seccomp(unsigned long, void __user *);
>  
>  static inline int seccomp_mode(struct seccomp *s)
>  {
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 96afc32e041d..393e029f778a 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -924,7 +924,7 @@ static long seccomp_get_action_avail(const char __user 
> *uaction)
>  
>  /* Common entry point for both prctl and syscall. */
>  static long do_seccomp(unsigned int op, unsigned int flags,
> -const char __user *uargs)
> +void __user *uargs)
>  {
>   switch (op) {
>   case SECCOMP_SET_MODE_STRICT:
> @@ -944,7 +944,7 @@ static long do_seccomp(unsigned int op, unsigned int 
> flags,
>  }
>  
>  SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
> -  const char __user *, uargs)
> +  void __user *, uargs)
>  {
>   return do_seccomp(op, flags, uargs);
>  }
> @@ -956,10 +956,10 @@ SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned 
> int, flags,
>   *
>   * Returns 0 on success or -EINVAL on failure.
>   */
> -long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
> +long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter)
>  {
>   unsigned int op;
> - char __user *uargs;
> + void __user *uargs;
>  
>   switch (seccomp_mode) {
>   case SECCOMP_MODE_STRICT:
> -- 
> 2.19.1


Re: [PATCH v9 2/4] seccomp: switch system call argument type to void *

2018-12-02 Thread Serge E. Hallyn
On Sun, Dec 02, 2018 at 08:28:25PM -0700, Tycho Andersen wrote:
> The const qualifier causes problems for any code that wants to write to the
> third argument of the seccomp syscall, as we will do in a future patch in
> this series.
> 
> The third argument to the seccomp syscall is documented as void *, so
> rather than just dropping the const, let's switch everything to use void *
> as well.
> 
> I believe this is safe because of 1. the documentation above, 2. there's no
> real type information exported about syscalls anywhere besides the man
> pages.
> 
> Signed-off-by: Tycho Andersen 
> CC: Kees Cook 
> CC: Andy Lutomirski 
> CC: Oleg Nesterov 
> CC: Eric W. Biederman 
> CC: "Serge E. Hallyn" 

Acked-by: Serge Hallyn 

Though I'm not entirely convinced there will be no ill effects of changing
the argument type.  I'll feel comfortable when Michael and Paul say it's
fine :)

> CC: Christian Brauner 
> CC: Tyler Hicks 
> CC: Akihiro Suda 
> ---
>  include/linux/seccomp.h | 2 +-
>  kernel/seccomp.c| 8 
>  2 files changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index e5320f6c8654..b5103c019cf4 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -43,7 +43,7 @@ extern void secure_computing_strict(int this_syscall);
>  #endif
>  
>  extern long prctl_get_seccomp(void);
> -extern long prctl_set_seccomp(unsigned long, char __user *);
> +extern long prctl_set_seccomp(unsigned long, void __user *);
>  
>  static inline int seccomp_mode(struct seccomp *s)
>  {
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 96afc32e041d..393e029f778a 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -924,7 +924,7 @@ static long seccomp_get_action_avail(const char __user 
> *uaction)
>  
>  /* Common entry point for both prctl and syscall. */
>  static long do_seccomp(unsigned int op, unsigned int flags,
> -const char __user *uargs)
> +void __user *uargs)
>  {
>   switch (op) {
>   case SECCOMP_SET_MODE_STRICT:
> @@ -944,7 +944,7 @@ static long do_seccomp(unsigned int op, unsigned int 
> flags,
>  }
>  
>  SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
> -  const char __user *, uargs)
> +  void __user *, uargs)
>  {
>   return do_seccomp(op, flags, uargs);
>  }
> @@ -956,10 +956,10 @@ SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned 
> int, flags,
>   *
>   * Returns 0 on success or -EINVAL on failure.
>   */
> -long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
> +long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter)
>  {
>   unsigned int op;
> - char __user *uargs;
> + void __user *uargs;
>  
>   switch (seccomp_mode) {
>   case SECCOMP_MODE_STRICT:
> -- 
> 2.19.1


Re: [PATCH v9 1/4] seccomp: hoist struct seccomp_data recalculation higher

2018-12-02 Thread Serge E. Hallyn
On Sun, Dec 02, 2018 at 08:28:24PM -0700, Tycho Andersen wrote:
> In the next patch, we're going to use the sd pointer passed to
> __seccomp_filter() as the data to pass to userspace. Except that in some
> cases (__seccomp_filter(SECCOMP_RET_TRACE), emulate_vsyscall(), every time
> seccomp is invoked on power, etc.) the sd pointer will be NULL in order to
> force seccomp to recompute the register data. Previously this recomputation
> happened one level lower, in seccomp_run_filters(); this patch just moves
> it up a level higher to __seccomp_filter().
> 
> Thanks Oleg for spotting this.
> 
> Signed-off-by: Tycho Andersen 
> CC: Kees Cook 
> CC: Andy Lutomirski 
> CC: Oleg Nesterov 
> CC: Eric W. Biederman 
> CC: "Serge E. Hallyn" 

Acked-by: Serge Hallyn 

> CC: Christian Brauner 
> CC: Tyler Hicks 
> CC: Akihiro Suda 
> ---
>  kernel/seccomp.c | 12 ++--
>  1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index f2ae2324c232..96afc32e041d 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -188,7 +188,6 @@ static int seccomp_check_filter(struct sock_filter 
> *filter, unsigned int flen)
>  static u32 seccomp_run_filters(const struct seccomp_data *sd,
>  struct seccomp_filter **match)
>  {
> - struct seccomp_data sd_local;
>   u32 ret = SECCOMP_RET_ALLOW;
>   /* Make sure cross-thread synced filter points somewhere sane. */
>   struct seccomp_filter *f =
> @@ -198,11 +197,6 @@ static u32 seccomp_run_filters(const struct seccomp_data 
> *sd,
>   if (WARN_ON(f == NULL))
>   return SECCOMP_RET_KILL_PROCESS;
>  
> - if (!sd) {
> - populate_seccomp_data(&sd_local);
> - sd = &sd_local;
> - }
> -
>   /*
>* All filters in the list are evaluated and the lowest BPF return
>* value always takes priority (ignoring the DATA).
> @@ -658,6 +652,7 @@ static int __seccomp_filter(int this_syscall, const 
> struct seccomp_data *sd,
>   u32 filter_ret, action;
>   struct seccomp_filter *match = NULL;
>   int data;
> + struct seccomp_data sd_local;
>  
>   /*
>* Make sure that any changes to mode from another thread have
> @@ -665,6 +660,11 @@ static int __seccomp_filter(int this_syscall, const 
> struct seccomp_data *sd,
>*/
>   rmb();
>  
> + if (!sd) {
> + populate_seccomp_data(&sd_local);
> + sd = &sd_local;
> + }
> +
>   filter_ret = seccomp_run_filters(sd, &match);
>   data = filter_ret & SECCOMP_RET_DATA;
>   action = filter_ret & SECCOMP_RET_ACTION_FULL;
> -- 
> 2.19.1


Re: [PATCH v9 1/4] seccomp: hoist struct seccomp_data recalculation higher

2018-12-02 Thread Serge E. Hallyn
On Sun, Dec 02, 2018 at 08:28:24PM -0700, Tycho Andersen wrote:
> In the next patch, we're going to use the sd pointer passed to
> __seccomp_filter() as the data to pass to userspace. Except that in some
> cases (__seccomp_filter(SECCOMP_RET_TRACE), emulate_vsyscall(), every time
> seccomp is invoked on power, etc.) the sd pointer will be NULL in order to
> force seccomp to recompute the register data. Previously this recomputation
> happened one level lower, in seccomp_run_filters(); this patch just moves
> it up a level higher to __seccomp_filter().
> 
> Thanks Oleg for spotting this.
> 
> Signed-off-by: Tycho Andersen 
> CC: Kees Cook 
> CC: Andy Lutomirski 
> CC: Oleg Nesterov 
> CC: Eric W. Biederman 
> CC: "Serge E. Hallyn" 

Acked-by: Serge Hallyn 

> CC: Christian Brauner 
> CC: Tyler Hicks 
> CC: Akihiro Suda 
> ---
>  kernel/seccomp.c | 12 ++--
>  1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index f2ae2324c232..96afc32e041d 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -188,7 +188,6 @@ static int seccomp_check_filter(struct sock_filter 
> *filter, unsigned int flen)
>  static u32 seccomp_run_filters(const struct seccomp_data *sd,
>  struct seccomp_filter **match)
>  {
> - struct seccomp_data sd_local;
>   u32 ret = SECCOMP_RET_ALLOW;
>   /* Make sure cross-thread synced filter points somewhere sane. */
>   struct seccomp_filter *f =
> @@ -198,11 +197,6 @@ static u32 seccomp_run_filters(const struct seccomp_data 
> *sd,
>   if (WARN_ON(f == NULL))
>   return SECCOMP_RET_KILL_PROCESS;
>  
> - if (!sd) {
> - populate_seccomp_data(&sd_local);
> - sd = &sd_local;
> - }
> -
>   /*
>* All filters in the list are evaluated and the lowest BPF return
>* value always takes priority (ignoring the DATA).
> @@ -658,6 +652,7 @@ static int __seccomp_filter(int this_syscall, const 
> struct seccomp_data *sd,
>   u32 filter_ret, action;
>   struct seccomp_filter *match = NULL;
>   int data;
> + struct seccomp_data sd_local;
>  
>   /*
>* Make sure that any changes to mode from another thread have
> @@ -665,6 +660,11 @@ static int __seccomp_filter(int this_syscall, const 
> struct seccomp_data *sd,
>*/
>   rmb();
>  
> + if (!sd) {
> + populate_seccomp_data(&sd_local);
> + sd = &sd_local;
> + }
> +
>   filter_ret = seccomp_run_filters(sd, &match);
>   data = filter_ret & SECCOMP_RET_DATA;
>   action = filter_ret & SECCOMP_RET_ACTION_FULL;
> -- 
> 2.19.1


Re: [PATCH v2] signal: add procfd_signal() syscall

2018-11-22 Thread Serge E. Hallyn
On Tue, Nov 20, 2018 at 11:51:23AM +0100, Christian Brauner wrote:
> The kill() syscall operates on process identifiers. After a process has
> exited its pid can be reused by another process. If a caller sends a signal
> to a reused pid it will end up signaling the wrong process. This issue has
> often surfaced and there has been a push [1] to address this problem.
> 
> This patch uses file descriptors from proc/ as stable handles on
> struct pid. Even if a pid is recycled the handle will not change. The file
> descriptor can be used to send signals to the referenced process.
> Thus, the  new syscall procfd_signal() is introduced to solve this problem.
> It operates on a process file descriptor.
> The syscall takes an additional siginfo_t and flags argument. If siginfo_t
> is NULL then procfd_signal() behaves like kill(); if it is not NULL it
> behaves like rt_sigqueueinfo.
> The flags argument is added to allow for future extensions of this syscall.
> It currently needs to be passed as 0.
> 
> With this patch a process can be killed via:
> 
>  #define _GNU_SOURCE
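>  #include <errno.h>
>  #include <fcntl.h>
>  #include <signal.h>
>  #include <stdio.h>
>  #include <stdlib.h>
>  #include <string.h>
>  #include <sys/stat.h>
>  #include <sys/syscall.h>
>  #include <sys/types.h>
>  #include <unistd.h>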
> 
>  int main(int argc, char *argv[])
>  {
>  int ret;
>  char buf[1000];
> 
>  if (argc < 2)
>  exit(EXIT_FAILURE);
> 
>  ret = snprintf(buf, sizeof(buf), "/proc/%s", argv[1]);
>  if (ret < 0)

  || ret >= sizeof(buf) ? :-)  I mean, you *are* passing the string...

>  exit(EXIT_FAILURE);
> 
>  int fd = open(buf, O_DIRECTORY | O_CLOEXEC);
>  if (fd < 0) {
>  printf("%s - Failed to open \"%s\"\n", strerror(errno), buf);
>  exit(EXIT_FAILURE);
>  }
> 
>  ret = syscall(__NR_procfd_signal, fd, SIGKILL, NULL, 0);
>  if (ret < 0) {
>  printf("Failed to send SIGKILL \"%s\"\n", strerror(errno));
>  close(fd);
>  exit(EXIT_FAILURE);
>  }
> 
>  close(fd);
> 
>  exit(EXIT_SUCCESS);
>  }
> 
> [1]: https://lkml.org/lkml/2018/11/18/130
> 
> Cc: "Eric W. Biederman" 
> Cc: Serge Hallyn 

Acked-by: Serge Hallyn 

> Cc: Jann Horn 
> Cc: Kees Cook 
> Cc: Andy Lutomirsky 
> Cc: Andrew Morton 
> Cc: Oleg Nesterov 
> Cc: Aleksa Sarai 
> Cc: Al Viro 
> Signed-off-by: Christian Brauner 
> ---
> Changelog:
> v2:
> - define __NR_procfd_signal in unistd.h
> - wire up compat syscall
> - s/proc_is_procfd/proc_is_tgid_procfd/g
> - provide stubs when CONFIG_PROC_FS=n
> - move proc_pid() to linux/proc_fs.h header
> - use proc_pid() to grab struct pid from /proc/ fd
> v1:
> - patch introduced
> ---
>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |   2 +
>  fs/proc/base.c |  11 ++-
>  fs/proc/internal.h |   5 -
>  include/linux/proc_fs.h|  12 +++
>  include/linux/syscalls.h   |   2 +
>  include/uapi/asm-generic/unistd.h  |   4 +-
>  kernel/signal.c| 127 +++--
>  8 files changed, 151 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl 
> b/arch/x86/entry/syscalls/syscall_32.tbl
> index 3cf7b533b3d1..3f27ffd8ae87 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -398,3 +398,4 @@
>  384  i386arch_prctl  sys_arch_prctl  
> __ia32_compat_sys_arch_prctl
>  385  i386io_pgetevents   sys_io_pgetevents   
> __ia32_compat_sys_io_pgetevents
>  386  i386rseqsys_rseq
> __ia32_sys_rseq
> +387  i386procfd_signal   sys_procfd_signal   
> __ia32_compat_sys_procfd_signal
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
> b/arch/x86/entry/syscalls/syscall_64.tbl
> index f0b1709a5ffb..8a30cde82450 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -343,6 +343,7 @@
>  332  common  statx   __x64_sys_statx
>  333  common  io_pgetevents   __x64_sys_io_pgetevents
>  334  common  rseq__x64_sys_rseq
> +335  64  procfd_signal   __x64_sys_procfd_signal
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> @@ -386,3 +387,4 @@
>  545  x32 execveat__x32_compat_sys_execveat/ptregs
>  546  x32 preadv2 __x32_compat_sys_preadv64v2
>  547  x32 pwritev2__x32_compat_sys_pwritev64v2
> +548  x32 procfd_signal   __x32_compat_sys_procfd_signal
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index ce3465479447..771c6bd1cac6 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -716,7 +716,10 @@ static int proc_pid_permission(struct inode *inode, int 
> mask)
>   return 

Re: [PATCH v2] signal: add procfd_signal() syscall

2018-11-22 Thread Serge E. Hallyn
On Tue, Nov 20, 2018 at 11:51:23AM +0100, Christian Brauner wrote:
> The kill() syscall operates on process identifiers. After a process has
> exited its pid can be reused by another process. If a caller sends a signal
> to a reused pid it will end up signaling the wrong process. This issue has
> often surfaced and there has been a push [1] to address this problem.
> 
> This patch uses file descriptors from proc/ as stable handles on
> struct pid. Even if a pid is recycled the handle will not change. The file
> descriptor can be used to send signals to the referenced process.
> Thus, the  new syscall procfd_signal() is introduced to solve this problem.
> It operates on a process file descriptor.
> The syscall takes an additional siginfo_t and flags argument. If siginfo_t
> is NULL then procfd_signal() behaves like kill(); if it is not NULL it
> behaves like rt_sigqueueinfo().
> The flags argument is added to allow for future extensions of this syscall.
> It currently needs to be passed as 0.
> 
> With this patch a process can be killed via:
> 
>  #define _GNU_SOURCE
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
> 
>  int main(int argc, char *argv[])
>  {
>  int ret;
>  char buf[1000];
> 
>  if (argc < 2)
>  exit(EXIT_FAILURE);
> 
>  ret = snprintf(buf, sizeof(buf), "/proc/%s", argv[1]);
>  if (ret < 0)

  || ret >= sizeof(buf) ? :-)  I mean, you *are* passing the string...

>  exit(EXIT_FAILURE);
> 
>  int fd = open(buf, O_DIRECTORY | O_CLOEXEC);
>  if (fd < 0) {
>  printf("%s - Failed to open \"%s\"\n", strerror(errno), buf);
>  exit(EXIT_FAILURE);
>  }
> 
>  ret = syscall(__NR_procfd_signal, fd, SIGKILL, NULL, 0);
>  if (ret < 0) {
>  printf("Failed to send SIGKILL \"%s\"\n", strerror(errno));
>  close(fd);
>  exit(EXIT_FAILURE);
>  }
> 
>  close(fd);
> 
>  exit(EXIT_SUCCESS);
>  }
> 
> [1]: https://lkml.org/lkml/2018/11/18/130
> 
> Cc: "Eric W. Biederman" 
> Cc: Serge Hallyn 

Acked-by: Serge Hallyn 

> Cc: Jann Horn 
> Cc: Kees Cook 
> Cc: Andy Lutomirski 
> Cc: Andrew Morton 
> Cc: Oleg Nesterov 
> Cc: Aleksa Sarai 
> Cc: Al Viro 
> Signed-off-by: Christian Brauner 
> ---
> Changelog:
> v2:
> - define __NR_procfd_signal in unistd.h
> - wire up compat syscall
> - s/proc_is_procfd/proc_is_tgid_procfd/g
> - provide stubs when CONFIG_PROC_FS=n
> - move proc_pid() to linux/proc_fs.h header
> - use proc_pid() to grab struct pid from /proc/ fd
> v1:
> - patch introduced
> ---
>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |   2 +
>  fs/proc/base.c |  11 ++-
>  fs/proc/internal.h |   5 -
>  include/linux/proc_fs.h|  12 +++
>  include/linux/syscalls.h   |   2 +
>  include/uapi/asm-generic/unistd.h  |   4 +-
>  kernel/signal.c| 127 +++--
>  8 files changed, 151 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl 
> b/arch/x86/entry/syscalls/syscall_32.tbl
> index 3cf7b533b3d1..3f27ffd8ae87 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -398,3 +398,4 @@
>  384  i386arch_prctl  sys_arch_prctl  
> __ia32_compat_sys_arch_prctl
>  385  i386io_pgetevents   sys_io_pgetevents   
> __ia32_compat_sys_io_pgetevents
>  386  i386rseqsys_rseq
> __ia32_sys_rseq
> +387  i386procfd_signal   sys_procfd_signal   
> __ia32_compat_sys_procfd_signal
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
> b/arch/x86/entry/syscalls/syscall_64.tbl
> index f0b1709a5ffb..8a30cde82450 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -343,6 +343,7 @@
>  332  common  statx   __x64_sys_statx
>  333  common  io_pgetevents   __x64_sys_io_pgetevents
>  334  common  rseq__x64_sys_rseq
> +335  64  procfd_signal   __x64_sys_procfd_signal
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> @@ -386,3 +387,4 @@
>  545  x32 execveat__x32_compat_sys_execveat/ptregs
>  546  x32 preadv2 __x32_compat_sys_preadv64v2
>  547  x32 pwritev2__x32_compat_sys_pwritev64v2
> +548  x32 procfd_signal   __x32_compat_sys_procfd_signal
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index ce3465479447..771c6bd1cac6 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -716,7 +716,10 @@ static int proc_pid_permission(struct inode *inode, int 
> mask)
>   return 

Re: [PATCH v1 2/2] signal: add procfd_signal() syscall

2018-11-21 Thread Serge E. Hallyn
On Mon, Nov 19, 2018 at 03:39:54PM -0700, Tycho Andersen wrote:
> On Mon, Nov 19, 2018 at 11:32:39AM +0100, Christian Brauner wrote:
> >
> > +/**
> > + *  sys_procfd_signal - send a signal to a process through a process file
> > + *  descriptor
> > + *  @fd: the file descriptor of the process
> > + *  @sig: signal to be sent
> > + *  @info: the signal info
> > + *  @flags: future flags to be passed
> > + */
> > +SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig, siginfo_t __user *, info,
> > +   int, flags)
> > +{
> 
> Can I just register an objection here that I think using a syscall
> just for this is silly?
> 
> My understanding is that the concern is that some code might do:
> 
> unknown_fd = recv_fd();
> ioctl(unknown_fd, SOME_IOCTL, NULL); // where SOME_IOCTL == PROC_FD_KILL
> // whoops, unknown_fd was a procfd and we killed a task!

This could just be my own mental model, but for something like "kill a
task", an ioctl just seems wrong.  Syscall seems more natural.

I'd ack either method.

-serge


Re: [PATCH v1 2/2] signal: add procfd_signal() syscall

2018-11-21 Thread Serge E. Hallyn
On Mon, Nov 19, 2018 at 03:39:54PM -0700, Tycho Andersen wrote:
> On Mon, Nov 19, 2018 at 11:32:39AM +0100, Christian Brauner wrote:
> >
> > +/**
> > + *  sys_procfd_signal - send a signal to a process through a process file
> > + *  descriptor
> > + *  @fd: the file descriptor of the process
> > + *  @sig: signal to be sent
> > + *  @info: the signal info
> > + *  @flags: future flags to be passed
> > + */
> > +SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig, siginfo_t __user *, info,
> > +   int, flags)
> > +{
> 
> Can I just register an objection here that I think using a syscall
> just for this is silly?
> 
> My understanding is that the concern is that some code might do:
> 
> unknown_fd = recv_fd();
> ioctl(unknown_fd, SOME_IOCTL, NULL); // where SOME_IOCTL == PROC_FD_KILL
> // whoops, unknown_fd was a procfd and we killed a task!

This could just be my own mental model, but for something like "kill a
task", an ioctl just seems wrong.  Syscall seems more natural.

I'd ack either method.

-serge


Re: [PATCH v1 2/2] signal: add procfd_signal() syscall

2018-11-21 Thread Serge E. Hallyn
On Tue, Nov 20, 2018 at 08:23:43AM +1100, Aleksa Sarai wrote:
> On 2018-11-20, Aleksa Sarai  wrote:
> > On 2018-11-19, Christian Brauner  wrote:
> > > On Tue, Nov 20, 2018 at 07:28:57AM +1100, Aleksa Sarai wrote:
> > > > On 2018-11-19, Christian Brauner  wrote:
> > > > > + if (info) {
> > > > > > + ret = __copy_siginfo_from_user(sig, &kinfo, info);
> > > > > + if (unlikely(ret))
> > > > > + goto err;
> > > > > + /*
> > > > > +  * Not even root can pretend to send signals from the 
> > > > > kernel.
> > > > > +  * Nor can they impersonate a kill()/tgkill(), which 
> > > > > adds
> > > > > +  * source info.
> > > > > +  */
> > > > > + ret = -EPERM;
> > > > > + if ((kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL) &&
> > > > > + (task_pid(current) != pid))
> > > > > + goto err;
> > > > > + } else {
> > > > > > + prepare_kill_siginfo(sig, &kinfo);
> > > > > + }
> > > > 
> > > > I wonder whether we should also have a pidns restriction here, since
> > > > currently it isn't possible for a container process using a pidns to
> > > > signal processes outside its pidns. AFAICS, this isn't done through an
> > > > explicit check -- it's a side-effect of processes in a pidns not being
> > > > able to address non-descendant-pidns processes.
> > > > 
> > > > But maybe it's reasonable to allow sending a procfd to a different pidns
> > > > and the same operations working on it? If we extend the procfd API to
> > > 
> > > No, I don't think so. I really don't want any fancy semantics in here.
> > > Fancy doesn't get merged and fancy is hard to maintain. So we should do
> > > something like:
> > > 
> > > if (proc_pid_ns() != current_pid_ns)
> > >   return EINVAL
> > 
> > This isn't quite sufficient. The key thing is that you have to be in an
> > *ancestor* (or same) pidns, not the *same* pidns. Ideally you can re-use
> > the check already in pidns_get_parent, and expose it. It would be
> > something as trivial as:
> > 
> > bool pidns_is_descendant(struct pid_namespace *ns,
> >  struct pid_namespace *ancestor)
> > {
> > for (;;) {
> > if (!ns)
> > return false;
> > if (ns == ancestor)
> > break;
> > ns = ns->parent;
> > }
> > return true;
> > }
> > 
> > And you can rewrite pidns_get_parent to use it. So you would instead be
> > doing:
> > 
> > if (pidns_is_descendant(proc_pid_ns, task_active_pid_ns(current)))
> > return -EPERM;
> 
> Scratch the last bit, -EPERM is wrong here. I would argue that -EINVAL
> is *somewhat* wrong because arguably the more semantically consistent
> error (with kill(2)) would be -ESRCH -- but then you're mixing the "pid
> is dead" and "pid is not visible to you" cases. I'm not sure what the
> right errno would be here (I'm sure some of the LKML greybeards will
> have a better clue.) :P

Actually I like EXDEV for this.  EREMOTE also works.


Re: [PATCH v1 2/2] signal: add procfd_signal() syscall

2018-11-21 Thread Serge E. Hallyn
On Tue, Nov 20, 2018 at 08:23:43AM +1100, Aleksa Sarai wrote:
> On 2018-11-20, Aleksa Sarai  wrote:
> > On 2018-11-19, Christian Brauner  wrote:
> > > On Tue, Nov 20, 2018 at 07:28:57AM +1100, Aleksa Sarai wrote:
> > > > On 2018-11-19, Christian Brauner  wrote:
> > > > > + if (info) {
> > > > > > + ret = __copy_siginfo_from_user(sig, &kinfo, info);
> > > > > + if (unlikely(ret))
> > > > > + goto err;
> > > > > + /*
> > > > > +  * Not even root can pretend to send signals from the 
> > > > > kernel.
> > > > > +  * Nor can they impersonate a kill()/tgkill(), which 
> > > > > adds
> > > > > +  * source info.
> > > > > +  */
> > > > > + ret = -EPERM;
> > > > > + if ((kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL) &&
> > > > > + (task_pid(current) != pid))
> > > > > + goto err;
> > > > > + } else {
> > > > > > + prepare_kill_siginfo(sig, &kinfo);
> > > > > + }
> > > > 
> > > > I wonder whether we should also have a pidns restriction here, since
> > > > currently it isn't possible for a container process using a pidns to
> > > > signal processes outside its pidns. AFAICS, this isn't done through an
> > > > explicit check -- it's a side-effect of processes in a pidns not being
> > > > able to address non-descendant-pidns processes.
> > > > 
> > > > But maybe it's reasonable to allow sending a procfd to a different pidns
> > > > and the same operations working on it? If we extend the procfd API to
> > > 
> > > No, I don't think so. I really don't want any fancy semantics in here.
> > > Fancy doesn't get merged and fancy is hard to maintain. So we should do
> > > something like:
> > > 
> > > if (proc_pid_ns() != current_pid_ns)
> > >   return EINVAL
> > 
> > This isn't quite sufficient. The key thing is that you have to be in an
> > *ancestor* (or same) pidns, not the *same* pidns. Ideally you can re-use
> > the check already in pidns_get_parent, and expose it. It would be
> > something as trivial as:
> > 
> > bool pidns_is_descendant(struct pid_namespace *ns,
> >  struct pid_namespace *ancestor)
> > {
> > for (;;) {
> > if (!ns)
> > return false;
> > if (ns == ancestor)
> > break;
> > ns = ns->parent;
> > }
> > return true;
> > }
> > 
> > And you can rewrite pidns_get_parent to use it. So you would instead be
> > doing:
> > 
> > if (pidns_is_descendant(proc_pid_ns, task_active_pid_ns(current)))
> > return -EPERM;
> 
> Scratch the last bit, -EPERM is wrong here. I would argue that -EINVAL
> is *somewhat* wrong because arguably the more semantically consistent
> error (with kill(2)) would be -ESRCH -- but then you're mixing the "pid
> is dead" and "pid is not visible to you" cases. I'm not sure what the
> right errno would be here (I'm sure some of the LKML greybeards will
> have a better clue.) :P

Actually I like EXDEV for this.  EREMOTE also works.


Re: [PATCH v1 2/2] signal: add procfd_signal() syscall

2018-11-21 Thread Serge E. Hallyn
On Tue, Nov 20, 2018 at 11:31:13AM +0100, Christian Brauner wrote:
> On Mon, Nov 19, 2018 at 10:59:12PM -0600, Eric W. Biederman wrote:
> > Daniel Colascione  writes:
> > 
> > > On Mon, Nov 19, 2018 at 1:37 PM Christian Brauner  
> > > wrote:
> > >>
> > >> On Mon, Nov 19, 2018 at 01:26:22PM -0800, Daniel Colascione wrote:
> > >> > On Mon, Nov 19, 2018 at 1:21 PM, Christian Brauner 
> > >> >  wrote:
> > >> > > That can be done without a loop by comparing the level counter for 
> > >> > > the
> > >> > > two pid namespaces.
> > >> > >
> > >> > >>
> > >> > >> And you can rewrite pidns_get_parent to use it. So you would 
> > >> > >> instead be
> > >> > >> doing:
> > >> > >>
> > >> > >> if (pidns_is_descendant(proc_pid_ns, 
> > >> > >> task_active_pid_ns(current)))
> > >> > >> return -EPERM;
> > >> > >>
> > >> > >> (Or you can just copy the 5-line loop into procfd_signal -- though I
> > >> > >> imagine we'll need this for all of the procfd_* APIs.)
> > >> >
> > >> > Why is any of this even necessary? Why does the child namespace we're
> > >> > considering even have a file descriptor to its ancestor's procfs? If
> > >>
> > >> Because you can send file descriptors between processes and container
> > >> runtimes tend to do that.
> > >
> > > Right. But why *would* a container runtime send one of these procfs
> > > FDs to a container?
> > >
> > >> > it has one of these FDs, it can already *read* all sorts of
> > >> > information it really shouldn't be able to acquire, so the additional
> > >> > ability to send a signal (subject to the usual permission checks)
> > >> > feels like sticking a finger in a dike that's already well-perforated.
> > >> > IMHO, we shouldn't bother with this check. The patch would be simpler
> > >> > without it.
> > >>
> > >> We will definitely not allow signaling processes in an ancestor pid
> > >> namespace! That is a security issue! I can imagine container runtimes
> > >> killing their monitoring process etc. pp. Not happening, unless someone
> > >> with deep expertise in signals can convince me otherwise.
> > >
> > > If parent namespace procfs FDs or mounts really can leak into child
> > > namespaces as easily as Aleksa says, then I don't mind adding the
> > > check. I was under the impression that if you find yourself in this
> > > situation, you already have a big problem.
> > 
> > There is one big reason to have the check, and I have not seen it
> > mentioned yet in this thread.
> > 
> > When SI_USER is set we report the pid of the sender of the signal in
> > si_pid.  When the signal comes from the kernel si_pid == 0.  When signal
> > is sent from an ancestor pid namespace si_pid also equals 0 (which is
> > reasonable).
> > 
> > A signal out to a process in a parent pid namespace such as SIGCHLD is
> > reasonable as we can map the pid.  I really don't see the point of
> > forbidding that.  From the perspective of the process in the parent pid
> > > namespace it is just another process in its pid namespace.  So it
> > should pose no problem from the perspective of the receiving process.
> > 
> > A signal to a process in a pid namespace that is neither a parent nor a
> > > descendant pid namespace would be a problem, as there is no well defined
> > notion of what si_pid should be set to.  So for that case perhaps we
> > should have something like a noprocess pid that we can set.  Perhaps we
> > > could set si_pid to 0xffffffff.  That would take a small extension to
> > pid_nr_ns.
> > 
> > File descriptors are not namespaced.  It is completely legitimate to use
> > file descriptors to get around limitations of namespaces.
> 
> Frankly, I don't see a good argument for why we would allow that even if
> safe. I have not heard a legitimate use-case or need for this.
> At this point I care about very simple semantics. Being able to signal
> into ancestor pid namespaces and cousin namespaces is interesting but
> makes the syscall more brittle and harder to understand.

Yeah, I'm with you on that.  We can always open that door later if a good
use case comes up, but I prefer simple at first.

> Changing pid_nr_ns() might be the solution but this function is called
> all over the place in the kernel and I'm not going to risk breaking
> something by changing it for a feature that no one so far has ever
> asked for.
> If you are ok with this then we should hold off on this. We can always
> add this feature later by removing the check when someone has a use-case
> for it.
> I'll send a v2 of the patch that keeps the restriction for now. If you
> insist on it being removed we can make the change in a follow-up
> iteration.
> 
> Christian
> 
> > 
> > Adding limitations to a file descriptor based api because someone else
> > can't set up their processes in such a way as to get the restrictions
> > they are looking for seems very sad.
> > 
> > Frankly I think it is one of the better features of namespaces that we
> > have to carefully handle and define these cases so that when the
> > inevitable 

Re: [PATCH v1 2/2] signal: add procfd_signal() syscall

2018-11-21 Thread Serge E. Hallyn
On Tue, Nov 20, 2018 at 11:31:13AM +0100, Christian Brauner wrote:
> On Mon, Nov 19, 2018 at 10:59:12PM -0600, Eric W. Biederman wrote:
> > Daniel Colascione  writes:
> > 
> > > On Mon, Nov 19, 2018 at 1:37 PM Christian Brauner  
> > > wrote:
> > >>
> > >> On Mon, Nov 19, 2018 at 01:26:22PM -0800, Daniel Colascione wrote:
> > >> > On Mon, Nov 19, 2018 at 1:21 PM, Christian Brauner 
> > >> >  wrote:
> > >> > > That can be done without a loop by comparing the level counter for 
> > >> > > the
> > >> > > two pid namespaces.
> > >> > >
> > >> > >>
> > >> > >> And you can rewrite pidns_get_parent to use it. So you would 
> > >> > >> instead be
> > >> > >> doing:
> > >> > >>
> > >> > >> if (pidns_is_descendant(proc_pid_ns, 
> > >> > >> task_active_pid_ns(current)))
> > >> > >> return -EPERM;
> > >> > >>
> > >> > >> (Or you can just copy the 5-line loop into procfd_signal -- though I
> > >> > >> imagine we'll need this for all of the procfd_* APIs.)
> > >> >
> > >> > Why is any of this even necessary? Why does the child namespace we're
> > >> > considering even have a file descriptor to its ancestor's procfs? If
> > >>
> > >> Because you can send file descriptors between processes and container
> > >> runtimes tend to do that.
> > >
> > > Right. But why *would* a container runtime send one of these procfs
> > > FDs to a container?
> > >
> > >> > it has one of these FDs, it can already *read* all sorts of
> > >> > information it really shouldn't be able to acquire, so the additional
> > >> > ability to send a signal (subject to the usual permission checks)
> > >> > feels like sticking a finger in a dike that's already well-perforated.
> > >> > IMHO, we shouldn't bother with this check. The patch would be simpler
> > >> > without it.
> > >>
> > >> We will definitely not allow signaling processes in an ancestor pid
> > >> namespace! That is a security issue! I can imagine container runtimes
> > >> killing their monitoring process etc. pp. Not happening, unless someone
> > >> with deep expertise in signals can convince me otherwise.
> > >
> > > If parent namespace procfs FDs or mounts really can leak into child
> > > namespaces as easily as Aleksa says, then I don't mind adding the
> > > check. I was under the impression that if you find yourself in this
> > > situation, you already have a big problem.
> > 
> > There is one big reason to have the check, and I have not seen it
> > mentioned yet in this thread.
> > 
> > When SI_USER is set we report the pid of the sender of the signal in
> > si_pid.  When the signal comes from the kernel si_pid == 0.  When signal
> > is sent from an ancestor pid namespace si_pid also equals 0 (which is
> > reasonable).
> > 
> > A signal out to a process in a parent pid namespace such as SIGCHLD is
> > reasonable as we can map the pid.  I really don't see the point of
> > forbidding that.  From the perspective of the process in the parent pid
> > > namespace it is just another process in its pid namespace.  So it
> > should pose no problem from the perspective of the receiving process.
> > 
> > A signal to a process in a pid namespace that is neither a parent nor a
> > > descendant pid namespace would be a problem, as there is no well defined
> > notion of what si_pid should be set to.  So for that case perhaps we
> > should have something like a noprocess pid that we can set.  Perhaps we
> > > could set si_pid to 0xffffffff.  That would take a small extension to
> > pid_nr_ns.
> > 
> > File descriptors are not namespaced.  It is completely legitimate to use
> > file descriptors to get around limitations of namespaces.
> 
> Frankly, I don't see a good argument for why we would allow that even if
> safe. I have not heard a legitimate use-case or need for this.
> At this point I care about very simple semantics. Being able to signal
> into ancestor pid namespaces and cousin namespaces is interesting but
> makes the syscall more brittle and harder to understand.

Yeah, I'm with you on that.  We can always open that door later if a good
use case comes up, but I prefer simple at first.

> Changing pid_nr_ns() might be the solution but this function is called
> all over the place in the kernel and I'm not going to risk breaking
> something by changing it for a feature that no one so far has ever
> asked for.
> If you are ok with this then we should hold off on this. We can always
> add this feature later by removing the check when someone has a use-case
> for it.
> I'll send a v2 of the patch that keeps the restriction for now. If you
> insist on it being removed we can make the change in a follow-up
> iteration.
> 
> Christian
> 
> > 
> > Adding limitations to a file descriptor based api because someone else
> > can't set up their processes in such a way as to get the restrictions
> > they are looking for seems very sad.
> > 
> > Frankly I think it is one of the better features of namespaces that we
> > have to carefully handle and define these cases so that when the
> > inevitable 

Re: [RFC PATCH] Implement /proc/pid/kill

2018-11-02 Thread Serge E. Hallyn
Quoting Christian Brauner (christian.brau...@canonical.com):
> On Thu, Nov 01, 2018 at 01:40:59PM -0700, Joel Fernandes wrote:
> > On Tue, Oct 30, 2018 at 09:24:00PM -0700, Joel Fernandes wrote:
> > > On Tue, Oct 30, 2018 at 7:56 PM, Aleksa Sarai  wrote:
> > > > On 2018-10-31, Christian Brauner  
> > > > wrote:
> > > >> > I think Aleksa's larger point is that it's useful to treat processes
> > > >> > as other file-descriptor-named, poll-able, wait-able resources.
> > > >> > Consistency is important. A process is just another system resource,
> > > >> > and like any other system resource, you should be open to hold a file
> > > >> > descriptor to it and do things to that process via that file
> > > >> > descriptor. The precise form of this process-handle FD is up for
> > > >> > debate. The existing /proc/$PID directory FD is a good candidate for 
> > > >> > a
> > > >> > process handle FD, since it does almost all of what's needed. But
> > > >> > regardless of what form a process handle FD takes, we need it. I 
> > > >> > don't
> > > >> > see a case for continuing to treat processes in a non-unixy,
> > > >> > non-file-descriptor-based manner.
> > > >>
> > > >> That's what I'm proposing in the API for which I'm gathering feedback.
> > > >> I have presented parts of this in various discussions at LSS Europe 
> > > >> last week
> > > >> and will be at LPC.
> > > >> We don't want to rush an API like this though. It was tried before in
> > > >> other forms
> > > >> and these proposals didn't make it.
> > > >
> > > > :+1: on a well thought-out and generic proposal. As we've discussed
> > > > elsewhere, this is an issue that really would be great to (finally)
> > > > solve.
> > > 
> > > Excited to see this and please count me in for discussions around this. 
> > > thanks.
> > > 
> > 
> > Just a quick question, is there a track planned at LPC for discussing this
> > new proposal or topics around/related to the proposal?
> > 
> > If not, should that be planned?
> 
> There isn't currently one planned but I'm happy to have a hallway track
> session around this.
> 
> But note, I think not all relevant people are going to be there (e.g.
> Andy). File descriptors for processes seems interesting to a lot of
> people so I'm going to send out a pitch of the idea I have and see how
> much I'm going to get yelled at latest on Tuesday. Even if it just
> triggers a design discussion.
> I have been urged by people I pitched this to to send it to lkml
> already. Sorry for the delay and the initial non-transparency. The only
> reason I didn't do it right away was to ensure that this idea is not
> completely crazy. :) (Eric probably still thinks I am though. :))
> It's just that I'm at a conference and I want to have a nicer writeup of
> this. Given the speed with which this is all coming I have given up on
> preparing a first set of patches. :)
> 
> Christian

Sounds good, thanks, looking forward to it.


Re: [RFC PATCH] Implement /proc/pid/kill

2018-11-02 Thread Serge E. Hallyn
Quoting Christian Brauner (christian.brau...@canonical.com):
> On Thu, Nov 01, 2018 at 01:40:59PM -0700, Joel Fernandes wrote:
> > On Tue, Oct 30, 2018 at 09:24:00PM -0700, Joel Fernandes wrote:
> > > On Tue, Oct 30, 2018 at 7:56 PM, Aleksa Sarai  wrote:
> > > > On 2018-10-31, Christian Brauner  
> > > > wrote:
> > > >> > I think Aleksa's larger point is that it's useful to treat processes
> > > >> > as other file-descriptor-named, poll-able, wait-able resources.
> > > >> > Consistency is important. A process is just another system resource,
> > > >> > and like any other system resource, you should be open to hold a file
> > > >> > descriptor to it and do things to that process via that file
> > > >> > descriptor. The precise form of this process-handle FD is up for
> > > >> > debate. The existing /proc/$PID directory FD is a good candidate for 
> > > >> > a
> > > >> > process handle FD, since it does almost all of what's needed. But
> > > >> > regardless of what form a process handle FD takes, we need it. I 
> > > >> > don't
> > > >> > see a case for continuing to treat processes in a non-unixy,
> > > >> > non-file-descriptor-based manner.
> > > >>
> > > >> That's what I'm proposing in the API for which I'm gathering feedback.
> > > >> I have presented parts of this in various discussions at LSS Europe 
> > > >> last week
> > > >> and will be at LPC.
> > > >> We don't want to rush an API like this though. It was tried before in
> > > >> other forms
> > > >> and these proposals didn't make it.
> > > >
> > > > :+1: on a well thought-out and generic proposal. As we've discussed
> > > > elsewhere, this is an issue that really would be great to (finally)
> > > > solve.
> > > 
> > > Excited to see this and please count me in for discussions around this. 
> > > thanks.
> > > 
> > 
> > Just a quick question, is there a track planned at LPC for discussing this
> > new proposal or topics around/related to the proposal?
> > 
> > If not, should that be planned?
> 
> There isn't currently one planned but I'm happy to have a hallway track
> session around this.
> 
> But note, I think not all relevant people are going to be there (e.g.
> Andy). File descriptors for processes seems interesting to a lot of
> people so I'm going to send out a pitch of the idea I have and see how
> much I'm going to get yelled at latest on Tuesday. Even if it just
> triggers a design discussion.
> I have been urged by people I pitched this to to send it to lkml
> already. Sorry for the delay and the initial non-transparency. The only
> reason I didn't do it right away was to ensure that this idea is not
> completely crazy. :) (Eric probably still thinks I am though. :))
> It's just that I'm at a conference and I want to have a nicer writeup of
> this. Given the speed with which this is all coming I have given up on
> preparing a first set of patches. :)
> 
> Christian

Sounds good, thanks, looking forward to it.


Re: [PATCH v8 2/2] samples: add an example of seccomp user trap

2018-10-29 Thread Serge E. Hallyn
On Mon, Oct 29, 2018 at 04:40:31PM -0600, Tycho Andersen wrote:
> The idea here is just to give a demonstration of how one could safely use
> the SECCOMP_RET_USER_NOTIF feature to do mount policies. This particular
> policy is (as noted in the comment) not very interesting, but it serves to
> illustrate how one might apply a policy dodging the various TOCTOU issues.
> 
> Signed-off-by: Tycho Andersen 
> CC: Kees Cook 
> CC: Andy Lutomirski 
> CC: Oleg Nesterov 
> CC: Eric W. Biederman 
> CC: "Serge E. Hallyn" 
> CC: Christian Brauner 
> CC: Tyler Hicks 
> CC: Akihiro Suda 
> ---
> v5: new in v5
> v7: updates for v7 API changes
> v8: * add some more comments about what's happening in main() (Kees)
> * move from ptrace API to SECCOMP_FILTER_FLAG_NEW_LISTENER
> ---
>  samples/seccomp/.gitignore  |   1 +
>  samples/seccomp/Makefile|   7 +-
>  samples/seccomp/user-trap.c | 345 
>  3 files changed, 352 insertions(+), 1 deletion(-)
> 
> diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore
> index 78fb78184291..d1e2e817d556 100644
> --- a/samples/seccomp/.gitignore
> +++ b/samples/seccomp/.gitignore
> @@ -1,3 +1,4 @@
>  bpf-direct
>  bpf-fancy
>  dropper
> +user-trap
> diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile
> index cf34ff6b4065..4920903c8009 100644
> --- a/samples/seccomp/Makefile
> +++ b/samples/seccomp/Makefile
> @@ -1,6 +1,6 @@
>  # SPDX-License-Identifier: GPL-2.0
>  ifndef CROSS_COMPILE
> -hostprogs-$(CONFIG_SAMPLE_SECCOMP) := bpf-fancy dropper bpf-direct
> +hostprogs-$(CONFIG_SAMPLE_SECCOMP) := bpf-fancy dropper bpf-direct user-trap
>  
>  HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include
>  HOSTCFLAGS_bpf-fancy.o += -idirafter $(objtree)/include
> @@ -16,6 +16,10 @@ HOSTCFLAGS_bpf-direct.o += -I$(objtree)/usr/include
>  HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include
>  bpf-direct-objs := bpf-direct.o
>  
> +HOSTCFLAGS_user-trap.o += -I$(objtree)/usr/include
> +HOSTCFLAGS_user-trap.o += -idirafter $(objtree)/include
> +user-trap-objs := user-trap.o
> +
>  # Try to match the kernel target.
>  ifndef CONFIG_64BIT
>  
> @@ -33,6 +37,7 @@ HOSTCFLAGS_bpf-fancy.o += $(MFLAG)
>  HOSTLDLIBS_bpf-direct += $(MFLAG)
>  HOSTLDLIBS_bpf-fancy += $(MFLAG)
>  HOSTLDLIBS_dropper += $(MFLAG)
> +HOSTLDLIBS_user-trap += $(MFLAG)
>  endif
>  always := $(hostprogs-m)
>  endif
> diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c
> new file mode 100644
> index 000000000000..bba7ac803c6c
> --- /dev/null
> +++ b/samples/seccomp/user-trap.c
> @@ -0,0 +1,345 @@
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
> +
> +static int seccomp(unsigned int op, unsigned int flags, void *args)
> +{
> + errno = 0;
> + return syscall(__NR_seccomp, op, flags, args);
> +}
> +
> +static int send_fd(int sock, int fd)
> +{
> + struct msghdr msg = {};
> + struct cmsghdr *cmsg;
> + char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
> + struct iovec io = {
> + .iov_base = &c,
> + .iov_len = 1,
> + };
> +
> + msg.msg_iov = &io;
> + msg.msg_iovlen = 1;
> + msg.msg_control = buf;
> + msg.msg_controllen = sizeof(buf);
> + cmsg = CMSG_FIRSTHDR(&msg);
> + cmsg->cmsg_level = SOL_SOCKET;
> + cmsg->cmsg_type = SCM_RIGHTS;
> + cmsg->cmsg_len = CMSG_LEN(sizeof(int));
> + *((int *)CMSG_DATA(cmsg)) = fd;
> + msg.msg_controllen = cmsg->cmsg_len;
> +
> + if (sendmsg(sock, &msg, 0) < 0) {
> + perror("sendmsg");
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> +static int recv_fd(int sock)
> +{
> + struct msghdr msg = {};
> + struct cmsghdr *cmsg;
> + char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
> + struct iovec io = {
> + .iov_base = &c,
> + .iov_len = 1,
> + };
> +
> + msg.msg_iov = &io;
> + msg.msg_iovlen = 1;
> + msg.msg_control = buf;
> + msg.msg_controllen = sizeof(buf);
> +
> + if (recvmsg(sock, &msg, 0) < 0) {
> + perror("recvmsg");
> + return -1;
> + }
> +
> + cmsg = CMSG_FIRSTHDR(&msg);
> +
> + return *((int *)CMSG_DATA(cmsg));
> +}
> +
> +static

Re: [PATCH v8 2/2] samples: add an example of seccomp user trap

2018-10-29 Thread Serge E. Hallyn
On Mon, Oct 29, 2018 at 04:40:31PM -0600, Tycho Andersen wrote:
> The idea here is just to give a demonstration of how one could safely use
> the SECCOMP_RET_USER_NOTIF feature to do mount policies. This particular
> policy is (as noted in the comment) not very interesting, but it serves to
> illustrate how one might apply a policy dodging the various TOCTOU issues.
> 
> Signed-off-by: Tycho Andersen 
> CC: Kees Cook 
> CC: Andy Lutomirski 
> CC: Oleg Nesterov 
> CC: Eric W. Biederman 
> CC: "Serge E. Hallyn" 
> CC: Christian Brauner 
> CC: Tyler Hicks 
> CC: Akihiro Suda 
> ---
> v5: new in v5
> v7: updates for v7 API changes
> v8: * add some more comments about what's happening in main() (Kees)
> * move from ptrace API to SECCOMP_FILTER_FLAG_NEW_LISTENER
> ---
>  samples/seccomp/.gitignore  |   1 +
>  samples/seccomp/Makefile|   7 +-
>  samples/seccomp/user-trap.c | 345 
>  3 files changed, 352 insertions(+), 1 deletion(-)
> 
> diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore
> index 78fb78184291..d1e2e817d556 100644
> --- a/samples/seccomp/.gitignore
> +++ b/samples/seccomp/.gitignore
> @@ -1,3 +1,4 @@
>  bpf-direct
>  bpf-fancy
>  dropper
> +user-trap
> diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile
> index cf34ff6b4065..4920903c8009 100644
> --- a/samples/seccomp/Makefile
> +++ b/samples/seccomp/Makefile
> @@ -1,6 +1,6 @@
>  # SPDX-License-Identifier: GPL-2.0
>  ifndef CROSS_COMPILE
> -hostprogs-$(CONFIG_SAMPLE_SECCOMP) := bpf-fancy dropper bpf-direct
> +hostprogs-$(CONFIG_SAMPLE_SECCOMP) := bpf-fancy dropper bpf-direct user-trap
>  
>  HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include
>  HOSTCFLAGS_bpf-fancy.o += -idirafter $(objtree)/include
> @@ -16,6 +16,10 @@ HOSTCFLAGS_bpf-direct.o += -I$(objtree)/usr/include
>  HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include
>  bpf-direct-objs := bpf-direct.o
>  
> +HOSTCFLAGS_user-trap.o += -I$(objtree)/usr/include
> +HOSTCFLAGS_user-trap.o += -idirafter $(objtree)/include
> +user-trap-objs := user-trap.o
> +
>  # Try to match the kernel target.
>  ifndef CONFIG_64BIT
>  
> @@ -33,6 +37,7 @@ HOSTCFLAGS_bpf-fancy.o += $(MFLAG)
>  HOSTLDLIBS_bpf-direct += $(MFLAG)
>  HOSTLDLIBS_bpf-fancy += $(MFLAG)
>  HOSTLDLIBS_dropper += $(MFLAG)
> +HOSTLDLIBS_user-trap += $(MFLAG)
>  endif
>  always := $(hostprogs-m)
>  endif
> diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c
> new file mode 100644
> index 000000000000..bba7ac803c6c
> --- /dev/null
> +++ b/samples/seccomp/user-trap.c
> @@ -0,0 +1,345 @@
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
> +
> +static int seccomp(unsigned int op, unsigned int flags, void *args)
> +{
> + errno = 0;
> + return syscall(__NR_seccomp, op, flags, args);
> +}
> +
> +static int send_fd(int sock, int fd)
> +{
> + struct msghdr msg = {};
> + struct cmsghdr *cmsg;
> + char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
> + struct iovec io = {
> + .iov_base = &c,
> + .iov_len = 1,
> + };
> +
> + msg.msg_iov = &io;
> + msg.msg_iovlen = 1;
> + msg.msg_control = buf;
> + msg.msg_controllen = sizeof(buf);
> + cmsg = CMSG_FIRSTHDR(&msg);
> + cmsg->cmsg_level = SOL_SOCKET;
> + cmsg->cmsg_type = SCM_RIGHTS;
> + cmsg->cmsg_len = CMSG_LEN(sizeof(int));
> + *((int *)CMSG_DATA(cmsg)) = fd;
> + msg.msg_controllen = cmsg->cmsg_len;
> +
> + if (sendmsg(sock, &msg, 0) < 0) {
> + perror("sendmsg");
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> +static int recv_fd(int sock)
> +{
> + struct msghdr msg = {};
> + struct cmsghdr *cmsg;
> + char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
> + struct iovec io = {
> + .iov_base = &c,
> + .iov_len = 1,
> + };
> +
> + msg.msg_iov = &io;
> + msg.msg_iovlen = 1;
> + msg.msg_control = buf;
> + msg.msg_controllen = sizeof(buf);
> +
> + if (recvmsg(sock, &msg, 0) < 0) {
> + perror("recvmsg");
> + return -1;
> + }
> +
> + cmsg = CMSG_FIRSTHDR(&msg);
> +
> + return *((int *)CMSG_DATA(cmsg));
> +}
> +
> +static

Re: [RFC v4 1/1] ns: add binfmt_misc to the user namespace

2018-10-06 Thread Serge E. Hallyn
On Sat, Oct 06, 2018 at 09:35:46PM +0200, Laurent Vivier wrote:
> This patch allows to have a different binfmt_misc configuration
> for each new user namespace. By default, the binfmt_misc configuration
> is the one of the previous level, but if the binfmt_misc filesystem is
> mounted in the new namespace a new empty binfmt instance is created and
> used in this namespace.
> 
> For instance, using "unshare" we can start a chroot of an another
> architecture and configure the binfmt_misc interpreter without being root
> to run the binaries in this chroot.
> 
> Signed-off-by: Laurent Vivier 

Hi,

quick question below,

> ---
>  fs/binfmt_misc.c   | 99 --
>  include/linux/user_namespace.h | 13 +
>  kernel/user.c  | 13 +
>  kernel/user_namespace.c|  7 +++
>  4 files changed, 104 insertions(+), 28 deletions(-)
> 
> diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
> index aa4a7a23ff99..1beefafcb416 100644
> --- a/fs/binfmt_misc.c
> +++ b/fs/binfmt_misc.c
> @@ -38,9 +38,6 @@ enum {
>   VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
>  };
>  
> -static LIST_HEAD(entries);
> -static int enabled = 1;
> -
>  enum {Enabled, Magic};
>  #define MISC_FMT_PRESERVE_ARGV0 (1 << 31)
>  #define MISC_FMT_OPEN_BINARY (1 << 30)
> @@ -60,10 +57,7 @@ typedef struct {
>   struct file *interp_file;
>  } Node;
>  
> -static DEFINE_RWLOCK(entries_lock);
>  static struct file_system_type bm_fs_type;
> -static struct vfsmount *bm_mnt;
> -static int entry_count;
>  
>  /*
>   * Max length of the register string.  Determined by:
> @@ -80,18 +74,28 @@ static int entry_count;
>   */
>  #define MAX_REGISTER_LENGTH 1920
>  
> +static struct binfmt_namespace *binfmt_ns(struct user_namespace *ns)
> +{
> + while (ns) {
> + if (ns->binfmt_ns)
> + return ns->binfmt_ns;
> + ns = ns->parent;
> + }
> + return NULL;
> +}
> +
>  /*
>   * Check if we support the binfmt
>   * if we do, return the node, else NULL
>   * locking is done in load_misc_binary
>   */
> -static Node *check_file(struct linux_binprm *bprm)
> +static Node *check_file(struct binfmt_namespace *ns, struct linux_binprm 
> *bprm)
>  {
>   char *p = strrchr(bprm->interp, '.');
>   struct list_head *l;
>  
>   /* Walk all the registered handlers. */
> - list_for_each(l, &entries) {
> + list_for_each(l, &ns->entries) {
>   Node *e = list_entry(l, Node, list);
>   char *s;
>   int j;
> @@ -133,17 +137,18 @@ static int load_misc_binary(struct linux_binprm *bprm)
>   struct file *interp_file = NULL;
>   int retval;
>   int fd_binary = -1;
> + struct binfmt_namespace *ns = binfmt_ns(current_user_ns());
>  
>   retval = -ENOEXEC;
> - if (!enabled)
> + if (!ns->enabled)
>   return retval;
>  
>   /* to keep locking time low, we copy the interpreter string */
> - read_lock(&entries_lock);
> - fmt = check_file(bprm);
> + read_lock(&ns->entries_lock);
> + fmt = check_file(ns, bprm);
>   if (fmt)
>   dget(fmt->dentry);
> - read_unlock(&entries_lock);
> + read_unlock(&ns->entries_lock);
>   if (!fmt)
>   return retval;
>  
> @@ -609,19 +614,19 @@ static void bm_evict_inode(struct inode *inode)
>   kfree(e);
>  }
>  
> -static void kill_node(Node *e)
> +static void kill_node(struct binfmt_namespace *ns, Node *e)
>  {
>   struct dentry *dentry;
>  
> - write_lock(&entries_lock);
> + write_lock(&ns->entries_lock);
>   list_del_init(&e->list);
> - write_unlock(&entries_lock);
> + write_unlock(&ns->entries_lock);
>  
>   dentry = e->dentry;
>   drop_nlink(d_inode(dentry));
>   d_drop(dentry);
>   dput(dentry);
> - simple_release_fs(&bm_mnt, &entry_count);
> + simple_release_fs(&ns->bm_mnt, &ns->entry_count);
>  }
>  
>  /* / */
> @@ -651,6 +656,9 @@ static ssize_t bm_entry_write(struct file *file, const 
> char __user *buffer,
>   struct dentry *root;
>   Node *e = file_inode(file)->i_private;
>   int res = parse_command(buffer, count);
> + struct binfmt_namespace *ns;
> +
> + ns = binfmt_ns(file->f_path.dentry->d_sb->s_user_ns);
>  
>   switch (res) {
>   case 1:
> @@ -667,7 +675,7 @@ static ssize_t bm_entry_write(struct file *file, const 
> char __user *buffer,
>   inode_lock(d_inode(root));
>  
>   if (!list_empty(&e->list))
> - kill_node(e);
> + kill_node(ns, e);
>  
>   inode_unlock(d_inode(root));
>   break;
> @@ -693,6 +701,7 @@ static ssize_t bm_register_write(struct file *file, const 
> char __user *buffer,
>   struct inode *inode;
>   struct super_block *sb = file_inode(file)->i_sb;
>   struct dentry *root = sb->s_root, *dentry;
> + struct binfmt_namespace *ns;
>   int err = 0;
>  
>   e = create_entry(buffer, count);
> @@ -716,7 +725,9 @@ static ssize_t 

  1   2   3   4   5   6   7   8   9   10   >