Re: [PATCH 2/2] security.capability: fix conversions on getxattr

2021-01-28 Thread Eric W. Biederman
Miklos Szeredi  writes:

> On Thu, Jan 28, 2021 at 9:24 PM Eric W. Biederman  
> wrote:
>
>> 
>> From our previous discussions I would also argue it would be good
>> if there was a bypass that skipped all conversions if the reader
>> and the filesystem are in the same user namespace.
>> 
>
> That's however just an optimization (AFAICS) that only makes sense if
> it helps a real-world workload.  I'm not convinced that that's the
> case.

It is definitely a different issue.

From previous conversations with Serge, there is a concern with a
sysadmin wanting to see what is actually on disk, in case there are
bugs that care about the different layout.  Just passing everything
through when no translation is necessary will allow that kind of
diagnosis.

As your patch demonstrates, we have already had bugs in this area,
so being able to get at the raw data may help people if they get into
a situation where those bugs matter.
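
A sketch of the bypass being argued for: hand the raw xattr back when no
translation can possibly be needed.  The placement (at the top of
cap_inode_getsecurity(), after the xattr has been read into tmpbuf) is
hypothetical:

	/* Reader and filesystem share a user namespace: no conversion is
	 * needed, so return exactly what is on disk (raw v2 or v3 data).
	 */
	if (current_user_ns() == inode->i_sb->s_user_ns) {
		if (alloc)
			*buffer = tmpbuf;
		else
			kfree(tmpbuf);
		return ret;
	}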

Eric


Re: [PATCH 2/2] security.capability: fix conversions on getxattr

2021-01-28 Thread Eric W. Biederman
"Serge E. Hallyn"  writes:

> On Tue, Jan 19, 2021 at 07:34:49PM -0600, Eric W. Biederman wrote:
>> Miklos Szeredi  writes:
>> 
>> > If a capability is stored on disk in v2 format cap_inode_getsecurity() will
>> > currently return in v2 format unconditionally.
>> >
>> > This is wrong: v2 cap should be equivalent to a v3 cap with zero rootid,
>> > and so the same conversions performed on it.
>> >
>> > If the rootid cannot be mapped v3 is returned unconverted.  Fix this so
>> > that both v2 and v3 return -EOVERFLOW if the rootid (or the owner of the fs
>> > user namespace in case of v2) cannot be mapped in the current user
>> > namespace.
>> 
>> This looks like a good cleanup.
>
> Sorry, I'm not following.  Why is this a good cleanup?  Why should
> the xattr be shown as faked v3 in this case?

If the reader is in init_user_ns but the filesystem was mounted in a
user namespace, then the reader loses the information that the
capability xattr only applies to a subset of user namespaces.

A trivial place where this would be important is if userspace were to
copy the file and the associated capability xattr to another
filesystem that is mounted differently.
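
For reference, the two on-disk formats differ only in the trailing
rootid field (struct layouts as in include/uapi/linux/capability.h);
a v2 xattr is in effect a v3 xattr whose rootid is 0:

	/* v2: no rootid, implicitly rooted in the filesystem's user ns */
	struct vfs_cap_data {
		__le32 magic_etc;
		struct {
			__le32 permitted;
			__le32 inheritable;
		} data[VFS_CAP_U32];
	};

	/* v3: adds rootid, the uid of "root" in the user namespace the
	 * capability applies to, expressed in the fs user namespace
	 */
	struct vfs_ns_cap_data {
		__le32 magic_etc;
		struct {
			__le32 permitted;
			__le32 inheritable;
		} data[VFS_CAP_U32];
		__le32 rootid;
	};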



From our previous discussions I would also argue it would be good
if there was a bypass that skipped all conversions if the reader
and the filesystem are in the same user namespace.



> A separate question below.
>
>> I do wonder how well this works with stacking.  In particular
>> ovl_xattr_set appears to call vfs_getxattr without overriding the creds.
>> What the purpose of that is I haven't quite figured out.  It looks like
>> it is just a probe to see if an xattr is present so maybe it is ok.
>> 
>> Acked-by: "Eric W. Biederman" 
>> 
>> >
>> > Signed-off-by: Miklos Szeredi 
>> > ---
>> >  security/commoncap.c | 67 
>> >  1 file changed, 43 insertions(+), 24 deletions(-)
>> >
>> > diff --git a/security/commoncap.c b/security/commoncap.c
>> > index baccd871..c9d99f8f4c82 100644
>> > --- a/security/commoncap.c
>> > +++ b/security/commoncap.c
>> > @@ -371,10 +371,11 @@ int cap_inode_getsecurity(struct inode *inode, const 
>> > char *name, void **buffer,
>> >  {
>> >int size, ret;
>> >kuid_t kroot;
>> > +  __le32 nsmagic, magic;
>> >uid_t root, mappedroot;
>> >char *tmpbuf = NULL;
>> >struct vfs_cap_data *cap;
>> > -  struct vfs_ns_cap_data *nscap;
>> > +  struct vfs_ns_cap_data *nscap = NULL;
>> >struct dentry *dentry;
>> >struct user_namespace *fs_ns;
>> >  
>> > @@ -396,46 +397,61 @@ int cap_inode_getsecurity(struct inode *inode, const 
>> > char *name, void **buffer,
>> >fs_ns = inode->i_sb->s_user_ns;
>> >cap = (struct vfs_cap_data *) tmpbuf;
>> >if (is_v2header((size_t) ret, cap)) {
>> > -  /* If this is sizeof(vfs_cap_data) then we're ok with the
>> > -   * on-disk value, so return that.  */
>> > -  if (alloc)
>> > -  *buffer = tmpbuf;
>> > -  else
>> > -  kfree(tmpbuf);
>> > -  return ret;
>> > -  } else if (!is_v3header((size_t) ret, cap)) {
>> > -  kfree(tmpbuf);
>> > -  return -EINVAL;
>> > +  root = 0;
>> > +  } else if (is_v3header((size_t) ret, cap)) {
>> > +  nscap = (struct vfs_ns_cap_data *) tmpbuf;
>> > +  root = le32_to_cpu(nscap->rootid);
>> > +  } else {
>> > +  size = -EINVAL;
>> > +  goto out_free;
>> >}
>> >  
>> > -  nscap = (struct vfs_ns_cap_data *) tmpbuf;
>> > -  root = le32_to_cpu(nscap->rootid);
>> >kroot = make_kuid(fs_ns, root);
>> >  
>> >/* If the root kuid maps to a valid uid in current ns, then return
>> > * this as a nscap. */
>> >mappedroot = from_kuid(current_user_ns(), kroot);
>> >if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
>> > +  size = sizeof(struct vfs_ns_cap_data);
>> >if (alloc) {
>> > -  *buffer = tmpbuf;
>> > +  if (!nscap) {
>> > +  /* v2 -> v3 conversion */
>> > +  nscap = kzalloc(size, GFP_ATOMIC);
>> > +  if (!nscap) {
>> > +

Re: [PATCH v2 7/7] fs: HKDF - remove duplicate memory clearing

2021-01-28 Thread Eric Biggers
On Sun, Jan 24, 2021 at 03:04:50PM +0100, Stephan Müller wrote:
> The clearing of the OKM memory buffer in case of an error is already
> performed by the HKDF implementation crypto_hkdf_expand. Thus, the
> code clearing is not needed any more in the file system code base.
> 
> Signed-off-by: Stephan Mueller 
> ---
>  fs/crypto/hkdf.c | 9 +++--
>  1 file changed, 3 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c
> index ae236b42b1f0..c48dd8ca3a46 100644
> --- a/fs/crypto/hkdf.c
> +++ b/fs/crypto/hkdf.c
> @@ -102,13 +102,10 @@ int fscrypt_hkdf_expand(const struct fscrypt_hkdf 
> *hkdf, u8 context,
>   .iov_base = (u8 *)info,
>   .iov_len = infolen,
>   } };
> - int err = crypto_hkdf_expand(hkdf->hmac_tfm,
> -  info_iov, ARRAY_SIZE(info_iov),
> -  okm, okmlen);
>  
> - if (unlikely(err))
> - memzero_explicit(okm, okmlen); /* so caller doesn't need to */
> - return err;
> + return crypto_hkdf_expand(hkdf->hmac_tfm,
> +   info_iov, ARRAY_SIZE(info_iov),
> +   okm, okmlen);
>  }
>  

Shouldn't this just be folded into the previous patch, which converted
fscrypt_hkdf_expand() to use crypto_hkdf_expand() in the first place?

- Eric


Re: [PATCH v2 6/7] fs: use HKDF implementation from kernel crypto API

2021-01-28 Thread Eric Biggers
On Sun, Jan 24, 2021 at 03:04:31PM +0100, Stephan Müller wrote:
> @@ -74,16 +57,14 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 
> *master_key,
>   return PTR_ERR(hmac_tfm);
>   }
>  
> - if (WARN_ON(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) {
> + if (WARN_ON(crypto_shash_digestsize(hmac_tfm) != HKDF_HASHLEN)) {
>   err = -EINVAL;
>   goto err_free_tfm;
>   }
>  
> - err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk);
> - if (err)
> - goto err_free_tfm;
> -
> - err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk));
> + /* HKDF-Extract (RFC 5869 section 2.2), unsalted */
> + err = crypto_hkdf_extract(hmac_tfm, NULL, 0,
> +   master_key, master_key_size);
>   if (err)
>   goto err_free_tfm;
>  
> @@ -93,7 +74,6 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 
> *master_key,
>  err_free_tfm:
>   crypto_free_shash(hmac_tfm);
>  out:
> - memzero_explicit(prk, sizeof(prk));
>   return err;
>  }

The 'out' label isn't needed anymore.  'goto out' should be replaced with
'return 0'.
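
That is, the tail of fscrypt_init_hkdf() would end up roughly like this
(a sketch, not the exact patch):

	hkdf->hmac_tfm = hmac_tfm;
	return 0;

err_free_tfm:
	crypto_free_shash(hmac_tfm);
	return err;
}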

- Eric


Re: [PATCH v2 6/7] fs: use HKDF implementation from kernel crypto API

2021-01-28 Thread Eric Biggers
Please prefix the commit subject with "fscrypt: " rather than "fs: ".

On Sun, Jan 24, 2021 at 03:04:31PM +0100, Stephan Müller wrote:
> diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c
> index e0ec21055505..ae236b42b1f0 100644
> --- a/fs/crypto/hkdf.c
> +++ b/fs/crypto/hkdf.c
> @@ -9,7 +9,7 @@
>   * Copyright 2019 Google LLC
>   */
>  
> -#include 
> +#include 
>  #include 
>  
>  #include "fscrypt_private.h"
> @@ -37,23 +37,7 @@
>   * unnecessarily long master keys.  Thus fscrypt still does HKDF-Extract.  No
>   * salt is used, since fscrypt master keys should already be pseudorandom and
>   * there's no way to persist a random salt per master key from kernel mode.
> - */
> -
> -/* HKDF-Extract (RFC 5869 section 2.2), unsalted */
> -static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm,
> - unsigned int ikmlen, u8 prk[HKDF_HASHLEN])
> -{
> - static const u8 default_salt[HKDF_HASHLEN];
> - int err;
> -
> - err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN);
> - if (err)
> - return err;
> -
> - return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk);
> -}
> -
> -/*
> + *
>   * Compute HKDF-Extract using the given master key as the input keying 
> material,
>   * and prepare an HMAC transform object keyed by the resulting pseudorandom 
> key.
>   *

I don't think this comment should be joined with the one above it.  The earlier
comment describes the general approach taken with fscrypt and HKDF (including
all steps), while the one beginning with "Compute HKDF-Extract" describes
fscrypt_init_hkdf() specifically.

- Eric


Re: [PATCH v2 3/7] crypto: add RFC5869 HKDF

2021-01-28 Thread Eric Biggers
   shash_desc_zero(desc);
> + return err;
> +}
> +EXPORT_SYMBOL(crypto_hkdf_expand);

EXPORT_SYMBOL_GPL?

> +
> +/*
> + * HKDF extract phase.
> + */
> +int crypto_hkdf_extract(struct crypto_shash *kmd,
> + const u8 *salt, size_t saltlen,
> + const u8 *ikm, size_t ikmlen)
> +{
> + SHASH_DESC_ON_STACK(desc, kmd);
> + unsigned int h = crypto_shash_digestsize(kmd);
> + int err;
> + static const u8 null_salt[HASH_MAX_DIGESTSIZE] = { 0 };
> + u8 prk[HASH_MAX_DIGESTSIZE];
> +
> + desc->tfm = kmd;
> +
> + if (salt && saltlen) {

Checking 'salt && saltlen' like this is poor practice, as then if people
accidentally use salt == NULL && saltlen != 0 when they intended to use a salt,
the bug won't be detected.  Just doing 'if (saltlen)' would be better.

> +
> + /* Extract the PRK */
> + err = crypto_shash_init(desc);
> + if (err)
> + goto err;
> +
> + err = crypto_shash_finup(desc, ikm, ikmlen, prk);
> + if (err)
> + goto err;

This should use crypto_shash_digest() instead of crypto_shash_init() +
crypto_shash_finup().
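
Folding both of the comments above in, the extract phase might end up
roughly like this (a sketch only, not the final patch):

	int crypto_hkdf_extract(struct crypto_shash *kmd,
				const u8 *salt, size_t saltlen,
				const u8 *ikm, size_t ikmlen)
	{
		SHASH_DESC_ON_STACK(desc, kmd);
		static const u8 null_salt[HASH_MAX_DIGESTSIZE];
		u8 prk[HASH_MAX_DIGESTSIZE];
		unsigned int h = crypto_shash_digestsize(kmd);
		int err;

		desc->tfm = kmd;

		/* RFC 5869 section 2.2: an absent salt means HashLen zero bytes */
		if (!saltlen) {
			salt = null_salt;
			saltlen = h;
		}
		err = crypto_shash_setkey(kmd, salt, saltlen);
		if (err)
			return err;

		/* PRK = HMAC-Hash(salt, IKM), in a single digest call */
		err = crypto_shash_digest(desc, ikm, ikmlen, prk);
		if (err)
			goto out;

		/* Key the tfm with the PRK for the later expand phase */
		err = crypto_shash_setkey(kmd, prk, h);
	out:
		shash_desc_zero(desc);
		memzero_explicit(prk, sizeof(prk));
		return err;
	}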

> +/* Test vectors from RFC 5869 appendix A */
> +static const struct kdf_testvec hkdf_hmac_sha256_tv_template[] = {
> + {
> + /* salt */
> + .key = "\x00\x01\x02\x03\x04\x05\x06\x07"
> +"\x08\x09\x0a\x0b\x0c",
> + .keylen  = 13,
> + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
> +"\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
> +"\x0b\x0b\x0b\x0b\x0b\x0b",
> + .ikmlen = 22,
> + .info = {
> + .iov_base = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"
> + "\xf8\xf9",
> + .iov_len  = 10
> + },
> + .expected = "\x3c\xb2\x5f\x25\xfa\xac\xd5\x7a"
> + "\x90\x43\x4f\x64\xd0\x36\x2f\x2a"
> + "\x2d\x2d\x0a\x90\xcf\x1a\x5a\x4c"
> + "\x5d\xb0\x2d\x56\xec\xc4\xc5\xbf"
> + "\x34\x00\x72\x08\xd5\xb8\x87\x18"
> + "\x58\x65",
> + .expectedlen  = 42
> + }, {
> + /* salt */
> + .key = NULL,
> + .keylen  = 0,
> + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
> +"\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
> +"\x0b\x0b\x0b\x0b\x0b\x0b",
> + .ikmlen  = 22,
> + .info = {
> + .iov_base = NULL,
> + .iov_len  = 0
> + },
> + .expected = "\x8d\xa4\xe7\x75\xa5\x63\xc1\x8f"
> + "\x71\x5f\x80\x2a\x06\x3c\x5a\x31"
> + "\xb8\xa1\x1f\x5c\x5e\xe1\x87\x9e"
> + "\xc3\x45\x4e\x5f\x3c\x73\x8d\x2d"
> + "\x9d\x20\x13\x95\xfa\xa4\xb6\x1a"
> + "\x96\xc8",
> + .expectedlen  = 42
> + }
> +};

The case of multiple entries in the 'info' kvec isn't being tested.

Also, it is confusing having both 'key' and 'ikm'.  'key' really should be
'salt'.

> diff --git a/include/crypto/hkdf.h b/include/crypto/hkdf.h
> new file mode 100644
> index ..c6989f786860
> --- /dev/null
> +++ b/include/crypto/hkdf.h
> @@ -0,0 +1,48 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * Copyright (C) 2020, Stephan Mueller 
> + */
> +
> +#ifndef _CRYPTO_HKDF_H
> +#define _CRYPTO_HKDF_H
> +
> +#include 
> +#include 
> +
> +/**
> + * RFC 5869 HKDF expand operation
> + *
> + * @kmd Keyed message digest whose key was set with crypto_hkdf_extract
> + * @info optional context and application specific information - this may be
> + *NULL
> + * @info_vec number of optional context/application specific information 
> entries
> + * @dst destination buffer that the caller already allocated
> + * @dlen length of the destination buffer - the KDF derives that amount of
> + *bytes.
> + *
> + * @return 0 on success, < 0 on error
> + */
> +int crypto_hkdf_expand(struct crypto_shash *kmd,
> +const struct kvec *info, unsigned int info_nvec,
> +u8 *dst, unsigned int dlen);
> +
> +/**
> + * RFC 5869 HKDF extract operation
> + *
> + * @kmd Keyed message digest allocated by the caller. The key should not have
> + *   been set.
> + * @salt The salt used for the KDF. It is permissible to provide NULL as salt
> + *which implies that the default salt is used.
> + * @saltlen Length of the salt buffer.
> + * @ikm The input key material (IKM). It is permissible to provide NULL as 
> IKM.
> + * @ikmlen Length of the IKM buffer
> + * @seed_nvec number of seed entries (must be at least 1)

seed_nvec no longer exists.

- Eric


Re: [PATCH v2 1/1] kexec: dump kmessage before machine_kexec

2021-01-28 Thread Eric W. Biederman
Pavel Tatashin  writes:

> kmsg_dump(KMSG_DUMP_SHUTDOWN) is called before
> machine_restart(), machine_halt(), and machine_power_off(); the only one
> that is missing is machine_kexec().
>
> The dmesg output that it contains can be used to study the shutdown
> performance of both kernel and systemd during kexec reboot.
>
> Here is example of dmesg data collected after kexec:

As long as we keep kmsg_dump out of the crash_kexec path, where
it completely breaks kexec on panic, this seems a reasonable thing to do.
On the ordinary kernel_kexec path everything is expected to be working.

Is kmsg_dump expected to work after all of the device drivers
are shut down?  Otherwise this placement of kmsg_dump is too late.
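
For context, this is the ordering in question in kernel_kexec() for the
non-preserve_context path (annotated sketch, not actual kernel code):

	kernel_restart_prepare("kexec reboot");	/* calls device_shutdown():
						 * drivers are gone after this */
	migrate_to_reboot_cpu();
	machine_shutdown();			/* stop the remaining CPUs */

	kmsg_dump(KMSG_DUMP_SHUTDOWN);		/* the patch dumps here, so the
						 * pstore backend must still be
						 * usable without its driver */
	machine_kexec(kexec_image);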

Eric

> root@dplat-cp22:~# cat /sys/fs/pstore/dmesg-ramoops-0 | tail
> ...
> <6>[   70.914592] psci: CPU3 killed (polled 0 ms)
> <5>[   70.915705] CPU4: shutdown
> <6>[   70.916643] psci: CPU4 killed (polled 4 ms)
> <5>[   70.917715] CPU5: shutdown
> <6>[   70.918725] psci: CPU5 killed (polled 0 ms)
> <5>[   70.919704] CPU6: shutdown
> <6>[   70.920726] psci: CPU6 killed (polled 4 ms)
> <5>[   70.921642] CPU7: shutdown
> <6>[   70.922650] psci: CPU7 killed (polled 0 ms)
>
> Signed-off-by: Pavel Tatashin 
> Reviewed-by: Kees Cook 
> Reviewed-by: Petr Mladek 
> Reviewed-by: Bhupesh Sharma 
> ---
>  kernel/kexec_core.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index 4f8efc278aa7..e253c8b59145 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -37,6 +37,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
> @@ -1180,6 +1181,7 @@ int kernel_kexec(void)
>   machine_shutdown();
>   }
>  
> + kmsg_dump(KMSG_DUMP_SHUTDOWN);
>   machine_kexec(kexec_image);
>  
>  #ifdef CONFIG_KEXEC_JUMP


Re: [PATCH] netdevsim: init u64 stats for 32bit hardware

2021-01-28 Thread Eric Dumazet



On 1/28/21 8:23 AM, Dmitry Vyukov wrote:
> On Thu, Jan 28, 2021 at 3:43 AM Hillf Danton  wrote:
>>
>> Init the u64 stats in order to avoid the lockdep prints on the 32bit
>> hardware like
> 
> FTR this is not just to avoid lockdep prints, but also to prevent very
> real stalls in production.

Are you sure ?

> u64_stats_init initializes the seqlock; if the uninitialized
> seqlock->sequence happened to be odd, the kernel would stall.

Normally the whole netdev structure is zeroed when allocated,
this is done in alloc_netdev_mqs()

p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);

So unless kvzalloc() has been changed recently to inject random data
instead of 0, this bug is really about lockdep only.
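
The lockdep report comes from the seqcount never being registered.  A
minimal sketch of the initialization Hillf's patch is adding; the exact
placement in the netdevsim setup path is an assumption, but the syncp
field is the one read in nsim_get_stats64():

	/* ns is the struct netdevsim instance; call this during netdev
	 * setup, before register_netdevice():
	 */
	u64_stats_init(&ns->syncp);	/* registers the seqcount with lockdep
					 * and makes ->sequence start even */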

> 
> Maintainers, please send this upstream at your earliest convenience;
> this breaks all 32-bit arches for testing purposes.
> 
> Thanks
> 
>>  INFO: trying to register non-static key.
>>  the code is fine but needs lockdep annotation.
>>  turning off the locking correctness validator.
>>  CPU: 0 PID: 4695 Comm: syz-executor.0 Not tainted 5.11.0-rc5-syzkaller #0
>>  Hardware name: ARM-Versatile Express
>>  Backtrace:
>>  [<826fc5b8>] (dump_backtrace) from [<826fc82c>] (show_stack+0x18/0x1c 
>> arch/arm/kernel/traps.c:252)
>>  [<826fc814>] (show_stack) from [<8270d1f8>] (__dump_stack 
>> lib/dump_stack.c:79 [inline])
>>  [<826fc814>] (show_stack) from [<8270d1f8>] (dump_stack+0xa8/0xc8 
>> lib/dump_stack.c:120)
>>  [<8270d150>] (dump_stack) from [<802bf9c0>] (assign_lock_key 
>> kernel/locking/lockdep.c:935 [inline])
>>  [<8270d150>] (dump_stack) from [<802bf9c0>] 
>> (register_lock_class+0xabc/0xb68 kernel/locking/lockdep.c:1247)
>>  [<802bef04>] (register_lock_class) from [<802baa2c>] 
>> (__lock_acquire+0x84/0x32d4 kernel/locking/lockdep.c:4711)
>>  [<802ba9a8>] (__lock_acquire) from [<802be840>] 
>> (lock_acquire.part.0+0xf0/0x554 kernel/locking/lockdep.c:5442)
>>  [<802be750>] (lock_acquire.part.0) from [<802bed10>] 
>> (lock_acquire+0x6c/0x74 kernel/locking/lockdep.c:5415)
>>  [<802beca4>] (lock_acquire) from [<81560548>] 
>> (seqcount_lockdep_reader_access include/linux/seqlock.h:103 [inline])
>>  [<802beca4>] (lock_acquire) from [<81560548>] (__u64_stats_fetch_begin 
>> include/linux/u64_stats_sync.h:164 [inline])
>>  [<802beca4>] (lock_acquire) from [<81560548>] (u64_stats_fetch_begin 
>> include/linux/u64_stats_sync.h:175 [inline])
>>  [<802beca4>] (lock_acquire) from [<81560548>] (nsim_get_stats64+0xdc/0xf0 
>> drivers/net/netdevsim/netdev.c:70)
>>  [<8156046c>] (nsim_get_stats64) from [<81e2efa0>] (dev_get_stats+0x44/0xd0 
>> net/core/dev.c:10405)
>>  [<81e2ef5c>] (dev_get_stats) from [<81e53204>] (rtnl_fill_stats+0x38/0x120 
>> net/core/rtnetlink.c:1211)
>>  [<81e531cc>] (rtnl_fill_stats) from [<81e59d58>] 
>> (rtnl_fill_ifinfo+0x6d4/0x148c net/core/rtnetlink.c:1783)
>>  [<81e59684>] (rtnl_fill_ifinfo) from [<81e5ceb4>] 
>> (rtmsg_ifinfo_build_skb+0x9c/0x108 net/core/rtnetlink.c:3798)
>>  [<81e5ce18>] (rtmsg_ifinfo_build_skb) from [<81e5d0ac>] (rtmsg_ifinfo_event 
>> net/core/rtnetlink.c:3830 [inline])
>>  [<81e5ce18>] (rtmsg_ifinfo_build_skb) from [<81e5d0ac>] (rtmsg_ifinfo_event 
>> net/core/rtnetlink.c:3821 [inline])
>>  [<81e5ce18>] (rtmsg_ifinfo_build_skb) from [<81e5d0ac>] 
>> (rtmsg_ifinfo+0x44/0x70 net/core/rtnetlink.c:3839)
>>  [<81e5d068>] (rtmsg_ifinfo) from [<81e45c2c>] 
>> (register_netdevice+0x664/0x68c net/core/dev.c:10103)
>>  [<81e455c8>] (register_netdevice) from [<815608bc>] (nsim_create+0xf8/0x124 
>> drivers/net/netdevsim/netdev.c:317)
>>  [<815607c4>] (nsim_create) from [<81561184>] 
>> (__nsim_dev_port_add+0x108/0x188 drivers/net/netdevsim/dev.c:941)
>>  [<8156107c>] (__nsim_dev_port_add) from [<815620d8>] (nsim_dev_port_add_all 
>> drivers/net/netdevsim/dev.c:990 [inline])
>>  [<8156107c>] (__nsim_dev_port_add) from [<815620d8>] 
>> (nsim_dev_probe+0x5cc/0x750 drivers/net/netdevsim/dev.c:1119)
>>  [<81561b0c>] (nsim_dev_probe) from [<815661dc>] (nsim_bus_probe+0x10/0x14 
>> drivers/net/netdevsim/bus.c:287)
>>  [<815661cc>] (nsim_bus_probe) from [<811724c0>] (really_probe+0x100/0x50c 
>> drivers/base/dd.c:554)
>>  [<811723c0>] (really_probe) from [<811729c4>] 
>> (driver_probe_device+0xf8/0x1c8 drivers/base/dd.c:740)
>>  [<811728cc>] (driver_probe_device) from [<81172fe4>] 
>> (__device_attach_driver+0x8c/0xf0 drivers/base/dd.c:846)
>>  [<81172f58>] (__device_attach_driver) from [<8116fee0>] 
>> (bus_for_each_drv+0x88/0xd8 drivers/base/bus.c:431)
>>  [<8116fe58>] (bus_for_each_drv) from [<81172c6c>] 
>> (__device_attach+0xdc/0x1d0 drivers/base/dd.c:914)
>>  [<81172b90>] (__device_attach) from [<8117305c>] 
>> (device_initial_probe+0x14/0x18 drivers/base/dd.c:961)
>>  [<81173048>] (device_initial_probe) from [<81171358>] 
>> (bus_probe_device+0x90/0x98 drivers/base/bus.c:491)
>>  [<811712c8>] (bus_probe_device) from [<8116e77c>] (device_add+0x320/0x824 
>> drivers/base/core.c:3109)
>>  [<8116e45c>] (device_add) from [<8116ec9c>] (device_register+0x1c/0x20 
>> 

Re: [PATCH v5 0/4] Add EFI_CERT_X509_GUID support for dbx/mokx entries

2021-01-28 Thread Eric Snowberg


> On Jan 28, 2021, at 8:16 AM, David Howells  wrote:
> 
> Which tree do you envision this going through?  EFI or keyrings - or are you
> going to ask Linus to pull it directly?  I can pull it if it should go through
> the keyrings tree.

I was thinking it would go thru your tree, since a majority of the code
is contained within it.



Re: [PATCH v5 1/4] certs: Add EFI_CERT_X509_GUID support for dbx entries

2021-01-27 Thread Eric Snowberg


> On Jan 27, 2021, at 8:54 PM, Nayna  wrote:
> 
> 
> On 1/22/21 1:10 PM, Eric Snowberg wrote:
>> This fixes CVE-2020-26541.
>> 
>> The Secure Boot Forbidden Signature Database, dbx, contains a list of now
>> revoked signatures and keys previously approved to boot with UEFI Secure
>> Boot enabled.  The dbx is capable of containing any number of
>> EFI_CERT_X509_SHA256_GUID, EFI_CERT_SHA256_GUID, and EFI_CERT_X509_GUID
>> entries.
>> 
>> Currently when EFI_CERT_X509_GUID are contained in the dbx, the entries are
>> skipped.
>> 
>> Add support for EFI_CERT_X509_GUID dbx entries. When an EFI_CERT_X509_GUID
>> is found, it is added as an asymmetric key to the .blacklist keyring.
>> Anytime the .platform keyring is used, the keys in the .blacklist keyring
>> are referenced; if a matching key is found, the key will be rejected.
>> 
>> Signed-off-by: Eric Snowberg 
>> Reviewed-by: Jarkko Sakkinen 
>> Signed-off-by: David Howells 
>> ---
>> v5: Function name changes done by David Howells
>> ---
>>  certs/blacklist.c | 32 +++
>>  certs/blacklist.h | 12 +++
>>  certs/system_keyring.c|  6 
>>  include/keys/system_keyring.h | 11 +++
>>  .../platform_certs/keyring_handler.c  | 11 +++
>>  5 files changed, 72 insertions(+)
>> 
>> diff --git a/certs/blacklist.c b/certs/blacklist.c
>> index 6514f9ebc943..a7f021878a4b 100644
>> --- a/certs/blacklist.c
>> +++ b/certs/blacklist.c
>> @@ -100,6 +100,38 @@ int mark_hash_blacklisted(const char *hash)
>>  return 0;
>>  }
>> 
>> +int add_key_to_revocation_list(const char *data, size_t size)
>> +{
>> +key_ref_t key;
>> +
>> +key = key_create_or_update(make_key_ref(blacklist_keyring, true),
>> +   "asymmetric",
>> +   NULL,
>> +   data,
>> +   size,
>> +   ((KEY_POS_ALL & ~KEY_POS_SETATTR) | 
>> KEY_USR_VIEW),
>> +   KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN);
>> +
>> +if (IS_ERR(key)) {
>> +pr_err("Problem with revocation key (%ld)\n", PTR_ERR(key));
>> +return PTR_ERR(key);
>> +}
>> +
>> +return 0;
>> +}
>> +
>> +int is_key_on_revocation_list(struct pkcs7_message *pkcs7)
>> +{
>> +int ret;
>> +
>> +ret = validate_trust(pkcs7, blacklist_keyring);
>> +
>> +if (ret == 0)
>> +return -EKEYREJECTED;
>> +
>> +return -ENOKEY;
>> +}
>> +
>>  /**
>>   * is_hash_blacklisted - Determine if a hash is blacklisted
>>   * @hash: The hash to be checked as a binary blob
>> diff --git a/certs/blacklist.h b/certs/blacklist.h
>> index 1efd6fa0dc60..420bb7c86e07 100644
>> --- a/certs/blacklist.h
>> +++ b/certs/blacklist.h
>> @@ -1,3 +1,15 @@
>>  #include 
>> +#include 
>> +#include 
>> 
>>  extern const char __initconst *const blacklist_hashes[];
>> +
>> +#ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING
>> +#define validate_trust pkcs7_validate_trust
>> +#else
>> +static inline int validate_trust(struct pkcs7_message *pkcs7,
>> + struct key *trust_keyring)
>> +{
>> +return -ENOKEY;
>> +}
>> +#endif
>> diff --git a/certs/system_keyring.c b/certs/system_keyring.c
>> index 798291177186..cc165b359ea3 100644
>> --- a/certs/system_keyring.c
>> +++ b/certs/system_keyring.c
>> @@ -241,6 +241,12 @@ int verify_pkcs7_message_sig(const void *data, size_t 
>> len,
>>  pr_devel("PKCS#7 platform keyring is not available\n");
>>  goto error;
>>  }
>> +
>> +ret = is_key_on_revocation_list(pkcs7);
>> +if (ret != -ENOKEY) {
>> +pr_devel("PKCS#7 platform key is on revocation list\n");
>> +goto error;
>> +}
>>  }
>>  ret = pkcs7_validate_trust(pkcs7, trusted_keys);
>>  if (ret < 0) {
>> diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h
>> index fb8b07daa9d1..61f98739e8b1 100644
>> --- a/include/keys/system_keyring.h
>> +++ b/include/keys/system_keyring.h
>> @@ -31,11 +31,14 @@ extern int 
>> restrict_link_by_builti

[PATCH 3/3 v2] drm/msm: Clean up GMU OOB set/clear handling.

2021-01-27 Thread Eric Anholt
Now that the bug is fixed in the minimal way for stable, go make the
code table-driven.

Signed-off-by: Eric Anholt 
---

Previous version hadn't been rebased off of a bit of debug code I had,
so it wouldn't cleanly apply.

 drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 124 +-
 drivers/gpu/drm/msm/adreno/a6xx_gmu.h |  55 
 2 files changed, 77 insertions(+), 102 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
index b3318f86aabc..9066e98eb8ef 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
@@ -245,47 +245,66 @@ static int a6xx_gmu_hfi_start(struct a6xx_gmu *gmu)
return ret;
 }
 
+struct a6xx_gmu_oob_bits {
+   int set, ack, set_new, ack_new;
+   const char *name;
+};
+
+/* These are the interrupt / ack bits for each OOB request that are set
+ * in a6xx_gmu_set_oob and a6xx_clear_oob
+ */
+static const struct a6xx_gmu_oob_bits a6xx_gmu_oob_bits[] = {
+   [GMU_OOB_GPU_SET] = {
+   .name = "GPU_SET",
+   .set = 16,
+   .ack = 24,
+   .set_new = 30,
+   .ack_new = 31,
+   },
+
+   [GMU_OOB_PERFCOUNTER_SET] = {
+   .name = "PERFCOUNTER",
+   .set = 17,
+   .ack = 25,
+   .set_new = 28,
+   .ack_new = 30,
+   },
+
+   [GMU_OOB_BOOT_SLUMBER] = {
+   .name = "BOOT_SLUMBER",
+   .set = 22,
+   .ack = 30,
+   },
+
+   [GMU_OOB_DCVS_SET] = {
+   .name = "GPU_DCVS",
+   .set = 23,
+   .ack = 31,
+   },
+};
+
 /* Trigger a OOB (out of band) request to the GMU */
 int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state)
 {
int ret;
u32 val;
int request, ack;
-   const char *name;
 
-   switch (state) {
-   case GMU_OOB_GPU_SET:
-   if (gmu->legacy) {
-   request = GMU_OOB_GPU_SET_REQUEST;
-   ack = GMU_OOB_GPU_SET_ACK;
-   } else {
-   request = GMU_OOB_GPU_SET_REQUEST_NEW;
-   ack = GMU_OOB_GPU_SET_ACK_NEW;
-   }
-   name = "GPU_SET";
-   break;
-   case GMU_OOB_PERFCOUNTER_SET:
-   if (gmu->legacy) {
-   request = GMU_OOB_PERFCOUNTER_REQUEST;
-   ack = GMU_OOB_PERFCOUNTER_ACK;
-   } else {
-   request = GMU_OOB_PERFCOUNTER_REQUEST_NEW;
-   ack = GMU_OOB_PERFCOUNTER_ACK_NEW;
-   }
-   name = "PERFCOUNTER";
-   break;
-   case GMU_OOB_BOOT_SLUMBER:
-   request = GMU_OOB_BOOT_SLUMBER_REQUEST;
-   ack = GMU_OOB_BOOT_SLUMBER_ACK;
-   name = "BOOT_SLUMBER";
-   break;
-   case GMU_OOB_DCVS_SET:
-   request = GMU_OOB_DCVS_REQUEST;
-   ack = GMU_OOB_DCVS_ACK;
-   name = "GPU_DCVS";
-   break;
-   default:
+   if (state >= ARRAY_SIZE(a6xx_gmu_oob_bits))
return -EINVAL;
+
+   if (gmu->legacy) {
+   request = a6xx_gmu_oob_bits[state].set;
+   ack = a6xx_gmu_oob_bits[state].ack;
+   } else {
+   request = a6xx_gmu_oob_bits[state].set_new;
+   ack = a6xx_gmu_oob_bits[state].ack_new;
+   if (!request || !ack) {
+   DRM_DEV_ERROR(gmu->dev,
+ "Invalid non-legacy GMU request %s\n",
+ a6xx_gmu_oob_bits[state].name);
+   return -EINVAL;
+   }
}
 
/* Trigger the equested OOB operation */
@@ -298,7 +317,7 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum 
a6xx_gmu_oob_state state)
if (ret)
DRM_DEV_ERROR(gmu->dev,
"Timeout waiting for GMU OOB set %s: 0x%x\n",
-   name,
+   a6xx_gmu_oob_bits[state].name,
gmu_read(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO));
 
/* Clear the acknowledge interrupt */
@@ -310,36 +329,17 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum 
a6xx_gmu_oob_state state)
 /* Clear a pending OOB state in the GMU */
 void a6xx_gmu_clear_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state)
 {
-   if (!gmu->legacy) {
-   if (state == GMU_OOB_GPU_SET) {
-   gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET,
-   1 << GMU_OOB_GPU_SET_CLEAR_NEW);
-   } else {
-   WARN_ON(state != GMU_OOB_PERFCOUNTER_SET);
- 

[PATCH 2/3] drm/msm: Fix races managing the OOB state for timestamp vs timestamps.

2021-01-27 Thread Eric Anholt
Now that we're not racing with GPU setup, also fix races of timestamps
against other timestamps.  In CI, we were seeing this path trigger
timeouts on setting the GMU bit, especially on the first set of tests
right after boot (it's probably easier to lose the race than one might
think, given that we start many tests in parallel, and waiting for NFS
to page in code probably means that lots of tests hit the same point
of screen init at the same time).

Signed-off-by: Eric Anholt 
Cc: sta...@vger.kernel.org # v5.9
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 7424a70b9d35..e8f0b5325a7f 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1175,6 +1175,9 @@ static int a6xx_get_timestamp(struct msm_gpu *gpu, 
uint64_t *value)
 {
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
+   static DEFINE_MUTEX(perfcounter_oob);
+
+   mutex_lock(&perfcounter_oob);
 
/* Force the GPU power on so we can read this register */
a6xx_gmu_set_oob(&a6xx_gpu->gmu, GMU_OOB_PERFCOUNTER_SET);
@@ -1183,6 +1186,7 @@ static int a6xx_get_timestamp(struct msm_gpu *gpu, 
uint64_t *value)
REG_A6XX_RBBM_PERFCTR_CP_0_HI);
 
a6xx_gmu_clear_oob(&a6xx_gpu->gmu, GMU_OOB_PERFCOUNTER_SET);
+   mutex_unlock(&perfcounter_oob);
return 0;
 }
 
-- 
2.30.0



[PATCH 3/3] drm/msm: Clean up GMU OOB set/clear handling.

2021-01-27 Thread Eric Anholt
Now that the bug is fixed in the minimal way for stable, go make the
code table-driven.

Signed-off-by: Eric Anholt 
---
 drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 124 +-
 drivers/gpu/drm/msm/adreno/a6xx_gmu.h |  55 
 2 files changed, 77 insertions(+), 102 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
index 378dc7f190c3..c497e0942141 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
@@ -245,47 +245,66 @@ static int a6xx_gmu_hfi_start(struct a6xx_gmu *gmu)
return ret;
 }
 
+struct a6xx_gmu_oob_bits {
+   int set, ack, set_new, ack_new;
+   const char *name;
+};
+
+/* These are the interrupt / ack bits for each OOB request that are set
+ * in a6xx_gmu_set_oob and a6xx_clear_oob
+ */
+static const struct a6xx_gmu_oob_bits a6xx_gmu_oob_bits[] = {
+   [GMU_OOB_GPU_SET] = {
+   .name = "GPU_SET",
+   .set = 16,
+   .ack = 24,
+   .set_new = 30,
+   .ack_new = 31,
+   },
+
+   [GMU_OOB_PERFCOUNTER_SET] = {
+   .name = "PERFCOUNTER",
+   .set = 17,
+   .ack = 25,
+   .set_new = 28,
+   .ack_new = 30,
+   },
+
+   [GMU_OOB_BOOT_SLUMBER] = {
+   .name = "BOOT_SLUMBER",
+   .set = 22,
+   .ack = 30,
+   },
+
+   [GMU_OOB_DCVS_SET] = {
+   .name = "GPU_DCVS",
+   .set = 23,
+   .ack = 31,
+   },
+};
+
 /* Trigger a OOB (out of band) request to the GMU */
 int _a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state, 
char *file, int line)
 {
int ret;
u32 val;
int request, ack;
-   const char *name;
 
-   switch (state) {
-   case GMU_OOB_GPU_SET:
-   if (gmu->legacy) {
-   request = GMU_OOB_GPU_SET_REQUEST;
-   ack = GMU_OOB_GPU_SET_ACK;
-   } else {
-   request = GMU_OOB_GPU_SET_REQUEST_NEW;
-   ack = GMU_OOB_GPU_SET_ACK_NEW;
-   }
-   name = "GPU_SET";
-   break;
-   case GMU_OOB_PERFCOUNTER_SET:
-   if (gmu->legacy) {
-   request = GMU_OOB_PERFCOUNTER_REQUEST;
-   ack = GMU_OOB_PERFCOUNTER_ACK;
-   } else {
-   request = GMU_OOB_PERFCOUNTER_REQUEST_NEW;
-   ack = GMU_OOB_PERFCOUNTER_ACK_NEW;
-   }
-   name = "PERFCOUNTER";
-   break;
-   case GMU_OOB_BOOT_SLUMBER:
-   request = GMU_OOB_BOOT_SLUMBER_REQUEST;
-   ack = GMU_OOB_BOOT_SLUMBER_ACK;
-   name = "BOOT_SLUMBER";
-   break;
-   case GMU_OOB_DCVS_SET:
-   request = GMU_OOB_DCVS_REQUEST;
-   ack = GMU_OOB_DCVS_ACK;
-   name = "GPU_DCVS";
-   break;
-   default:
+   if (state >= ARRAY_SIZE(a6xx_gmu_oob_bits))
return -EINVAL;
+
+   if (gmu->legacy) {
+   request = a6xx_gmu_oob_bits[state].set;
+   ack = a6xx_gmu_oob_bits[state].ack;
+   } else {
+   request = a6xx_gmu_oob_bits[state].set_new;
+   ack = a6xx_gmu_oob_bits[state].ack_new;
+   if (!request || !ack) {
+   DRM_DEV_ERROR(gmu->dev,
+ "Invalid non-legacy GMU request %s\n",
+ a6xx_gmu_oob_bits[state].name);
+   return -EINVAL;
+   }
}
 
/* Trigger the equested OOB operation */
@@ -299,7 +318,7 @@ int _a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum 
a6xx_gmu_oob_state state, char
DRM_DEV_ERROR(gmu->dev,
"%s:%d Timeout waiting for GMU OOB set %s: 0x%x\n",
file, line,
-   name,
+   a6xx_gmu_oob_bits[state].name,
gmu_read(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO));
 
/* Clear the acknowledge interrupt */
@@ -311,36 +330,17 @@ int _a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum 
a6xx_gmu_oob_state state, char
 /* Clear a pending OOB state in the GMU */
 void a6xx_gmu_clear_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state)
 {
-   if (!gmu->legacy) {
-   if (state == GMU_OOB_GPU_SET) {
-   gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET,
-   1 << GMU_OOB_GPU_SET_CLEAR_NEW);
-   } else {
-   WARN_ON(state != GMU_OOB_PERFCOUNTER_SET);
-   gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET,
- 

[PATCH 1/3] drm/msm: Fix race of GPU init vs timestamp power management.

2021-01-27 Thread Eric Anholt
We were using the same force-poweron bit in the two codepaths, so they
could race to have one of them lose GPU power early.

Signed-off-by: Eric Anholt 
Cc: sta...@vger.kernel.org # v5.9
---
 drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 25 ++---
 drivers/gpu/drm/msm/adreno/a6xx_gmu.h |  8 
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c |  4 ++--
 3 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
index 78836b4fb98e..378dc7f190c3 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
@@ -264,6 +264,16 @@ int _a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum 
a6xx_gmu_oob_state state, char
}
name = "GPU_SET";
break;
+   case GMU_OOB_PERFCOUNTER_SET:
+   if (gmu->legacy) {
+   request = GMU_OOB_PERFCOUNTER_REQUEST;
+   ack = GMU_OOB_PERFCOUNTER_ACK;
+   } else {
+   request = GMU_OOB_PERFCOUNTER_REQUEST_NEW;
+   ack = GMU_OOB_PERFCOUNTER_ACK_NEW;
+   }
+   name = "PERFCOUNTER";
+   break;
case GMU_OOB_BOOT_SLUMBER:
request = GMU_OOB_BOOT_SLUMBER_REQUEST;
ack = GMU_OOB_BOOT_SLUMBER_ACK;
@@ -302,9 +312,14 @@ int _a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum 
a6xx_gmu_oob_state state, char
 void a6xx_gmu_clear_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state)
 {
if (!gmu->legacy) {
-   WARN_ON(state != GMU_OOB_GPU_SET);
-   gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET,
-   1 << GMU_OOB_GPU_SET_CLEAR_NEW);
+   if (state == GMU_OOB_GPU_SET) {
+   gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET,
+   1 << GMU_OOB_GPU_SET_CLEAR_NEW);
+   } else {
+   WARN_ON(state != GMU_OOB_PERFCOUNTER_SET);
+   gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET,
+   1 << GMU_OOB_PERFCOUNTER_CLEAR_NEW);
+   }
return;
}
 
@@ -313,6 +328,10 @@ void a6xx_gmu_clear_oob(struct a6xx_gmu *gmu, enum 
a6xx_gmu_oob_state state)
gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET,
1 << GMU_OOB_GPU_SET_CLEAR);
break;
+   case GMU_OOB_PERFCOUNTER_SET:
+   gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET,
+   1 << GMU_OOB_PERFCOUNTER_CLEAR);
+   break;
case GMU_OOB_BOOT_SLUMBER:
gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET,
1 << GMU_OOB_BOOT_SLUMBER_CLEAR);
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h 
b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
index c6d2bced8e5d..9fa278de2106 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h
@@ -156,6 +156,7 @@ enum a6xx_gmu_oob_state {
GMU_OOB_BOOT_SLUMBER = 0,
GMU_OOB_GPU_SET,
GMU_OOB_DCVS_SET,
+   GMU_OOB_PERFCOUNTER_SET,
 };
 
 /* These are the interrupt / ack bits for each OOB request that are set
@@ -190,6 +191,13 @@ enum a6xx_gmu_oob_state {
 #define GMU_OOB_GPU_SET_ACK_NEW31
 #define GMU_OOB_GPU_SET_CLEAR_NEW  31
 
+#define GMU_OOB_PERFCOUNTER_REQUEST17
+#define GMU_OOB_PERFCOUNTER_ACK25
+#define GMU_OOB_PERFCOUNTER_CLEAR  25
+
+#define GMU_OOB_PERFCOUNTER_REQUEST_NEW28
+#define GMU_OOB_PERFCOUNTER_ACK_NEW30
+#define GMU_OOB_PERFCOUNTER_CLEAR_NEW  30
 
 void a6xx_hfi_init(struct a6xx_gmu *gmu);
 int a6xx_hfi_start(struct a6xx_gmu *gmu, int boot_state);
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index c8a9010c1a1d..7424a70b9d35 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1177,12 +1177,12 @@ static int a6xx_get_timestamp(struct msm_gpu *gpu, 
uint64_t *value)
struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
 
/* Force the GPU power on so we can read this register */
-   a6xx_gmu_set_oob(&a6xx_gpu->gmu, GMU_OOB_GPU_SET);
+   a6xx_gmu_set_oob(&a6xx_gpu->gmu, GMU_OOB_PERFCOUNTER_SET);
 
*value = gpu_read64(gpu, REG_A6XX_RBBM_PERFCTR_CP_0_LO,
REG_A6XX_RBBM_PERFCTR_CP_0_HI);
 
-   a6xx_gmu_clear_oob(&a6xx_gpu->gmu, GMU_OOB_GPU_SET);
+   a6xx_gmu_clear_oob(&a6xx_gpu->gmu, GMU_OOB_PERFCOUNTER_SET);
return 0;
 }
 
-- 
2.30.0



Re: [PATCH net] net: Remove redundant calls of sk_tx_queue_clear().

2021-01-27 Thread Eric Dumazet
On Wed, Jan 27, 2021 at 6:56 PM Kuniyuki Iwashima  wrote:
>
> From:   Eric Dumazet 
> Date:   Wed, 27 Jan 2021 18:34:35 +0100
> > On Wed, Jan 27, 2021 at 6:32 PM Kuniyuki Iwashima  
> > wrote:
> > >
> > > From:   Eric Dumazet 
> > > Date:   Wed, 27 Jan 2021 18:05:24 +0100
> > > > On Wed, Jan 27, 2021 at 5:52 PM Kuniyuki Iwashima  
> > > > wrote:
> > > > >
> > > > > From:   Eric Dumazet 
> > > > > Date:   Wed, 27 Jan 2021 15:54:32 +0100
> > > > > > On Wed, Jan 27, 2021 at 1:50 PM Kuniyuki Iwashima 
> > > > > >  wrote:
> > > > > > >
> > > > > > > The commit 41b14fb8724d ("net: Do not clear the sock TX queue in
> > > > > > > sk_set_socket()") removes sk_tx_queue_clear() from 
> > > > > > > sk_set_socket() and adds
> > > > > > > it instead in sk_alloc() and sk_clone_lock() to fix an issue 
> > > > > > > introduced in
> > > > > > > the commit e022f0b4a03f ("net: Introduce sk_tx_queue_mapping"). 
> > > > > > > However,
> > > > > > > the original commit had already put sk_tx_queue_clear() in 
> > > > > > > sk_prot_alloc():
> > > > > > > the callee of sk_alloc() and sk_clone_lock(). Thus 
> > > > > > > sk_tx_queue_clear() is
> > > > > > > called twice in each path currently.
> > > > > >
> > > > > > Are you sure ?
> > > > > >
> > > > > > I do not clearly see the sk_tx_queue_clear() call from the cloning 
> > > > > > part.
> > > > > >
> > > > > > Please elaborate.
> > > > >
> > > > > If sk is not NULL in sk_prot_alloc(), sk_tx_queue_clear() is called 
> > > > > [1].
> > > > > Also the callers of sk_prot_alloc() are only sk_alloc() and 
> > > > > sk_clone_lock().
> > > > > If they finally return a non-NULL pointer, sk_tx_queue_clear() is 
> > > > > called in
> > > > > each function [2][3].
> > > > >
> > > > > In the cloning part, sock_copy() is called after sk_prot_alloc(), but
> > > > > skc_tx_queue_mapping is defined between skc_dontcopy_begin and
> > > > > skc_dontcopy_end in struct sock_common [4]. So, sock_copy() does not
> > > > > overwrite skc_tx_queue_mapping, and thus we can initialize it in
> > > > > sk_prot_alloc().
> > > >
> > > > That is a lot of assumptions.
> > > >
> > > > What guarantees do we have that skc_tx_queue_mapping will never be
> > > > moved out of this section ?
> > > > AFAIK it was there by accident, for cache locality reasons, that might
> > > > change in the future as we add more stuff in socket.
> > > >
> > > > I feel this optimization is risky for future changes, for a code path
> > > > that is spending thousands of cycles anyway.
> > >
> > > If someone tries to move skc_tx_queue_mapping out of the section, should
> > > they take care of where it is used ?
>
> I'm sorry if that was misleading; by "someone/they" I meant the author
> of a patch that moves skc_tx_queue_mapping.
>
>
> > Certainly not. You hide some knowledge, without a comment or some runtime 
> > check.
>
> It was my bad, I should have written about sock_copy() in the changelog.

I think you also want to add some compile-time check:

BUILD_BUG_ON( skc_tx_queue_mapping is in the no copy area)

Because maintainers do not keep changelogs in their heads.
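
Spelled out, such a check might look like this (a sketch; the marker
names are the ones currently in struct sock_common):

	/* Break the build if skc_tx_queue_mapping ever leaves the region
	 * that sock_copy() skips.
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
		     offsetof(struct sock, sk_dontcopy_begin) ||
		     offsetof(struct sock, sk_tx_queue_mapping) >=
		     offsetof(struct sock, sk_dontcopy_end));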


>
>
> > You can not ask us (maintainers) to remember thousands of tricks.
>
> I'll keep this in mind.
>
>
> > >
> > > But I agree that we should not write error-prone code.
> > >
> > > Currently, sk_tx_queue_clear() is the only initialization code in
> > > sk_prot_alloc(). So, does it make sense to remove sk_tx_queue_clear() from
> > > sk_prot_alloc() so that it only does allocation, and other fields are
> > > initialized in each caller ?
>
> Can I ask what you think about this ?

Yes, this would be fine.

>
>
> > > > >
> > > > > [1] sk_prot_alloc
> > > > > https://github.com/torvalds/linux/blob/master/net/core/sock.c#L1693
> > > > >
> > > > > [2] sk_alloc
> > > > > https://github.com/torvalds/linux/blob/master/net/c

Re: [PATCH net] net: Remove redundant calls of sk_tx_queue_clear().

2021-01-27 Thread Eric Dumazet
On Wed, Jan 27, 2021 at 6:32 PM Kuniyuki Iwashima  wrote:
>
> From:   Eric Dumazet 
> Date:   Wed, 27 Jan 2021 18:05:24 +0100
> > On Wed, Jan 27, 2021 at 5:52 PM Kuniyuki Iwashima  
> > wrote:
> > >
> > > From:   Eric Dumazet 
> > > Date:   Wed, 27 Jan 2021 15:54:32 +0100
> > > > On Wed, Jan 27, 2021 at 1:50 PM Kuniyuki Iwashima  
> > > > wrote:
> > > > >
> > > > > The commit 41b14fb8724d ("net: Do not clear the sock TX queue in
> > > > > sk_set_socket()") removes sk_tx_queue_clear() from sk_set_socket() 
> > > > > and adds
> > > > > it instead in sk_alloc() and sk_clone_lock() to fix an issue 
> > > > > introduced in
> > > > > the commit e022f0b4a03f ("net: Introduce sk_tx_queue_mapping"). 
> > > > > However,
> > > > > the original commit had already put sk_tx_queue_clear() in 
> > > > > sk_prot_alloc():
> > > > > the callee of sk_alloc() and sk_clone_lock(). Thus 
> > > > > sk_tx_queue_clear() is
> > > > > called twice in each path currently.
> > > >
> > > > Are you sure ?
> > > >
> > > > I do not clearly see the sk_tx_queue_clear() call from the cloning part.
> > > >
> > > > Please elaborate.
> > >
> > > If sk is not NULL in sk_prot_alloc(), sk_tx_queue_clear() is called [1].
> > > Also the callers of sk_prot_alloc() are only sk_alloc() and 
> > > sk_clone_lock().
> > > If they finally return a non-NULL pointer, sk_tx_queue_clear() is called in
> > > each function [2][3].
> > >
> > > In the cloning part, sock_copy() is called after sk_prot_alloc(), but
> > > skc_tx_queue_mapping is defined between skc_dontcopy_begin and
> > > skc_dontcopy_end in struct sock_common [4]. So, sock_copy() does not
> > > overwrite skc_tx_queue_mapping, and thus we can initialize it in
> > > sk_prot_alloc().
> >
> > That is a lot of assumptions.
> >
> > What guarantees do we have that skc_tx_queue_mapping will never be
> > moved out of this section ?
> > AFAIK it was there by accident, for cache locality reasons, that might
> > change in the future as we add more stuff in socket.
> >
> > I feel this optimization is risky for future changes, for a code path
> > that is spending thousands of cycles anyway.
>
> If someone tries to move skc_tx_queue_mapping out of the section, should
> they take care of where it is used ?

Certainly not. You hide some knowledge, without a comment or some runtime check.

You can not ask us (maintainers) to remember thousands of tricks.

>
> But I agree that we should not write error-prone code.
>
> Currently, sk_tx_queue_clear() is the only initialization code in
> sk_prot_alloc(). So, does it make sense to remove sk_tx_queue_clear() from
> sk_prot_alloc() so that it only does allocation, and other fields are
> initialized in each caller ?
>
>
> > >
> > > [1] sk_prot_alloc
> > > https://github.com/torvalds/linux/blob/master/net/core/sock.c#L1693
> > >
> > > [2] sk_alloc
> > > https://github.com/torvalds/linux/blob/master/net/core/sock.c#L1762
> > >
> > > [3] sk_clone_lock
> > > https://github.com/torvalds/linux/blob/master/net/core/sock.c#L1986
> > >
> > > [4] struct sock_common
> > > https://github.com/torvalds/linux/blob/master/include/net/sock.h#L218-L240
> > >
> > >
> > > > In any case, this seems to be a candidate for net-next, this is not
> > > > fixing a bug,
> > > > this would be an optimization at most, and potentially adding a bug.
> > > >
> > > > So if you resend this patch, you can mention the old commit in the 
> > > > changelog,
> > > > but do not add a dubious Fixes: tag
> > >
> > > I see.
> > >
> > > I will remove the tag and resend this as a net-next candidate.
> > >
> > > Thank you,
> > > Kuniyuki
> > >
> > >
> > > > >
> > > > > This patch removes the redundant calls of sk_tx_queue_clear() in 
> > > > > sk_alloc()
> > > > > and sk_clone_lock().
> > > > >
> > > > > Fixes: 41b14fb8724d ("net: Do not clear the sock TX queue in 
> > > > > sk_set_socket()")
> > > > > CC: Tariq Toukan 
> > > > > CC: Boris Pismenny 
> > > > > Signed-off-by: Kuniyuki Iwashi

Re: [PATCH net] net: Remove redundant calls of sk_tx_queue_clear().

2021-01-27 Thread Eric Dumazet
On Wed, Jan 27, 2021 at 5:52 PM Kuniyuki Iwashima  wrote:
>
> From:   Eric Dumazet 
> Date:   Wed, 27 Jan 2021 15:54:32 +0100
> > On Wed, Jan 27, 2021 at 1:50 PM Kuniyuki Iwashima  
> > wrote:
> > >
> > > The commit 41b14fb8724d ("net: Do not clear the sock TX queue in
> > > sk_set_socket()") removes sk_tx_queue_clear() from sk_set_socket() and 
> > > adds
> > > it instead in sk_alloc() and sk_clone_lock() to fix an issue introduced in
> > > the commit e022f0b4a03f ("net: Introduce sk_tx_queue_mapping"). However,
> > > the original commit had already put sk_tx_queue_clear() in 
> > > sk_prot_alloc():
> > > the callee of sk_alloc() and sk_clone_lock(). Thus sk_tx_queue_clear() is
> > > called twice in each path currently.
> >
> > Are you sure ?
> >
> > I do not clearly see the sk_tx_queue_clear() call from the cloning part.
> >
> > Please elaborate.
>
> If sk is not NULL in sk_prot_alloc(), sk_tx_queue_clear() is called [1].
> Also the callers of sk_prot_alloc() are only sk_alloc() and sk_clone_lock().
> If they finally return a non-NULL pointer, sk_tx_queue_clear() is called in
> each function [2][3].
>
> In the cloning part, sock_copy() is called after sk_prot_alloc(), but
> skc_tx_queue_mapping is defined between skc_dontcopy_begin and
> skc_dontcopy_end in struct sock_common [4]. So, sock_copy() does not
> overwrite skc_tx_queue_mapping, and thus we can initialize it in
> sk_prot_alloc().

That is a lot of assumptions.

What guarantees do we have that skc_tx_queue_mapping will never be
moved out of this section ?
AFAIK it was there by accident, for cache locality reasons, that might
change in the future as we add more stuff in socket.

I feel this optimization is risky for future changes, for a code path
that is spending thousands of cycles anyway.

>
> [1] sk_prot_alloc
> https://github.com/torvalds/linux/blob/master/net/core/sock.c#L1693
>
> [2] sk_alloc
> https://github.com/torvalds/linux/blob/master/net/core/sock.c#L1762
>
> [3] sk_clone_lock
> https://github.com/torvalds/linux/blob/master/net/core/sock.c#L1986
>
> [4] struct sock_common
> https://github.com/torvalds/linux/blob/master/include/net/sock.h#L218-L240
>
>
> > In any case, this seems to be a candidate for net-next, this is not
> > fixing a bug,
> > this would be an optimization at most, and potentially adding a bug.
> >
> > So if you resend this patch, you can mention the old commit in the 
> > changelog,
> > but do not add a dubious Fixes: tag
>
> I see.
>
> I will remove the tag and resend this as a net-next candidate.
>
> Thank you,
> Kuniyuki
>
>
> > >
> > > This patch removes the redundant calls of sk_tx_queue_clear() in 
> > > sk_alloc()
> > > and sk_clone_lock().
> > >
> > > Fixes: 41b14fb8724d ("net: Do not clear the sock TX queue in 
> > > sk_set_socket()")
> > > CC: Tariq Toukan 
> > > CC: Boris Pismenny 
> > > Signed-off-by: Kuniyuki Iwashima 
> > > Reviewed-by: Amit Shah 
> > > ---
> > >  net/core/sock.c | 2 --
> > >  1 file changed, 2 deletions(-)
> > >
> > > diff --git a/net/core/sock.c b/net/core/sock.c
> > > index bbcd4b97eddd..5c665ee14159 100644
> > > --- a/net/core/sock.c
> > > +++ b/net/core/sock.c
> > > @@ -1759,7 +1759,6 @@ struct sock *sk_alloc(struct net *net, int family, 
> > > gfp_t priority,
> > > cgroup_sk_alloc(&sk->sk_cgrp_data);
> > > sock_update_classid(&sk->sk_cgrp_data);
> > > sock_update_netprioidx(&sk->sk_cgrp_data);
> > > -   sk_tx_queue_clear(sk);
> > > }
> > >
> > > return sk;
> > > @@ -1983,7 +1982,6 @@ struct sock *sk_clone_lock(const struct sock *sk, 
> > > const gfp_t priority)
> > >  */
> > > sk_refcnt_debug_inc(newsk);
> > > sk_set_socket(newsk, NULL);
> > > -   sk_tx_queue_clear(newsk);
> > > RCU_INIT_POINTER(newsk->sk_wq, NULL);
> > >
> > > if (newsk->sk_prot->sockets_allocated)
> > > --
> > > 2.17.2 (Apple Git-113)
> > >


Re: [PATCH v4] certs: Add EFI_CERT_X509_GUID support for dbx entries

2021-01-27 Thread Eric Snowberg


> On Jan 27, 2021, at 7:03 AM, Mimi Zohar  wrote:
> 
> [Cc'ing linux-integrity]
> 
> On Wed, 2021-01-27 at 11:46 +, David Howells wrote:
>> Jarkko Sakkinen  wrote:
>> 
>>>> I suppose a user space tool could be created. But wouldn’t what is
>>>> currently done in the kernel in this area need to be removed?
>>> 
>>> Right. I don't think this was a great idea in the first place to
>>> do to the kernel but since it exists, I guess the patch does make
>>> sense.
>> 
>> This information needs to be loaded from the UEFI tables before the system
>> starts loading any kernel modules or running any programs (if we do
>> verification of such, which I think IMA can do).
> 
> There needs to be a clear distinction between the pre-boot and post-boot
> keys.  UEFI has its own trust model, which should be limited to UEFI. 
> The .platform keyring was upstreamed and limited to verifying the kexec
> kernel image.   Any other usage of the .platform keyring keys is
> abusing its intended purpose.
> 
> The cover letter says, "Anytime the .platform keyring is used, the
> keys in the .blacklist keyring are referenced; if a matching key is
> found, the key will be rejected."  I don't have a problem with loading
> the UEFI X509 dbx entries as long as its usage is limited to verifying
> the kexec kernel image.

Correct, with my patch, when EFI_CERT_X509_GUID entries are found in the
dbx, they will only be used during kexec.  I believe the latest dbx file on
uefi.org contains three of these entries.

Based on my understanding of why the platform keyring was introduced, 
I intentionally only used these for kexec.  I do question the current 
upstream mainline code though.  Currently, when EFI_CERT_X509_SHA256_GUID
or EFI_CERT_SHA256_GUID entries are found in the dbx, they are applied 
everywhere.  It seems like there should be a dbx revocation keyring 
equivalent to the current platform keyring that is only used for pre-boot. 

If that is a direction you would like to see this go in the future, let
me know; I’d be happy to work on it.



Re: [PATCH net] net: Remove redundant calls of sk_tx_queue_clear().

2021-01-27 Thread Eric Dumazet
On Wed, Jan 27, 2021 at 1:50 PM Kuniyuki Iwashima  wrote:
>
> The commit 41b14fb8724d ("net: Do not clear the sock TX queue in
> sk_set_socket()") removes sk_tx_queue_clear() from sk_set_socket() and adds
> it instead in sk_alloc() and sk_clone_lock() to fix an issue introduced in
> the commit e022f0b4a03f ("net: Introduce sk_tx_queue_mapping"). However,
> the original commit had already put sk_tx_queue_clear() in sk_prot_alloc():
> the callee of sk_alloc() and sk_clone_lock(). Thus sk_tx_queue_clear() is
> called twice in each path currently.

Are you sure ?

I do not clearly see the sk_tx_queue_clear() call from the cloning part.

Please elaborate.

In any case, this seems to be a candidate for net-next, this is not
fixing a bug,
this would be an optimization at most, and potentially adding a bug.

So if you resend this patch, you can mention the old commit in the changelog,
but do not add a dubious Fixes: tag


>
> This patch removes the redundant calls of sk_tx_queue_clear() in sk_alloc()
> and sk_clone_lock().
>
> Fixes: 41b14fb8724d ("net: Do not clear the sock TX queue in sk_set_socket()")
> CC: Tariq Toukan 
> CC: Boris Pismenny 
> Signed-off-by: Kuniyuki Iwashima 
> Reviewed-by: Amit Shah 
> ---
>  net/core/sock.c | 2 --
>  1 file changed, 2 deletions(-)
>
> diff --git a/net/core/sock.c b/net/core/sock.c
> index bbcd4b97eddd..5c665ee14159 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1759,7 +1759,6 @@ struct sock *sk_alloc(struct net *net, int family, 
> gfp_t priority,
> > cgroup_sk_alloc(&sk->sk_cgrp_data);
> > sock_update_classid(&sk->sk_cgrp_data);
> > sock_update_netprioidx(&sk->sk_cgrp_data);
> -   sk_tx_queue_clear(sk);
> }
>
> return sk;
> @@ -1983,7 +1982,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const 
> gfp_t priority)
>  */
> sk_refcnt_debug_inc(newsk);
> sk_set_socket(newsk, NULL);
> -   sk_tx_queue_clear(newsk);
> RCU_INIT_POINTER(newsk->sk_wq, NULL);
>
> if (newsk->sk_prot->sockets_allocated)
> --
> 2.17.2 (Apple Git-113)
>


Re: [PATCH RFC v1 00/15] iommu/virtio: Nested stage support with Arm

2021-01-26 Thread Auger Eric
Hi Vivek,

On 1/21/21 6:34 PM, Vivek Kumar Gautam wrote:
> Hi Eric,
> 
> 
> On 1/19/21 2:33 PM, Auger Eric wrote:
>> Hi Vivek,
>>
>> On 1/15/21 1:13 PM, Vivek Gautam wrote:
>>> This patch-series aims at enabling Nested stage translation in guests
>>> using virtio-iommu as the paravirtualized iommu. The backend is
>>> supported
>>> with Arm SMMU-v3 that provides nested stage-1 and stage-2 translation.
>>>
>>> This series derives its purpose from various efforts happening to add
>>> support for Shared Virtual Addressing (SVA) in host and guest. On Arm,
>>> most of the support for SVA has already landed. The support for nested
>>> stage translation and fault reporting to guest has been proposed [1].
>>> The related changes required in VFIO [2] framework have also been put
>>> forward.
>>>
>>> This series proposes changes in virtio-iommu to program PASID tables
>>> and related stage-1 page tables. A simple iommu-pasid-table library
>>> is added for this purpose that interacts with vendor drivers to
>>> allocate and populate PASID tables.
>>> In Arm SMMUv3 we propose to pull the Context Descriptor (CD) management
>>> code out of the arm-smmu-v3 driver and add that as a glue vendor layer
>>> to support allocating CD tables, and populating them with right values.
>>> These CD tables are essentially the PASID tables and contain stage-1
>>> page table configurations too.
>>> A request to set up these CD tables comes from the virtio-iommu driver
>>> using the iommu-pasid-table library when running on Arm. The virtio-iommu
>>> then passes these PASID tables to the host using the right virtio backend
>>> and support in the VMM.
>>>
>>> For testing we have added necessary support in kvmtool. The changes in
>>> kvmtool are based on virtio-iommu development branch by Jean-Philippe
>>> Brucker [3].
>>>
>>> The tested kernel branch contains following in the order bottom to top
>>> on the git hash -
>>> a) v5.11-rc3
>>> b) arm-smmu-v3 [1] and vfio [2] changes from Eric to add nested page
>>>     table support for Arm.
>>> c) Smmu test engine patches from Jean-Philippe's branch [4]
>>> d) This series
>>> e) Domain nesting info patches [5][6][7].
>>> f) Changes to add arm-smmu-v3 specific nesting info (to be sent to
>>>     the list).
>>>
>>> This kernel is tested on Neoverse reference software stack with
>>> Fixed virtual platform. Public version of the software stack and
>>> FVP is available here[8][9].
>>>
>>> A big thanks to Jean-Philippe for his contributions towards this work
>>> and for his valuable guidance.
>>>
>>> [1]
>>> https://lore.kernel.org/linux-iommu/20201118112151.25412-1-eric.au...@redhat.com/T/
>>>
>>> [2]
>>> https://lore.kernel.org/kvmarm/20201116110030.32335-12-eric.au...@redhat.com/T/
>>>
>>> [3] https://jpbrucker.net/git/kvmtool/log/?h=virtio-iommu/devel
>>> [4] https://jpbrucker.net/git/linux/log/?h=sva/smmute
>>> [5]
>>> https://lore.kernel.org/kvm/1599734733-6431-2-git-send-email-yi.l@intel.com/
>>>
>>> [6]
>>> https://lore.kernel.org/kvm/1599734733-6431-3-git-send-email-yi.l@intel.com/
>>>
>>> [7]
>>> https://lore.kernel.org/kvm/1599734733-6431-4-git-send-email-yi.l@intel.com/
>>>
>>> [8]
>>> https://developer.arm.com/tools-and-software/open-source-software/arm-platforms-software/arm-ecosystem-fvps
>>>
>>> [9]
>>> https://git.linaro.org/landing-teams/working/arm/arm-reference-platforms.git/about/docs/rdn1edge/user-guide.rst
>>>
>>
>> Could you share a public branch where we could find all the kernel
>> pieces?
>>
>> Thank you in advance
> 
> Apologies for the delay. It took a bit of time to sort things out for a
> public branch.
> The branch is available in my github now. Please have a look.
> 
> https://github.com/vivek-arm/linux/tree/5.11-rc3-nested-pgtbl-arm-smmuv3-virtio-iommu

No problem. Thank you for the link.

Best Regards

Eric
> 
> 
> 
> Thanks and regards
> Vivek
> 
>>
>> Best Regards
>>
>> Eric
>>>
>>> Jean-Philippe Brucker (6):
>>>    iommu/virtio: Add headers for table format probing
>>>    iommu/virtio: Add table format probing
>>>    iommu/virtio: Add headers for binding pasid table in iommu
>>>    iommu/virtio: Add support for INVALIDATE request
>>>    iommu/virtio: Attach 

Re: WARNING in pskb_expand_head

2021-01-25 Thread Eric Dumazet
On Mon, Jan 25, 2021 at 10:12 PM Jakub Kicinski  wrote:
>
> CC Willem just in case


This is an old bug, really: tun_napi_alloc_frags() does not make sure
its @len argument is not too big.

Since __skb_grow() does not use __GFP_NOWARN, we end up with this
well-known warning in the mm layer.

I would use the following fix:


diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 
62690baa19bc8c4bf52f8b18a092f570e2125fc8..a0740e40a145fa2e175edd2180d369859c5d786b
100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1367,13 +1367,16 @@ static struct sk_buff
*tun_napi_alloc_frags(struct tun_file *tfile,
if (it->nr_segs > MAX_SKB_FRAGS + 1)
return ERR_PTR(-EMSGSIZE);

+   linear = iov_iter_single_seg_count(it);
+   if (linear > SKB_MAX_ALLOC)
+   return ERR_PTR(-EMSGSIZE);
+
local_bh_disable();
skb = napi_get_frags(&tfile->napi);
local_bh_enable();
if (!skb)
return ERR_PTR(-ENOMEM);

-   linear = iov_iter_single_seg_count(it);
err = __skb_grow(skb, linear);
if (err)
goto free;
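
(For scale: SKB_MAX_ALLOC is SKB_MAX_ORDER(0, 2), i.e. roughly what fits
in a 4-page allocation after the skb_shared_info overhead, about 16 KiB
with 4 KiB pages, so the bounded __skb_grow() stays far below the order
that trips the page allocator warning.)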


>
> On Sun, 24 Jan 2021 12:51:20 -0800 syzbot wrote:
> > Hello,
> >
> > syzbot found the following issue on:
> >
> > HEAD commit:7d68e382 bpf: Permit size-0 datasec
> > git tree:   bpf-next
> > console output: https://syzkaller.appspot.com/x/log.txt?x=132567e750
> > kernel config:  https://syzkaller.appspot.com/x/.config?x=e0c7843b8af99dff
> > dashboard link: https://syzkaller.appspot.com/bug?extid=a1c17e56a8a62294c714
> > compiler:   gcc (GCC) 10.1.0-syz 20200507
> > syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=17ae23af50
> > C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=13856bc750
> >
> > The issue was bisected to:
> >
> > commit 3226b158e67cfaa677fd180152bfb28989cb2fac
> > Author: Eric Dumazet 
> > Date:   Wed Jan 13 16:18:19 2021 +
> >
> > net: avoid 32 x truesize under-estimation for tiny skbs
> >
> > bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=151a302750
> > final oops: https://syzkaller.appspot.com/x/report.txt?x=171a302750
> > console output: https://syzkaller.appspot.com/x/log.txt?x=131a302750
> >
> > IMPORTANT: if you fix the issue, please add the following tag to the commit:
> > Reported-by: syzbot+a1c17e56a8a62294c...@syzkaller.appspotmail.com
> > Fixes: 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny 
> > skbs")
> >
> > RBP:  R08: 0001 R09: 01bb
> > R10:  R11: 0246 R12: 
> > R13: 0004 R14:  R15: 
> > [ cut here ]
> > WARNING: CPU: 1 PID: 8703 at mm/page_alloc.c:4976 
> > __alloc_pages_nodemask+0x5f8/0x730 mm/page_alloc.c:5011
> > Modules linked in:
> > CPU: 1 PID: 8703 Comm: syz-executor857 Not tainted 5.11.0-rc3-syzkaller #0
> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
> > Google 01/01/2011
> > RIP: 0010:__alloc_pages_nodemask+0x5f8/0x730 mm/page_alloc.c:4976
> > Code: 00 00 0c 00 0f 85 a7 00 00 00 8b 3c 24 4c 89 f2 44 89 e6 c6 44 24 70 
> > 00 48 89 6c 24 58 e8 d0 d7 ff ff 49 89 c5 e9 ea fc ff ff <0f> 0b e9 b5 fd 
> > ff ff 89 74 24 14 4c 89 4c 24 08 4c 89 74 24 18 e8
> > RSP: 0018:c90001ecf910 EFLAGS: 00010246
> > RAX:  RBX: 1920003d9f26 RCX: 
> > RDX:  RSI: dc00 RDI: 00060a20
> > RBP: 00020a20 R08:  R09: 0001
> > R10: 86f1be3c R11:  R12: 0012
> > R13: 20010300 R14: 00060a20 R15: 
> > FS:  01148880() GS:8880b9f0() knlGS:
> > CS:  0010 DS:  ES:  CR0: 80050033
> > CR2: 006d5090 CR3: 1d414000 CR4: 001506e0
> > DR0:  DR1:  DR2: 
> > DR3:  DR6: fffe0ff0 DR7: 0400
> > Call Trace:
> >  __alloc_pages include/linux/gfp.h:511 [inline]
> >  __alloc_pages_node include/linux/gfp.h:524 [inline]
> >  alloc_pages_node include/linux/gfp.h:538 [inline]
> >  kmalloc_large_node+0x60/0x110 mm/slub.c:3984
> >  __kmalloc_node_track_caller+0x319/0x3f0 mm/slub.c:4481
> >  __kmalloc_reserve net/core/skbuff.c:150 [inline]
> >  pskb_expand_head+0xae9/0x1050 net/core/skbuff.c:1632
> >  __skb_grow include/linux/skbuff.h:2748 [inline]
> >  tun_napi_alloc_frags drivers/net/tun.c

Re: [PATCH net] tcp: make TCP_USER_TIMEOUT accurate for zero window probes

2021-01-22 Thread Eric Dumazet
On Fri, Jan 22, 2021 at 8:13 PM Enke Chen  wrote:
>
> From: Enke Chen 
>
> The TCP_USER_TIMEOUT is checked by the 0-window probe timer. As the
> timer has backoff with a max interval of about two minutes, the
> actual timeout for TCP_USER_TIMEOUT can be off by up to two minutes.
>
> In this patch the TCP_USER_TIMEOUT is made more accurate by taking it
> into account when computing the timer value for the 0-window probes.
>
> This patch is similar to the one that made TCP_USER_TIMEOUT accurate for
> RTOs in commit b701a99e431d ("tcp: Add tcp_clamp_rto_to_user_timeout()
> helper to improve accuracy").
>
> Signed-off-by: Enke Chen 
> Reviewed-by: Neal Cardwell 
> ---

SGTM, thanks !

Signed-off-by: Eric Dumazet 
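
For reference, the clamping approach described in the changelog could look
roughly like this, modeled on the existing tcp_clamp_rto_to_user_timeout()
helper (a sketch only; the icsk_probes_tstamp field and the exact rounding
are illustrative assumptions, not necessarily the literal patch):

static u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	u32 remaining, user_timeout;
	s32 elapsed;

	user_timeout = icsk->icsk_user_timeout;		/* in msecs */
	if (!user_timeout || !icsk->icsk_probes_tstamp)
		return when;

	/* Time already spent probing the zero window. */
	elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp;
	if (elapsed < 0)
		elapsed = 0;
	if ((u32)elapsed >= msecs_to_jiffies(user_timeout))
		return TCP_TIMEOUT_MIN;	/* deadline passed, fire ASAP */

	/* Never arm the probe timer past the remaining user timeout. */
	remaining = msecs_to_jiffies(user_timeout) - elapsed;
	return min_t(u32, remaining, when);
}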


[PATCH v5 4/4] integrity: Load mokx variables into the blacklist keyring

2021-01-22 Thread Eric Snowberg
During boot the Secure Boot Forbidden Signature Database, dbx,
is loaded into the blacklist keyring.  Systems booted with shim
have an equivalent Forbidden Signature Database called mokx.
Currently mokx is only used by shim and grub; its contents are
ignored by the kernel.

Add the ability to load mokx into the blacklist keyring during boot.

Signed-off-by: Eric Snowberg 
Suggested-by: James Bottomley 
---
 security/integrity/platform_certs/load_uefi.c | 20 +--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/security/integrity/platform_certs/load_uefi.c 
b/security/integrity/platform_certs/load_uefi.c
index ee4b4c666854..f290f78c3f30 100644
--- a/security/integrity/platform_certs/load_uefi.c
+++ b/security/integrity/platform_certs/load_uefi.c
@@ -132,8 +132,9 @@ static int __init load_moklist_certs(void)
 static int __init load_uefi_certs(void)
 {
efi_guid_t secure_var = EFI_IMAGE_SECURITY_DATABASE_GUID;
-   void *db = NULL, *dbx = NULL;
-   unsigned long dbsize = 0, dbxsize = 0;
+   efi_guid_t mok_var = EFI_SHIM_LOCK_GUID;
+   void *db = NULL, *dbx = NULL, *mokx = NULL;
+   unsigned long dbsize = 0, dbxsize = 0, mokxsize = 0;
efi_status_t status;
int rc = 0;
 
@@ -175,6 +176,21 @@ static int __init load_uefi_certs(void)
kfree(dbx);
}
 
+   mokx = get_cert_list(L"MokListXRT", &mok_var, &mokxsize, &status);
+   if (!mokx) {
+   if (status == EFI_NOT_FOUND)
+   pr_debug("mokx variable wasn't found\n");
+   else
+   pr_info("Couldn't get mokx list\n");
+   } else {
+   rc = parse_efi_signature_list("UEFI:MokListXRT",
+ mokx, mokxsize,
+ get_handler_for_dbx);
+   if (rc)
+   pr_err("Couldn't parse mokx signatures %d\n", rc);
+   kfree(mokx);
+   }
+
/* Load the MokListRT certs */
rc = load_moklist_certs();
 
-- 
2.18.4
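
As a quick runtime sanity check (illustrative, assuming procfs):
/proc/keys lists every key in the system, so after boot

	grep blacklist /proc/keys

shows the .blacklist keyring and its hash entries; certificates loaded
from MokListXRT are linked into the same keyring as asymmetric keys.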



[PATCH v5 1/4] certs: Add EFI_CERT_X509_GUID support for dbx entries

2021-01-22 Thread Eric Snowberg
This fixes CVE-2020-26541.

The Secure Boot Forbidden Signature Database, dbx, contains a list of now
revoked signatures and keys previously approved to boot with UEFI Secure
Boot enabled.  The dbx is capable of containing any number of
EFI_CERT_X509_SHA256_GUID, EFI_CERT_SHA256_GUID, and EFI_CERT_X509_GUID
entries.

Currently, when EFI_CERT_X509_GUID entries are contained in the dbx, they
are skipped.

Add support for EFI_CERT_X509_GUID dbx entries. When an EFI_CERT_X509_GUID
is found, it is added as an asymmetric key to the .blacklist keyring.
Anytime the .platform keyring is used, the keys in the .blacklist keyring
are referenced; if a matching key is found, the key will be rejected.

Signed-off-by: Eric Snowberg 
Reviewed-by: Jarkko Sakkinen 
Signed-off-by: David Howells 
---
v5: Function name changes done by David Howells
---
 certs/blacklist.c | 32 +++
 certs/blacklist.h | 12 +++
 certs/system_keyring.c|  6 
 include/keys/system_keyring.h | 11 +++
 .../platform_certs/keyring_handler.c  | 11 +++
 5 files changed, 72 insertions(+)

diff --git a/certs/blacklist.c b/certs/blacklist.c
index 6514f9ebc943..a7f021878a4b 100644
--- a/certs/blacklist.c
+++ b/certs/blacklist.c
@@ -100,6 +100,38 @@ int mark_hash_blacklisted(const char *hash)
return 0;
 }
 
+int add_key_to_revocation_list(const char *data, size_t size)
+{
+   key_ref_t key;
+
+   key = key_create_or_update(make_key_ref(blacklist_keyring, true),
+  "asymmetric",
+  NULL,
+  data,
+  size,
+  ((KEY_POS_ALL & ~KEY_POS_SETATTR) | 
KEY_USR_VIEW),
+  KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN);
+
+   if (IS_ERR(key)) {
+   pr_err("Problem with revocation key (%ld)\n", PTR_ERR(key));
+   return PTR_ERR(key);
+   }
+
+   return 0;
+}
+
+int is_key_on_revocation_list(struct pkcs7_message *pkcs7)
+{
+   int ret;
+
+   ret = validate_trust(pkcs7, blacklist_keyring);
+
+   if (ret == 0)
+   return -EKEYREJECTED;
+
+   return -ENOKEY;
+}
+
 /**
  * is_hash_blacklisted - Determine if a hash is blacklisted
  * @hash: The hash to be checked as a binary blob
diff --git a/certs/blacklist.h b/certs/blacklist.h
index 1efd6fa0dc60..420bb7c86e07 100644
--- a/certs/blacklist.h
+++ b/certs/blacklist.h
@@ -1,3 +1,15 @@
 #include 
+#include 
+#include 
 
 extern const char __initconst *const blacklist_hashes[];
+
+#ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING
+#define validate_trust pkcs7_validate_trust
+#else
+static inline int validate_trust(struct pkcs7_message *pkcs7,
+struct key *trust_keyring)
+{
+   return -ENOKEY;
+}
+#endif
diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index 798291177186..cc165b359ea3 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -241,6 +241,12 @@ int verify_pkcs7_message_sig(const void *data, size_t len,
pr_devel("PKCS#7 platform keyring is not available\n");
goto error;
}
+
+   ret = is_key_on_revocation_list(pkcs7);
+   if (ret != -ENOKEY) {
+   pr_devel("PKCS#7 platform key is on revocation list\n");
+   goto error;
+   }
}
ret = pkcs7_validate_trust(pkcs7, trusted_keys);
if (ret < 0) {
diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h
index fb8b07daa9d1..61f98739e8b1 100644
--- a/include/keys/system_keyring.h
+++ b/include/keys/system_keyring.h
@@ -31,11 +31,14 @@ extern int restrict_link_by_builtin_and_secondary_trusted(
 #define restrict_link_by_builtin_and_secondary_trusted 
restrict_link_by_builtin_trusted
 #endif
 
+extern struct pkcs7_message *pkcs7;
 #ifdef CONFIG_SYSTEM_BLACKLIST_KEYRING
 extern int mark_hash_blacklisted(const char *hash);
+extern int add_key_to_revocation_list(const char *data, size_t size);
 extern int is_hash_blacklisted(const u8 *hash, size_t hash_len,
   const char *type);
 extern int is_binary_blacklisted(const u8 *hash, size_t hash_len);
+extern int is_key_on_revocation_list(struct pkcs7_message *pkcs7);
 #else
 static inline int is_hash_blacklisted(const u8 *hash, size_t hash_len,
  const char *type)
@@ -47,6 +50,14 @@ static inline int is_binary_blacklisted(const u8 *hash, 
size_t hash_len)
 {
return 0;
 }
+static inline int add_key_to_revocation_list(const char *data, size_t size)
+{
+   return 0;
+}
+static inline int is_key_on_revocation_list(struct pkcs7_message *pkcs7)
+{
+   return -ENOKEY;
+}
 #endif
 
 #ifdef CONFIG_IMA_BL

[PATCH v5 0/4] Add EFI_CERT_X509_GUID support for dbx/mokx entries

2021-01-22 Thread Eric Snowberg
This is the fifth patch series for adding support for 
EFI_CERT_X509_GUID entries [1].  It has been expanded to not only include
dbx entries but also entries in the mokx.  Additionally, my series to
preload these certificates [2] has also been included.

This series is based on v5.11-rc4.

[1] 
https://patchwork.kernel.org/project/linux-security-module/patch/20200916004927.64276-1-eric.snowb...@oracle.com/
[2] https://lore.kernel.org/patchwork/cover/1315485/

Eric Snowberg (4):
  certs: Add EFI_CERT_X509_GUID support for dbx entries
  certs: Move load_system_certificate_list to a common function
  certs: Add ability to preload revocation certs
  integrity: Load mokx variables into the blacklist keyring

 certs/Kconfig |  8 +++
 certs/Makefile| 20 ++-
 certs/blacklist.c | 49 
 certs/blacklist.h | 12 
 certs/common.c| 56 +++
 certs/common.h|  9 +++
 certs/revocation_certificates.S   | 21 +++
 certs/system_keyring.c| 55 +++---
 include/keys/system_keyring.h | 11 
 scripts/Makefile  |  1 +
 .../platform_certs/keyring_handler.c  | 11 
 security/integrity/platform_certs/load_uefi.c | 20 ++-
 12 files changed, 222 insertions(+), 51 deletions(-)
 create mode 100644 certs/common.c
 create mode 100644 certs/common.h
 create mode 100644 certs/revocation_certificates.S


base-commit: 19c329f6808995b142b3966301f217c831e7cf31
-- 
2.18.4



[PATCH v5 3/4] certs: Add ability to preload revocation certs

2021-01-22 Thread Eric Snowberg
Add a new Kconfig option called SYSTEM_REVOCATION_KEYS. If set,
this option should be the filename of a PEM-formatted file containing
X.509 certificates to be included in the default blacklist keyring.

Signed-off-by: Eric Snowberg 
Acked-by: Jarkko Sakkinen 
---
 certs/Kconfig   |  8 
 certs/Makefile  | 18 --
 certs/blacklist.c   | 17 +
 certs/revocation_certificates.S | 21 +
 scripts/Makefile|  1 +
 5 files changed, 63 insertions(+), 2 deletions(-)
 create mode 100644 certs/revocation_certificates.S

diff --git a/certs/Kconfig b/certs/Kconfig
index c94e93d8bccf..379a6e198459 100644
--- a/certs/Kconfig
+++ b/certs/Kconfig
@@ -83,4 +83,12 @@ config SYSTEM_BLACKLIST_HASH_LIST
  wrapper to incorporate the list into the kernel.  Each  should
  be a string of hex digits.
 
+config SYSTEM_REVOCATION_KEYS
+   string "X.509 certificates to be preloaded into the system blacklist 
keyring"
+   depends on SYSTEM_BLACKLIST_KEYRING
+   help
+ If set, this option should be the filename of a PEM-formatted file
+ containing X.509 certificates to be included in the default blacklist
+ keyring.
+
 endmenu
diff --git a/certs/Makefile b/certs/Makefile
index f4b90bad8690..e3f4926fd21e 100644
--- a/certs/Makefile
+++ b/certs/Makefile
@@ -4,7 +4,7 @@
 #
 
 obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o 
common.o
-obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist.o
+obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist.o 
revocation_certificates.o common.o
 ifneq ($(CONFIG_SYSTEM_BLACKLIST_HASH_LIST),"")
 obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist_hashes.o
 else
@@ -29,7 +29,7 @@ $(obj)/x509_certificate_list: scripts/extract-cert 
$(SYSTEM_TRUSTED_KEYS_SRCPREF
$(call 
if_changed,extract_certs,$(SYSTEM_TRUSTED_KEYS_SRCPREFIX)$(CONFIG_SYSTEM_TRUSTED_KEYS))
 endif # CONFIG_SYSTEM_TRUSTED_KEYRING
 
-clean-files := x509_certificate_list .x509.list
+clean-files := x509_certificate_list .x509.list x509_revocation_list
 
 ifeq ($(CONFIG_MODULE_SIG),y)
 ###
@@ -104,3 +104,17 @@ targets += signing_key.x509
 $(obj)/signing_key.x509: scripts/extract-cert $(X509_DEP) FORCE
$(call 
if_changed,extract_certs,$(MODULE_SIG_KEY_SRCPREFIX)$(CONFIG_MODULE_SIG_KEY))
 endif # CONFIG_MODULE_SIG
+
+ifeq ($(CONFIG_SYSTEM_BLACKLIST_KEYRING),y)
+
+$(eval $(call config_filename,SYSTEM_REVOCATION_KEYS))
+
+$(obj)/revocation_certificates.o: $(obj)/x509_revocation_list
+
+quiet_cmd_extract_certs  = EXTRACT_CERTS   $(patsubst "%",%,$(2))
+  cmd_extract_certs  = scripts/extract-cert $(2) $@
+
+targets += x509_revocation_list
+$(obj)/x509_revocation_list: scripts/extract-cert 
$(SYSTEM_REVOCATION_KEYS_SRCPREFIX)$(SYSTEM_REVOCATION_KEYS_FILENAME) FORCE
+   $(call 
if_changed,extract_certs,$(SYSTEM_REVOCATION_KEYS_SRCPREFIX)$(CONFIG_SYSTEM_REVOCATION_KEYS))
+endif
diff --git a/certs/blacklist.c b/certs/blacklist.c
index a7f021878a4b..4e8a1068adb2 100644
--- a/certs/blacklist.c
+++ b/certs/blacklist.c
@@ -16,9 +16,13 @@
 #include 
 #include 
 #include "blacklist.h"
+#include "common.h"
 
 static struct key *blacklist_keyring;
 
+extern __initconst const u8 revocation_certificate_list[];
+extern __initconst const unsigned long revocation_certificate_list_size;
+
 /*
  * The description must be a type prefix, a colon and then an even number of
  * hex digits.  The hash is kept in the description.
@@ -209,3 +213,16 @@ static int __init blacklist_init(void)
  * Must be initialised before we try and load the keys into the keyring.
  */
 device_initcall(blacklist_init);
+
+/*
+ * Load the compiled-in list of revocation X.509 certificates.
+ */
+static __init int load_revocation_certificate_list(void)
+{
+   if (revocation_certificate_list_size)
+   pr_notice("Loading compiled-in revocation X.509 
certificates\n");
+
+   return load_certificate_list(revocation_certificate_list, 
revocation_certificate_list_size,
+blacklist_keyring);
+}
+late_initcall(load_revocation_certificate_list);
diff --git a/certs/revocation_certificates.S b/certs/revocation_certificates.S
new file mode 100644
index ..f21aae8a8f0e
--- /dev/null
+++ b/certs/revocation_certificates.S
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include 
+#include 
+
+   __INITRODATA
+
+   .align 8
+   .globl revocation_certificate_list
+revocation_certificate_list:
+__revocation_list_start:
+   .incbin "certs/x509_revocation_list"
+__revocation_list_end:
+
+   .align 8
+   .globl revocation_certificate_list_size
+revocation_certificate_list_size:
+#ifdef CONFIG_64BIT
+   .quad __revocation_list_end - __revocation_list_start
+#else
+   
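
Usage-wise, enabling the preload is just a matter of pointing the new
option at a PEM file at build time, e.g. (the path here is illustrative):

	CONFIG_SYSTEM_BLACKLIST_KEYRING=y
	CONFIG_SYSTEM_REVOCATION_KEYS="certs/revocation.pem"

scripts/extract-cert then turns the PEM file into the
x509_revocation_list blob that revocation_certificates.S pulls in via
.incbin.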

[PATCH v5 2/4] certs: Move load_system_certificate_list to a common function

2021-01-22 Thread Eric Snowberg
Move functionality within load_system_certificate_list to a common
function, so it can be reused in the future.

Signed-off-by: Eric Snowberg 
Acked-by: Jarkko Sakkinen 
---
 certs/Makefile |  2 +-
 certs/common.c | 56 ++
 certs/common.h |  9 +++
 certs/system_keyring.c | 49 +++-
 4 files changed, 69 insertions(+), 47 deletions(-)
 create mode 100644 certs/common.c
 create mode 100644 certs/common.h

diff --git a/certs/Makefile b/certs/Makefile
index f4c25b67aad9..f4b90bad8690 100644
--- a/certs/Makefile
+++ b/certs/Makefile
@@ -3,7 +3,7 @@
 # Makefile for the linux kernel signature checking certificates.
 #
 
-obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
+obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o 
common.o
 obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist.o
 ifneq ($(CONFIG_SYSTEM_BLACKLIST_HASH_LIST),"")
 obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist_hashes.o
diff --git a/certs/common.c b/certs/common.c
new file mode 100644
index ..83800f51a1a1
--- /dev/null
+++ b/certs/common.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include 
+#include 
+
+int load_certificate_list(const u8 cert_list[],
+ const unsigned long list_size,
+ const struct key *keyring)
+{
+   key_ref_t key;
+   const u8 *p, *end;
+   size_t plen;
+
+   p = cert_list;
+   end = p + list_size;
+   while (p < end) {
+   /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
+* than 256 bytes in size.
+*/
+   if (end - p < 4)
+   goto dodgy_cert;
+   if (p[0] != 0x30 &&
+   p[1] != 0x82)
+   goto dodgy_cert;
+   plen = (p[2] << 8) | p[3];
+   plen += 4;
+   if (plen > end - p)
+   goto dodgy_cert;
+
+   key = key_create_or_update(make_key_ref(keyring, 1),
+  "asymmetric",
+  NULL,
+  p,
+  plen,
+  ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+  KEY_USR_VIEW | KEY_USR_READ),
+  KEY_ALLOC_NOT_IN_QUOTA |
+  KEY_ALLOC_BUILT_IN |
+  KEY_ALLOC_BYPASS_RESTRICTION);
+   if (IS_ERR(key)) {
+   pr_err("Problem loading in-kernel X.509 certificate 
(%ld)\n",
+  PTR_ERR(key));
+   } else {
+   pr_notice("Loaded X.509 cert '%s'\n",
+ key_ref_to_ptr(key)->description);
+   key_ref_put(key);
+   }
+   p += plen;
+   }
+
+   return 0;
+
+dodgy_cert:
+   pr_err("Problem parsing in-kernel X.509 certificate list\n");
+   return 0;
+}
diff --git a/certs/common.h b/certs/common.h
new file mode 100644
index ..abdb5795936b
--- /dev/null
+++ b/certs/common.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _CERT_COMMON_H
+#define _CERT_COMMON_H
+
+int load_certificate_list(const u8 cert_list[], const unsigned long list_size,
+ const struct key *keyring);
+
+#endif
diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index cc165b359ea3..a44a8915c94c 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include "common.h"
 
 static struct key *builtin_trusted_keys;
 #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING
@@ -136,54 +137,10 @@ device_initcall(system_trusted_keyring_init);
  */
 static __init int load_system_certificate_list(void)
 {
-   key_ref_t key;
-   const u8 *p, *end;
-   size_t plen;
-
pr_notice("Loading compiled-in X.509 certificates\n");
 
-   p = system_certificate_list;
-   end = p + system_certificate_list_size;
-   while (p < end) {
-   /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
-* than 256 bytes in size.
-*/
-   if (end - p < 4)
-   goto dodgy_cert;
-   if (p[0] != 0x30 &&
-   p[1] != 0x82)
-   goto dodgy_cert;
-   plen = (p[2] << 8) | p[3];
-   plen += 4;
-   if (plen > end - p)
-   goto dodgy_cert;
-
-   key = key_create_or_update(make_key_ref(builtin_
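
After the move, the boot-time loader reduces to a thin wrapper around the
new helper.  A sketch of the expected shape (assuming the remaining
deleted lines mirror the block now living in certs/common.c):

static __init int load_system_certificate_list(void)
{
	pr_notice("Loading compiled-in X.509 certificates\n");

	return load_certificate_list(system_certificate_list,
				     system_certificate_list_size,
				     builtin_trusted_keys);
}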

Re: [RFC PATCH v3 1/8] Use refcount_t for ucounts reference counting

2021-01-21 Thread Eric W. Biederman
Alexey Gladkov  writes:

> On Tue, Jan 19, 2021 at 07:57:36PM -0600, Eric W. Biederman wrote:
>> Alexey Gladkov  writes:
>> 
>> > On Mon, Jan 18, 2021 at 12:34:29PM -0800, Linus Torvalds wrote:
>> >> On Mon, Jan 18, 2021 at 11:46 AM Alexey Gladkov
>> >>  wrote:
>> >> >
>> >> > Sorry about that. I thought that this code is not needed when switching
>> >> > from int to refcount_t. I was wrong.
>> >> 
>> >> Well, you _may_ be right. I personally didn't check how the return
>> >> value is used.
>> >> 
>> >> I only reacted to "it certainly _may_ be used, and there is absolutely
>> >> no comment anywhere about why it wouldn't matter".
>> >
>> > I have not found examples where the overflow is checked after calling
>> > refcount_inc/refcount_add.
>> >
>> > For example in kernel/fork.c:2298 :
>> >
>> >current->signal->nr_threads++;   
>> >atomic_inc(&current->signal->live);
>> >refcount_inc(&current->signal->sigcnt);
>> >
>> > $ semind search signal_struct.sigcnt
>> > def include/linux/sched/signal.h:83   refcount_t  
>> > sigcnt;
>> > m-- kernel/fork.c:723 put_signal_struct   if 
>> > (refcount_dec_and_test(&sig->sigcnt))
>> > m-- kernel/fork.c:1571 copy_signal 
>> > refcount_set(&sig->sigcnt, 1);
>> > m-- kernel/fork.c:2298 copy_process
>> > refcount_inc(&current->signal->sigcnt);
>> >
>> > It seems to me that the only way is to use __refcount_inc and then compare
>> > the old value with REFCOUNT_MAX
>> >
>> > Since I have not seen examples of such checks, I thought that this is
>> > acceptable. Sorry once again. I have not tried to hide these changes.
>> 
>> The current ucount code does check for overflow and fails the increment
>> in every case.
>> 
>> So arguably it will be a regression and inferior error handling behavior
>> if the code switches to the ``better'' refcount_t data structure.
>> 
>> I originally didn't use refcount_t because silently saturating and not
>> bothering to handle the error makes me uncomfortable.
>> 
>> Not having to acquire the ucounts_lock every time seems nice.  Perhaps
>> the path forward would be to start with stupid/correct code that always
>> takes the ucounts_lock for every increment of ucounts->count, that is
>> later replaced with something more optimal.
>> 
>> Not impacting performance in the non-namespace cases and having good
>> performance in the other cases is a fundamental requirement of merging
>> code like this.
>
> Did I understand your suggestion correctly that you suggest to use
> spin_lock for atomic_read and atomic_inc?
>
> If so, then we are already incrementing the counter under ucounts_lock.
>
>   ...
>   if (atomic_read(&ucounts->count) == INT_MAX)
>   ucounts = NULL;
>   else
>   atomic_inc(&ucounts->count);
>   spin_unlock_irq(&ucounts_lock);
>   return ucounts;
>
> something like this ?

Yes.  But without atomics.  Something a bit more like:
>   ...
>   if (ucounts->count == INT_MAX)
>   ucounts = NULL;
>   else
>   ucounts->count++;
>   spin_unlock_irq(&ucounts_lock);
>   return ucounts;

I do believe at some point we will want to say that using the spin_lock
for ucounts->count is cumbersome and suboptimal, and we will want to
change it to get a better-performing implementation.

Just for getting the semantics correct we should be able to use just
ucounts_lock for locking.  Then when everything is working we can
profile and optimize the code.

I just don't want figuring out what is needed to get hung up on little
details that we can change later.

Eric
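
Put together as a self-contained function, that suggestion reads roughly
like this (a sketch assuming ucounts->count becomes a plain int guarded
by ucounts_lock; not merged code):

static struct ucounts *get_ucounts(struct ucounts *ucounts)
{
	spin_lock_irq(&ucounts_lock);
	if (ucounts->count == INT_MAX)
		ucounts = NULL;		/* refuse to overflow; fail the get */
	else
		ucounts->count++;
	spin_unlock_irq(&ucounts_lock);
	return ucounts;
}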



[RFC][PATCH] apparmor: Enforce progressively tighter permissions for no_new_privs

2021-01-20 Thread Eric W. Biederman


The current understanding of apparmor with respect to no_new_privs is at
odds with how no_new_privs is implemented and understood by the rest of
the kernel.

The documentation of no_new_privs states:
> With ``no_new_privs`` set, ``execve()`` promises not to grant the
> privilege to do anything that could not have been done without the
> execve call.

And reading through the kernel, except for apparmor, that description
matches what is implemented.

There are two major divergences of apparmor from this definition:
- proc_setattr enforces limitations when no_new_privs is set.
- the limitation is enforced from the apparent time when no_new_privs is
  set instead of guaranteeing that each execve has progressively
  narrower permissions.

The code in apparmor that attempts to discover the apparmor label at the
point where no_new_privs is set is not robust.  The capture happens a
long time after no_new_privs is set.

Capturing the label at the point where no_new_privs is set is
practically impossible to implement robustly.  Today the rule is that a
struct cred can only be changed by its current task.  Today
SECCOMP_FILTER_FLAG_TSYNC sets no_new_privs from another thread.  A
robust implementation would require changing something fundamental in
how creds are managed for SECCOMP_FILTER_FLAG_TSYNC to be able to
capture the cred at the point it is set.

Furthermore, given the consistent documentation and how everything else
implements no_new_privs, not having the permissions get progressively
tighter is a footgun aimed at userspace.  I fully expect it to break any
security sensitive software that uses no_new_privs and was not
deliberately designed and tested against apparmor.

Avoid the questionable and hard-to-fix implementation and the
potential to confuse userspace by having no_new_privs enforce
progressively tighter permissions.

Fixes: 9fcf78cca198 ("apparmor: update domain transitions that are subsets of 
confinement at nnp")
Signed-off-by: Eric W. Biederman 
---

I came across this while examining the places cred_guard_mutex is
used and trying to find a way to make those code paths less insane.

If it would be more palatable I would not mind removing the
task_no_new_privs test entirely from aa_change_hat and aa_change_profile
as those are not part of exec, so arguably no_new_privs should not care
about them at all.

Can we please get rid of the huge semantic wart and pain in the implementation?
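
To make the userspace contract concrete, this is the promise programs
rely on (a minimal illustrative snippet; the target binary is a
placeholder):

	#include <stdio.h>
	#include <sys/prctl.h>
	#include <unistd.h>

	int main(int argc, char *argv[])
	{
		if (argc < 2)
			return 1;
		/* From this point on, no execve() in this task or its
		 * descendants may grant privileges the caller does not
		 * already have (setuid bits, file capabilities, ...).
		 */
		if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
			perror("prctl");
			return 1;
		}
		execv(argv[1], &argv[1]);	/* e.g. a setuid binary runs unprivileged */
		perror("execv");
		return 1;
	}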

 security/apparmor/domain.c   | 39 
 security/apparmor/include/task.h |  4 
 security/apparmor/task.c |  7 --
 3 files changed, 4 insertions(+), 46 deletions(-)

diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index f919ebd042fd..8f77059bf890 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -869,17 +869,6 @@ int apparmor_bprm_creds_for_exec(struct linux_binprm *bprm)
 
label = aa_get_newest_label(cred_label(bprm->cred));
 
-   /*
-* Detect no new privs being set, and store the label it
-* occurred under. Ideally this would happen when nnp
-* is set but there isn't a good way to do that yet.
-*
-* Testing for unconfined must be done before the subset test
-*/
-   if ((bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS) && !unconfined(label) &&
-   !ctx->nnp)
-   ctx->nnp = aa_get_label(label);
-
/* buffer freed below, name is pointer into buffer */
buffer = aa_get_buffer(false);
if (!buffer) {
@@ -915,7 +904,7 @@ int apparmor_bprm_creds_for_exec(struct linux_binprm *bprm)
 */
if ((bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS) &&
!unconfined(label) &&
-   !aa_label_is_unconfined_subset(new, ctx->nnp)) {
+   !aa_label_is_unconfined_subset(new, label)) {
error = -EPERM;
info = "no new privs";
goto audit;
@@ -1158,16 +1147,6 @@ int aa_change_hat(const char *hats[], int count, u64 
token, int flags)
label = aa_get_newest_cred_label(cred);
previous = aa_get_newest_label(ctx->previous);
 
-   /*
-* Detect no new privs being set, and store the label it
-* occurred under. Ideally this would happen when nnp
-* is set but there isn't a good way to do that yet.
-*
-* Testing for unconfined must be done before the subset test
-*/
-   if (task_no_new_privs(current) && !unconfined(label) && !ctx->nnp)
-   ctx->nnp = aa_get_label(label);
-
if (unconfined(label)) {
info = "unconfined can not change_hat";
error = -EPERM;
@@ -1193,7 +1172,7 @@ int aa_change_hat(const char *hats[], int count, u64 
token, int flags)
 * reduce restrictions.
 */
  

Re: [RFC][PATCH] apparmor: Enforce progressively tighter permissions for no_new_privs

2021-01-20 Thread Eric W. Biederman


TL;DR selinux and apparmor ignore no_new_privs

What?


John Johansen  writes:

> On 1/20/21 1:26 PM, Eric W. Biederman wrote:
>> 
>> The current understanding of apparmor with respect to no_new_privs is at
>> odds with how no_new_privs is implemented and understood by the rest of
>> the kernel.
>> 
>> The documentation of no_new_privs states:
>>> With ``no_new_privs`` set, ``execve()`` promises not to grant the
>>> privilege to do anything that could not have been done without the
>>> execve call.
>> 
>> And reading through the kernel, except for apparmor, that description
>> matches what is implemented.
>> 
>
> That is not correct.
>
> commit 7b0d0b40cd78 ("selinux: Permit bounded transitions under
> NO_NEW_PRIVS or NOSUID.")
>
> Allows for bound transitions under selinux
> and

As I understand a bound transition it is a transition to a state with
a set of permissions that are a subset of what was previously held.
Which is consistent with the mandate of no_new_privs.

> commit af63f4193f9f selinux: Generalize support for NNP/nosuid SELinux
> domain transitions
>
> goes further and "Decouple NNP/nosuid from SELinux transitions".

Yes.  Looking at that commit I do see that selinux appears to be
deliberately ignoring no_new_privs in specific cases.

WTF.

>> There are two major divergences of apparmor from this definition:
>> - proc_setattr enforces limitations when no_new_privs is set.
>> - the limitation is enforced from the apparent time when no_new_privs is
>>   set instead of guaranteeing that each execve has progressively
>>   narrower permissions.
>> 
>> The code in apparmor that attempts to discover the apparmor label at the
>> point where no_new_privs is set is not robust.  The capture happens a
>> long time after no_new_privs is set.
>> 
>
> yes, but that shouldn't matter. As apparmor has not changed its label
> at any point between when no_new_privs was set and when the check is
> done. AppArmor is attempting to change it label, and if it finds NNP
> has been set we capture what the confinement was.
>
>> Capturing the label at the point where no_new_privs is set is
>> practically impossible to implement robustly.  Today the rule is that a
>> struct cred can only be changed by its current task.  Today
>
> right, and apparmor only ever has the task update its own label.
>
>> SECCOMP_FILTER_FLAG_TSYNC sets no_new_privs from another thread.  A
>> robust implementation would require changing something fundamental in
>> how creds are managed for SECCOMP_FILTER_FLAG_TSYNC to be able to
>> capture the cred at the point it is set.
>> 
> I am open to supporting something like that.

I can't see how it would be possible to be robust without completely
changing the locking.  Locking that right now in a simpler model we have
not figured out how to make obviously correct.

>> Furthermore, given the consistent documentation and how everything else
>> implements no_new_privs, not having the permissions get progressively
>
> Again see above

Except where selinux deliberately ignores no_new_privs this is
consistent.

>> tighter is a footgun aimed at userspace.  I fully expect it to break any
>
> tighter is somewhat relative, nor is it only progressively tighter; it
> is bounded against the snapshot of the label that was on the task.

Which is the BUG I am reporting.  It should be progressively tighter.

>> security sensitive software that uses no_new_privs and was not
>> deliberately designed and tested against apparmor.
>> 
>
> Currently the situation has become an either-or choice between
> the LSM and NNP. We are trying to strike a balance. Ideally apparmor
> would like to do something similar to selinux and decouple the label
> transition from NNP and nosuid via an internal capability, but we
> have not gone there yet.

Why do you need to escape no_new_privs?  Why does anyone need to escape
no_new_privs?

>> Avoid the questionable and hard-to-fix implementation and the
>> potential to confuse userspace by having no_new_privs enforce
>> progressively tighter permissions.
>> 
>
> This would completely break several use cases.

Enforcing no_new_privs as documented would break userspace?

Isn't the opposite true that you are breaking people by not enforcing
it?

>> Fixes: 9fcf78cca198 ("apparmor: update domain transitions that are subsets 
>> of confinement at nnp")
>> Signed-off-by: Eric W. Biederman 
>> ---
>> 
>> I came across this while examining the places cred_guard_mutex is
>> used and trying to find a way to make those code paths less insane.
>> 

Re: [PATCH v4] certs: Add EFI_CERT_X509_GUID support for dbx entries

2021-01-20 Thread Eric Snowberg


> On Jan 20, 2021, at 4:26 AM, Jarkko Sakkinen  wrote:
> 
> On Fri, Jan 15, 2021 at 09:49:02AM -0700, Eric Snowberg wrote:
>> 
>>> On Jan 15, 2021, at 2:15 AM, Jarkko Sakkinen  wrote:
>>> 
>>> On Wed, Jan 13, 2021 at 05:11:10PM -0700, Eric Snowberg wrote:
>>>> 
>>>>> On Jan 13, 2021, at 1:41 PM, Jarkko Sakkinen 
>>>>>  wrote:
>>>>> 
>>>>> On Tue, Jan 12, 2021 at 02:57:39PM +, David Howells wrote:
>>>>>> Eric Snowberg  wrote:
>>>>>> 
>>>>>>>> On Dec 10, 2020, at 2:49 AM, David Howells  wrote:
>>>>>>>> 
>>>>>>>> Eric Snowberg  wrote:
>>>>>>>> 
>>>>>>>>> Add support for EFI_CERT_X509_GUID dbx entries. When a 
>>>>>>>>> EFI_CERT_X509_GUID
>>>>>>>>> is found, it is added as an asymmetrical key to the .blacklist 
>>>>>>>>> keyring.
>>>>>>>>> Anytime the .platform keyring is used, the keys in the .blacklist 
>>>>>>>>> keyring
>>>>>>>>> are referenced, if a matching key is found, the key will be rejected.
>>>>>>>> 
>>>>>>>> Ummm...  Why this way and not as a blacklist key which takes up less 
>>>>>>>> space?
>>>>>>>> I'm guessing that you're using the key chain matching logic.  We 
>>>>>>>> really only
>>>>>>>> need to blacklist the key IDs.
>>>>>>> 
>>>>>>> I implemented it this way so that certs in the dbx would only impact 
>>>>>>> the .platform keyring. I was under the impression we didn’t want to 
>>>>>>> have 
>>>>>>> Secure Boot UEFI db/dbx certs dictate keyring functionality within the 
>>>>>>> kernel
>>>>>>> itself. Meaning if we have a matching dbx cert in any other keyring 
>>>>>>> (builtin,
>>>>>>> secondary, ima, etc.), it would be allowed. If that is not how you’d 
>>>>>>> like to 
>>>>>>> see it done, let me know and I’ll make the change.
>>>>>> 
>>>>>> I wonder if that is the right thing to do.  I guess this is a policy
>>>>>> decision and may depend on the particular user.
>>>>> 
>>>>> Why would you want to allow dbx entry in any keyring?
>>>> 
>>>> Today, DB and MOK certs go into the platform keyring.  These certs are only
>>>> referenced during kexec.  They can’t be used for other things like 
>>>> validating
>>>> kernel module signatures.  If we follow the same pattern, the DBX and MOKX 
>>>> entries
>>>> in the blacklist keyring should only impact kexec. 
>>>> 
>>>> Currently, Mickaël Salaün has another outstanding series to allow root to 
>>>> update 
>>>> the blacklist keyring.  I assume the use case for this is around 
>>>> certificates used 
>>>> within the kernel, for example revoking kernel module signatures.  The 
>>>> question I have
>>>> is, should another keyring be introduced?  One that carries DBX and MOKX, 
>>>> which just
>>>> correspond to certs/hashes in the platform keyring; this keyring would 
>>>> only be
>>>> referenced for kexec, just like the platform keyring is today. Then, the 
>>>> current
>>>> blacklist keyring would be used for everything internal to the kernel.
>>> 
>>> Right, I'm following actively that series.
>>> 
>>> Why couldn't user space drive this process and use that feature to do it?
>> 
>> I could see where the user would want to use both. With Mickaël Salaün’s
>> series, the blacklist keyring is updated immediately.  However it does
>> not survive a reboot.  With my patch, the blacklist keyring is updated
>> during boot, based on what is in the dbx. Neither approach needs a new 
>> kernel build.
> 
> I don't want to purposely challenge this, but why does it matter
> that it doesn't survive the boot? I'm referring here to the golden
> principle of the kernel defining a mechanism, not policy. User space
> can do the population however it wants to for every boot.
> 
> E.g. systemd service could do this.
> 
> What am I missing here?

This change simply adds support for a missing type.  The kernel 
already supports cert and hash entries (EFI_CERT_X509_SHA256_GUID,
EFI_CERT_SHA256_GUID) that originate from the dbx and are loaded 
into the blacklist keyring during boot.  I’m not sure why a cert 
defined with EFI_CERT_X509_GUID should be handled in a different 
manner.

I suppose a user space tool could be created. But wouldn’t what is
currently done in the kernel in this area need to be removed?



Re: [RFC][PATCH] apparmor: Enforce progressively tighter permissions for no_new_privs

2021-01-20 Thread Eric W. Biederman


This should now Cc the correct email address for James Morris.

ebied...@xmission.com (Eric W. Biederman) writes:

> The current understanding of apparmor with respect to no_new_privs is at
> odds with how no_new_privs is implemented and understood by the rest of
> the kernel.
>
> The documentation of no_new_privs states:
>> With ``no_new_privs`` set, ``execve()`` promises not to grant the
>> privilege to do anything that could not have been done without the
>> execve call.
>
> And reading through the kernel, except for apparmor, that description
> matches what is implemented.
>
> There are two major divergences of apparmor from this definition:
> - proc_setattr enforces limitations when no_new_privs is set.
> - the limitation is enforced from the apparent time when no_new_privs is
>   set instead of guaranteeing that each execve has progressively
>   narrower permissions.
>
> The code in apparmor that attempts to discover the apparmor label at the
> point where no_new_privs is set is not robust.  The capture happens a
> long time after no_new_privs is set.
>
> Capturing the label at the point where no_new_privs is set is
> practically impossible to implement robustly.  Today the rule is that a
> struct cred can only be changed by its current task.  Today
> SECCOMP_FILTER_FLAG_TSYNC sets no_new_privs from another thread.  A
> robust implementation would require changing something fundamental in
> how creds are managed for SECCOMP_FILTER_FLAG_TSYNC to be able to
> capture the cred at the point it is set.
>
> Furthermore, given the consistent documentation and how everything else
> implements no_new_privs, not having the permissions get progressively
> tighter is a footgun aimed at userspace.  I fully expect it to break any
> security sensitive software that uses no_new_privs and was not
> deliberately designed and tested against apparmor.
>
> Avoid the questionable and hard-to-fix implementation and the
> potential to confuse userspace by having no_new_privs enforce
> progressively tighter permissions.
>
> Fixes: 9fcf78cca198 ("apparmor: update domain transitions that are subsets of 
> confinement at nnp")
> Signed-off-by: Eric W. Biederman 
> ---
>
> I came across this while examining the places cred_guard_mutex is
> used and trying to find a way to make those code paths less insane.
>
> If it would be more palatable I would not mind removing the
> task_no_new_privs test entirely from aa_change_hat and aa_change_profile
> as those are not part of exec, so arguably no_new_privs should not care
> about them at all.
>
> Can we please get rid of the huge semantic wart and pain in the 
> implementation?
>
>  security/apparmor/domain.c   | 39 
>  security/apparmor/include/task.h |  4 
>  security/apparmor/task.c |  7 --
>  3 files changed, 4 insertions(+), 46 deletions(-)
>
> diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
> index f919ebd042fd..8f77059bf890 100644
> --- a/security/apparmor/domain.c
> +++ b/security/apparmor/domain.c
> @@ -869,17 +869,6 @@ int apparmor_bprm_creds_for_exec(struct linux_binprm 
> *bprm)
>  
>   label = aa_get_newest_label(cred_label(bprm->cred));
>  
> - /*
> -  * Detect no new privs being set, and store the label it
> -  * occurred under. Ideally this would happen when nnp
> -  * is set but there isn't a good way to do that yet.
> -  *
> -  * Testing for unconfined must be done before the subset test
> -  */
> - if ((bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS) && !unconfined(label) &&
> - !ctx->nnp)
> - ctx->nnp = aa_get_label(label);
> -
>   /* buffer freed below, name is pointer into buffer */
>   buffer = aa_get_buffer(false);
>   if (!buffer) {
> @@ -915,7 +904,7 @@ int apparmor_bprm_creds_for_exec(struct linux_binprm 
> *bprm)
>*/
>   if ((bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS) &&
>   !unconfined(label) &&
> - !aa_label_is_unconfined_subset(new, ctx->nnp)) {
> + !aa_label_is_unconfined_subset(new, label)) {
>   error = -EPERM;
>   info = "no new privs";
>   goto audit;
> @@ -1158,16 +1147,6 @@ int aa_change_hat(const char *hats[], int count, u64 
> token, int flags)
>   label = aa_get_newest_cred_label(cred);
>   previous = aa_get_newest_label(ctx->previous);
>  
> - /*
> -  * Detect no new privs being set, and store the label it
> -  * occurred under. Ideally this would happen when nnp
> -  * is set but there isn't a good way to do that yet.
> - 

Re: [PATCH] Increase limit of max_user_watches from 1/25 to 1/16

2021-01-20 Thread Eric Curtin
On Wed, 20 Jan 2021 at 13:02, Eric Curtin  wrote:
>
> Raise the default value for  max_user_watches  to 1/16 (6.25%) of the
> available low memory, divided by the "watch" cost in bytes.
>
> Tools like inotify-tools and Visual Studio Code seem to hit these
> limits a little too easily.
>
> Also amend the documentation, which referred to an old value for this.
>
> Signed-off-by: Eric Curtin 
> ---
>  Documentation/admin-guide/sysctl/fs.rst | 4 ++--
>  fs/eventpoll.c  | 4 ++--
>  2 files changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/Documentation/admin-guide/sysctl/fs.rst 
> b/Documentation/admin-guide/sysctl/fs.rst
> index f48277a0a850..f7fe45e69c41 100644
> --- a/Documentation/admin-guide/sysctl/fs.rst
> +++ b/Documentation/admin-guide/sysctl/fs.rst
> @@ -380,5 +380,5 @@ This configuration option sets the maximum number of 
> "watches" that are
>  allowed for each user.
>  Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
>  on a 64bit one.
> -The current default value for  max_user_watches  is the 1/32 of the available
> -low memory, divided for the "watch" cost in bytes.
> +The current default value for  max_user_watches  is the 1/16 (6.25%) of the
> +available low memory, divided for the "watch" cost in bytes.
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index a829af074eb5..de9ef8f6d0b2 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -2352,9 +2352,9 @@ static int __init eventpoll_init(void)
>
> si_meminfo(&si);
> /*
> -* Allows top 4% of lomem to be allocated for epoll watches (per 
> user).
> +* Allows top 6.25% of lomem to be allocated for epoll watches (per 
> user).
>  */
> -   max_user_watches = (((si.totalram - si.totalhigh) / 25) << 
> PAGE_SHIFT) /
> +   max_user_watches = (((si.totalram - si.totalhigh) / 16) << 
> PAGE_SHIFT) /
> EP_ITEM_COST;
> BUG_ON(max_user_watches < 0);
>
> --
> 2.25.1
>

Please ignore this; it adjusts the wrong limit (an epoll one). I sent
another patch just
to update the documentation to be correct. Waiman Long already kindly solved the
issue in 92890123749bafc317bbfacbe0a62ce08d78efb7

Separate patch is titled "[PATCH] Update
Documentation/admin-guide/sysctl/fs.rst"


[PATCH] Increase limit of max_user_watches from 1/25 to 1/16

2021-01-20 Thread Eric Curtin
Raise the default value for  max_user_watches  to 1/16 (6.25%) of the
available low memory, divided by the "watch" cost in bytes.

Tools like inotify-tools and Visual Studio Code seem to hit these
limits a little too easily.

Also amend the documentation, which referred to an old value for this.

Signed-off-by: Eric Curtin 
---
 Documentation/admin-guide/sysctl/fs.rst | 4 ++--
 fs/eventpoll.c  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/fs.rst 
b/Documentation/admin-guide/sysctl/fs.rst
index f48277a0a850..f7fe45e69c41 100644
--- a/Documentation/admin-guide/sysctl/fs.rst
+++ b/Documentation/admin-guide/sysctl/fs.rst
@@ -380,5 +380,5 @@ This configuration option sets the maximum number of 
"watches" that are
 allowed for each user.
 Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
 on a 64bit one.
-The current default value for  max_user_watches  is the 1/32 of the available
-low memory, divided for the "watch" cost in bytes.
+The current default value for  max_user_watches  is the 1/16 (6.25%) of the
+available low memory, divided for the "watch" cost in bytes.
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a829af074eb5..de9ef8f6d0b2 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2352,9 +2352,9 @@ static int __init eventpoll_init(void)
 
si_meminfo(&si);
/*
-* Allows top 4% of lomem to be allocated for epoll watches (per user).
+* Allows top 6.25% of lomem to be allocated for epoll watches (per 
user).
 */
-   max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
+   max_user_watches = (((si.totalram - si.totalhigh) / 16) << PAGE_SHIFT) /
EP_ITEM_COST;
BUG_ON(max_user_watches < 0);
 
-- 
2.25.1
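
For a sense of scale (illustrative arithmetic, assuming ~160 bytes per
watch on 64-bit): with 4 GiB of low memory, 1/25 is ~164 MiB, which
allows ~1.07 million watches, while 1/16 is 256 MiB, or ~1.68 million
watches.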



[PATCH] Update Documentation/admin-guide/sysctl/fs.rst

2021-01-20 Thread Eric Curtin
max_user_watches for epoll should say 1/25, rather than 1/32

Signed-off-by: Eric Curtin 
---
 Documentation/admin-guide/sysctl/fs.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/fs.rst 
b/Documentation/admin-guide/sysctl/fs.rst
index f48277a0a850..2a501c9ddc55 100644
--- a/Documentation/admin-guide/sysctl/fs.rst
+++ b/Documentation/admin-guide/sysctl/fs.rst
@@ -380,5 +380,5 @@ This configuration option sets the maximum number of 
"watches" that are
 allowed for each user.
 Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
 on a 64bit one.
-The current default value for  max_user_watches  is the 1/32 of the available
-low memory, divided for the "watch" cost in bytes.
+The current default value for  max_user_watches  is the 1/25 (4%) of the
+available low memory, divided for the "watch" cost in bytes.
-- 
2.25.1



Re: [PATCH net] tcp: Fix potential use-after-free due to double kfree().

2021-01-20 Thread Eric Dumazet
On Wed, Jan 20, 2021 at 2:17 AM Jakub Kicinski  wrote:
>
> On Mon, 18 Jan 2021 14:59:20 +0900 Kuniyuki Iwashima wrote:
> > Receiving ACK with a valid SYN cookie, cookie_v4_check() allocates struct
> > request_sock and then can allocate inet_rsk(req)->ireq_opt. After that,
> > tcp_v4_syn_recv_sock() allocates struct sock and copies ireq_opt to
> > inet_sk(sk)->inet_opt. Normally, tcp_v4_syn_recv_sock() inserts the full
> > socket into ehash and sets NULL to ireq_opt. Otherwise,
> > tcp_v4_syn_recv_sock() has to reset inet_opt by NULL and free the full
> > socket.
> >
> > The commit 01770a1661657 ("tcp: fix race condition when creating child
> > sockets from syncookies") added a new path, in which more than one cores
> > create full sockets for the same SYN cookie. Currently, the core which
> > loses the race frees the full socket without resetting inet_opt, resulting
> > in that both sock_put() and reqsk_put() call kfree() for the same memory:
> >
> >   sock_put
> > sk_free
> >   __sk_free
> > sk_destruct
> >   __sk_destruct
> > sk->sk_destruct/inet_sock_destruct
> >   kfree(rcu_dereference_protected(inet->inet_opt, 1));
> >
> >   reqsk_put
> > reqsk_free
> >   __reqsk_free
> > req->rsk_ops->destructor/tcp_v4_reqsk_destructor
> >   kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
> >
> > Calling kmalloc() between the double kfree() can lead to use-after-free, so
> > this patch fixes it by setting inet_opt to NULL before sock_put().
> >
> > As a side note, this kind of issue does not happen for IPv6. This is
> > because tcp_v6_syn_recv_sock() clones both ipv6_opt and pktopts which
> > correspond to ireq_opt in IPv4.
> >
> > Fixes: 01770a166165 ("tcp: fix race condition when creating child sockets 
> > from syncookies")
> > CC: Ricardo Dias 
> > Signed-off-by: Kuniyuki Iwashima 
> > Reviewed-by: Benjamin Herrenschmidt 
>
> Ricardo, Eric, any reason this was written this way?

Well, I guess that was a plain bug.

IPv4 options are not used often I think.

Reviewed-by: Eric Dumazet 
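
For reference, the shape of the fix at the tail of tcp_v4_syn_recv_sock()
(a simplified sketch of the paths described in the changelog, not the
verbatim patch):

	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;		/* ownership moved to newsk */
	} else {
		newinet->inet_opt = NULL;	/* still owned by the reqsk */
		if (!req_unhash && found_dup_sk) {
			/* Syncookie race: another CPU won; drop our socket
			 * without freeing ireq_opt a second time.
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}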


Re: [RFC v3 2/2] vfio/platform: msi: add Broadcom platform devices

2021-01-20 Thread Auger Eric
Hi Alex,

On 1/19/21 11:45 PM, Alex Williamson wrote:
> On Fri, 15 Jan 2021 10:24:33 +0100
> Auger Eric  wrote:
> 
>> Hi Vikas,
>> On 1/15/21 7:35 AM, Vikas Gupta wrote:
>>> Hi Eric,
>>>
>>> On Tue, Jan 12, 2021 at 2:52 PM Auger Eric  wrote:  
>>>>
>>>> Hi Vikas,
>>>>
>>>> On 12/14/20 6:45 PM, Vikas Gupta wrote:  
>>>>> Add msi support for Broadcom platform devices
>>>>>
>>>>> Signed-off-by: Vikas Gupta 
>>>>> ---
>>>>>  drivers/vfio/platform/Kconfig |  1 +
>>>>>  drivers/vfio/platform/Makefile|  1 +
>>>>>  drivers/vfio/platform/msi/Kconfig |  9 
>>>>>  drivers/vfio/platform/msi/Makefile|  2 +
>>>>>  .../vfio/platform/msi/vfio_platform_bcmplt.c  | 49 +++
>>>>>  5 files changed, 62 insertions(+)
>>>>>  create mode 100644 drivers/vfio/platform/msi/Kconfig
>>>>>  create mode 100644 drivers/vfio/platform/msi/Makefile
>>>>>  create mode 100644 drivers/vfio/platform/msi/vfio_platform_bcmplt.c  
>>>> what does plt mean?  
>>> This (plt) is a generic name for Broadcom platform devices, which we
>>>  plan to add in this file. Currently we have only one in this file.
>>> Do you think this name does not sound good here?  
>>
>> we have VFIO_PLATFORM_BCMFLEXRM_RESET config which also applied to vfio
>> flex-rm platform device.
>>
>> I think it would be more homogeneous to have VFIO_PLATFORM_BCMFLEXRM_MSI
>> in case we keep a separate msi module.
>>
>> also in reset dir we have vfio_platform_bcmflexrm.c
>>
>>
>>>>>
>>>>> diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig
>>>>> index dc1a3c44f2c6..7b8696febe61 100644
>>>>> --- a/drivers/vfio/platform/Kconfig
>>>>> +++ b/drivers/vfio/platform/Kconfig
>>>>> @@ -21,3 +21,4 @@ config VFIO_AMBA
>>>>> If you don't know what to do here, say N.
>>>>>
>>>>>  source "drivers/vfio/platform/reset/Kconfig"
>>>>> +source "drivers/vfio/platform/msi/Kconfig"
>>>>> diff --git a/drivers/vfio/platform/Makefile 
>>>>> b/drivers/vfio/platform/Makefile
>>>>> index 3f3a24e7c4ef..9ccdcdbf0e7e 100644
>>>>> --- a/drivers/vfio/platform/Makefile
>>>>> +++ b/drivers/vfio/platform/Makefile
>>>>> @@ -5,6 +5,7 @@ vfio-platform-y := vfio_platform.o
>>>>>  obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o
>>>>>  obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform-base.o
>>>>>  obj-$(CONFIG_VFIO_PLATFORM) += reset/
>>>>> +obj-$(CONFIG_VFIO_PLATFORM) += msi/
>>>>>
>>>>>  vfio-amba-y := vfio_amba.o
>>>>>
>>>>> diff --git a/drivers/vfio/platform/msi/Kconfig 
>>>>> b/drivers/vfio/platform/msi/Kconfig
>>>>> new file mode 100644
>>>>> index ..54d6b70e1e32
>>>>> --- /dev/null
>>>>> +++ b/drivers/vfio/platform/msi/Kconfig
>>>>> @@ -0,0 +1,9 @@
>>>>> +# SPDX-License-Identifier: GPL-2.0-only
>>>>> +config VFIO_PLATFORM_BCMPLT_MSI
>>>>> + tristate "MSI support for Broadcom platform devices"
>>>>> + depends on VFIO_PLATFORM && (ARCH_BCM_IPROC || COMPILE_TEST)
>>>>> + default ARCH_BCM_IPROC
>>>>> + help
>>>>> +   Enables the VFIO platform driver to handle msi for Broadcom 
>>>>> devices
>>>>> +
>>>>> +   If you don't know what to do here, say N.
>>>>> diff --git a/drivers/vfio/platform/msi/Makefile 
>>>>> b/drivers/vfio/platform/msi/Makefile
>>>>> new file mode 100644
>>>>> index ..27422d45cecb
>>>>> --- /dev/null
>>>>> +++ b/drivers/vfio/platform/msi/Makefile
>>>>> @@ -0,0 +1,2 @@
>>>>> +# SPDX-License-Identifier: GPL-2.0
>>>>> +obj-$(CONFIG_VFIO_PLATFORM_BCMPLT_MSI) += vfio_platform_bcmplt.o
>>>>> diff --git a/drivers/vfio/platform/msi/vfio_platform_bcmplt.c 
>>>>> b/drivers/vfio/platform/msi/vfio_platform_bcmplt.c
>>>>> new file mode 100644
>>>>> index ..a074b5e92d77
>>>>> --- /dev/null
>>>>> +++ b/

Re: [dm-devel] [PATCH AUTOSEL 5.4 03/26] dm integrity: select CRYPTO_SKCIPHER

2021-01-19 Thread Eric Biggers
On Tue, Jan 19, 2021 at 08:26:40PM -0500, Sasha Levin wrote:
> From: Anthony Iliopoulos 
> 
> [ Upstream commit f7b347acb5f6c29d9229bb64893d8b6a2c7949fb ]
> 
> The integrity target relies on skcipher for encryption/decryption, but
> certain kernel configurations may not enable CRYPTO_SKCIPHER, leading to
> compilation errors due to unresolved symbols. Explicitly select
> CRYPTO_SKCIPHER for DM_INTEGRITY, since it is unconditionally dependent
> on it.
> 
> Signed-off-by: Anthony Iliopoulos 
> Signed-off-by: Mike Snitzer 
> Signed-off-by: Sasha Levin 
> ---
>  drivers/md/Kconfig | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
> index aa98953f4462e..7dd6e98257c72 100644
> --- a/drivers/md/Kconfig
> +++ b/drivers/md/Kconfig
> @@ -565,6 +565,7 @@ config DM_INTEGRITY
>   select BLK_DEV_INTEGRITY
>   select DM_BUFIO
>   select CRYPTO
> + select CRYPTO_SKCIPHER
>   select ASYNC_XOR
>   ---help---
> This device-mapper target emulates a block device that has

CRYPTO_SKCIPHER doesn't exist in 5.4 and earlier because it was renamed from
CRYPTO_BLKCIPHER in 5.5.  If this patch is really important enough to backport,
CRYPTO_SKCIPHER will need to be changed to CRYPTO_BLKCIPHER.

- Eric
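
In other words, a 5.4 backport of this hunk would need to select the old
symbol name (sketch):

	 select BLK_DEV_INTEGRITY
	 select DM_BUFIO
	 select CRYPTO
	+select CRYPTO_BLKCIPHER
	 select ASYNC_XOR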


Re: [RFC PATCH v3 1/8] Use refcount_t for ucounts reference counting

2021-01-19 Thread Eric W. Biederman
ebied...@xmission.com (Eric W. Biederman) writes:

> Alexey Gladkov  writes:
>
>> On Mon, Jan 18, 2021 at 12:34:29PM -0800, Linus Torvalds wrote:
>>> On Mon, Jan 18, 2021 at 11:46 AM Alexey Gladkov
>>>  wrote:
>>> >
>>> > Sorry about that. I thought that this code is not needed when switching
>>> > from int to refcount_t. I was wrong.
>>> 
>>> Well, you _may_ be right. I personally didn't check how the return
>>> value is used.
>>> 
>>> I only reacted to "it certainly _may_ be used, and there is absolutely
>>> no comment anywhere about why it wouldn't matter".
>>
>> I have not found examples where the overflow is checked after calling
>> refcount_inc/refcount_add.
>>
>> For example in kernel/fork.c:2298 :
>>
>>current->signal->nr_threads++;   
>>atomic_inc(&current->signal->live);
>>refcount_inc(&current->signal->sigcnt);
>>
>> $ semind search signal_struct.sigcnt
>> def include/linux/sched/signal.h:83  refcount_t  
>> sigcnt;
>> m-- kernel/fork.c:723 put_signal_struct  if 
>> (refcount_dec_and_test(&sig->sigcnt))
>> m-- kernel/fork.c:1571 copy_signal   refcount_set(&sig->sigcnt, 1);
>> m-- kernel/fork.c:2298 copy_process  
>> refcount_inc(&current->signal->sigcnt);
>>
>> It seems to me that the only way is to use __refcount_inc and then compare
>> the old value with REFCOUNT_MAX
>>
>> Since I have not seen examples of such checks, I thought that this is
>> acceptable. Sorry once again. I have not tried to hide these changes.
>
> The current ucount code does check for overflow and fails the increment
> in every case.
>
> So arguably it will be a regression and inferior error handling behavior
> if the code switches to the ``better'' refcount_t data structure.
>
> I originally didn't use refcount_t because silently saturating and not
> bothering to handle the error makes me uncomfortable.
>
> Not having to acquire the ucounts_lock every time seems nice.  Perhaps
> the path forward would be to start with stupid/correct code that always
> takes the ucounts_lock for every increment of ucounts->count, that is
> later replaced with something more optimal.
>
> Not impacting performance in the non-namespace cases and having good
> performance in the other cases is a fundamental requirement of merging
> code like this.

So starting with something easy to comprehend and simple may make it
easier to figure out how to optimize the code.
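For concreteness, a minimal sketch of that stupid/correct starting
point, assuming the existing ucounts_lock spinlock and an int count
field (the helper name here is illustrative, not a final API):

	static bool test_and_inc_ucounts(struct ucounts *ucounts)
	{
		bool ret = false;

		spin_lock_irq(&ucounts_lock);
		if (ucounts->count < INT_MAX) {
			/* fail the increment explicitly, don't saturate */
			ucounts->count++;
			ret = true;
		}
		spin_unlock_irq(&ucounts_lock);
		return ret;
	}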

Eric



Re: [RFC PATCH v3 1/8] Use refcount_t for ucounts reference counting

2021-01-19 Thread Eric W. Biederman
Alexey Gladkov  writes:

> On Mon, Jan 18, 2021 at 12:34:29PM -0800, Linus Torvalds wrote:
>> On Mon, Jan 18, 2021 at 11:46 AM Alexey Gladkov
>>  wrote:
>> >
>> > Sorry about that. I thought that this code is not needed when switching
>> > from int to refcount_t. I was wrong.
>> 
>> Well, you _may_ be right. I personally didn't check how the return
>> value is used.
>> 
>> I only reacted to "it certainly _may_ be used, and there is absolutely
>> no comment anywhere about why it wouldn't matter".
>
> I have not found examples where the overflow is checked after calling
> refcount_inc/refcount_add.
>
> For example in kernel/fork.c:2298 :
>
>	current->signal->nr_threads++;
>	atomic_inc(&current->signal->live);
>	refcount_inc(&current->signal->sigcnt);
>
> $ semind search signal_struct.sigcnt
> def include/linux/sched/signal.h:83   refcount_t  sigcnt;
> m-- kernel/fork.c:723 put_signal_struct   if (refcount_dec_and_test(&sig->sigcnt))
> m-- kernel/fork.c:1571 copy_signal   refcount_set(&sig->sigcnt, 1);
> m-- kernel/fork.c:2298 copy_process   refcount_inc(&current->signal->sigcnt);
>
> It seems to me that the only way is to use __refcount_inc and then compare
> the old value with REFCOUNT_MAX
>
> Since I have not seen examples of such checks, I thought that this is
> acceptable. Sorry once again. I have not tried to hide these changes.

The current ucount code does check for overflow and fails the increment
in every case.

So arguably it will be a regression and inferior error handling behavior
if the code switches to the ``better'' refcount_t data structure.

I originally didn't use refcount_t because silently saturating and not
bothering to handle the error makes me uncomfortable.

Not having to acquire the ucounts_lock every time seems nice.  Perhaps
the path forward would be to start with stupid/correct code that always
takes the ucounts_lock for every increment of ucounts->count, that is
later replaced with something more optimal.

Not impacting performance in the non-namespace cases and having good
performance in the other cases is a fundamental requirement of merging
code like this.

Eric



Re: [PATCH 2/2] security.capability: fix conversions on getxattr

2021-01-19 Thread Eric W. Biederman
Miklos Szeredi  writes:

> If a capability is stored on disk in v2 format cap_inode_getsecurity() will
> currently return in v2 format unconditionally.
>
> This is wrong: v2 cap should be equivalent to a v3 cap with zero rootid,
> and so the same conversions performed on it.
>
> If the rootid cannot be mapped v3 is returned unconverted.  Fix this so
> that both v2 and v3 return -EOVERFLOW if the rootid (or the owner of the fs
> user namespace in case of v2) cannot be mapped in the current user
> namespace.

This looks like a good cleanup.

I do wonder how well this works with stacking.  In particular
ovl_xattr_set appears to call vfs_getxattr without overriding the creds.
What the purpose of that is I haven't quite figured out.  It looks like
it is just a probe to see if an xattr is present so maybe it is ok.

Acked-by: "Eric W. Biederman" 

>
> Signed-off-by: Miklos Szeredi 
> ---
>  security/commoncap.c | 67 
>  1 file changed, 43 insertions(+), 24 deletions(-)
>
> diff --git a/security/commoncap.c b/security/commoncap.c
> index baccd871..c9d99f8f4c82 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -371,10 +371,11 @@ int cap_inode_getsecurity(struct inode *inode, const 
> char *name, void **buffer,
>  {
>   int size, ret;
>   kuid_t kroot;
> + __le32 nsmagic, magic;
>   uid_t root, mappedroot;
>   char *tmpbuf = NULL;
>   struct vfs_cap_data *cap;
> - struct vfs_ns_cap_data *nscap;
> + struct vfs_ns_cap_data *nscap = NULL;
>   struct dentry *dentry;
>   struct user_namespace *fs_ns;
>  
> @@ -396,46 +397,61 @@ int cap_inode_getsecurity(struct inode *inode, const 
> char *name, void **buffer,
>   fs_ns = inode->i_sb->s_user_ns;
>   cap = (struct vfs_cap_data *) tmpbuf;
>   if (is_v2header((size_t) ret, cap)) {
> - /* If this is sizeof(vfs_cap_data) then we're ok with the
> -  * on-disk value, so return that.  */
> - if (alloc)
> - *buffer = tmpbuf;
> - else
> - kfree(tmpbuf);
> - return ret;
> - } else if (!is_v3header((size_t) ret, cap)) {
> - kfree(tmpbuf);
> - return -EINVAL;
> + root = 0;
> + } else if (is_v3header((size_t) ret, cap)) {
> + nscap = (struct vfs_ns_cap_data *) tmpbuf;
> + root = le32_to_cpu(nscap->rootid);
> + } else {
> + size = -EINVAL;
> + goto out_free;
>   }
>  
> - nscap = (struct vfs_ns_cap_data *) tmpbuf;
> - root = le32_to_cpu(nscap->rootid);
>   kroot = make_kuid(fs_ns, root);
>  
>   /* If the root kuid maps to a valid uid in current ns, then return
>* this as a nscap. */
>   mappedroot = from_kuid(current_user_ns(), kroot);
>   if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
> + size = sizeof(struct vfs_ns_cap_data);
>   if (alloc) {
> - *buffer = tmpbuf;
> + if (!nscap) {
> + /* v2 -> v3 conversion */
> + nscap = kzalloc(size, GFP_ATOMIC);
> + if (!nscap) {
> + size = -ENOMEM;
> + goto out_free;
> + }
> + nsmagic = VFS_CAP_REVISION_3;
> + magic = le32_to_cpu(cap->magic_etc);
> + if (magic & VFS_CAP_FLAGS_EFFECTIVE)
> + nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
> + memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
> + nscap->magic_etc = cpu_to_le32(nsmagic);
> + } else {
> + /* use allocated v3 buffer */
> + tmpbuf = NULL;
> + }
>   nscap->rootid = cpu_to_le32(mappedroot);
> - } else
> - kfree(tmpbuf);
> - return size;
> + *buffer = nscap;
> + }
> + goto out_free;
>   }
>  
>   if (!rootid_owns_currentns(kroot)) {
> - kfree(tmpbuf);
> - return -EOPNOTSUPP;
> + size = -EOVERFLOW;
> + goto out_free;
>   }
>  
>   /* This comes from a parent namespace.  Return as a v2 capability */
>   size = sizeof(struct vfs_cap_data);
>   if (alloc) {
> - *buffer = 

Re: [PATCH 0/2] capability conversion fixes

2021-01-19 Thread Eric W. Biederman
Miklos Szeredi  writes:

> It turns out overlayfs is actually okay wrt. mutliple conversions, because
> it uses the right context for lower operations.  I.e. before calling
> vfs_{set,get}xattr() on underlying fs, it overrides creds with that of the
> mounter, so the current user ns will now match that of
> overlay_sb->s_user_ns, meaning that the caps will be converted to just the
> right format for the next layer
>
> OTOH ecryptfs, which is the only other one affected by commit 7c03e2cda4a5
> ("vfs: move cap_convert_nscap() call into vfs_setxattr()") needs to be
> fixed up, since it doesn't do the cap override thing that overlayfs does.
>
> I don't have an ecryptfs setup, so untested, but it's a fairly trivial
> change.
>
> My other observation was that cap_inode_getsecurity() messes up conversion
> of caps in more than one case.  This is independent of the overlayfs user
> ns enablement but affects it as well.
>
> Maybe we can revisit the infrastructure improvements we discussed, but I
> think these fixes are more appropriate for the current cycle.

I mostly agree.  Fixing the bugs in a back-portable way is important.

However we need to sort out the infrastructure, and implementation.

As far as I can tell it is only the fact that overlayfs does not support
the new mount api aka fs_context that allows this fix to work and be
correct.

I believe the new mount api would allow specifying a different userns
than current_user_ns for the overlay filesystem, and that would break
this.

So while I agree with making a minimal fix for now, we need a good
fix, because this code is much too subtle and can break very easily
with no one noticing.
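For reference, the cred-override pattern Miklos describes looks roughly
like this (a simplified sketch built on the ovl_override_creds() and
revert_creds() helpers overlayfs uses; variable names are illustrative
and error handling is omitted):

	const struct cred *old_cred;

	/* switch to the mounter's creds so vfs_getxattr() sees the
	 * user namespace the overlay was mounted in */
	old_cred = ovl_override_creds(dentry->d_sb);
	res = vfs_getxattr(realdentry, name, value, size);
	revert_creds(old_cred);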

Eric





> Thanks,
> Miklos
>
> Miklos Szeredi (2):
>   ecryptfs: fix uid translation for setxattr on security.capability
>   security.capability: fix conversions on getxattr
>
>  fs/ecryptfs/inode.c  | 10 +--
>  security/commoncap.c | 67 
>  2 files changed, 50 insertions(+), 27 deletions(-)


Re: [PATCH 1/2] ecryptfs: fix uid translation for setxattr on security.capability

2021-01-19 Thread Eric W. Biederman
Miklos Szeredi  writes:

> Prior to commit 7c03e2cda4a5 ("vfs: move cap_convert_nscap() call into
> vfs_setxattr()") the translation of nscap->rootid did not take stacked
> filesystems (overlayfs and ecryptfs) into account.
>
> That patch fixed the overlay case, but made the ecryptfs case worse.
>
> Restore the old behavior for ecryptfs that existed before the overlayfs
> fix.  This does not fix ecryptfs's handling of complex user namespace
> setups, but it does make sure existing setups don't regress.

Today vfs_setxattr handles a delegated_inode and breaking leases,
code that is enabled with CONFIG_FILE_LOCKING.  So unless I am
missing something, this introduces a different regression into
ecryptfs.
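For context, what vfs_setxattr() adds on top of __vfs_setxattr_locked()
is roughly the following (simplified from fs/xattr.c); the hunk below
passes a NULL delegated_inode and so skips this lease-breaking step:

	struct inode *delegated_inode = NULL;
	int error;

retry_deleg:
	inode_lock(inode);
	error = __vfs_setxattr_locked(dentry, name, value, size, flags,
				      &delegated_inode);
	inode_unlock(inode);

	if (delegated_inode) {
		/* break outstanding file leases, then retry */
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}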

>
> Reported-by: Eric W. Biederman 
> Cc: Tyler Hicks 
> Fixes: 7c03e2cda4a5 ("vfs: move cap_convert_nscap() call into vfs_setxattr()")
> Signed-off-by: Miklos Szeredi 
> ---
>  fs/ecryptfs/inode.c | 10 +++---
>  1 file changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
> index e23752d9a79f..58d0f7187997 100644
> --- a/fs/ecryptfs/inode.c
> +++ b/fs/ecryptfs/inode.c
> @@ -1016,15 +1016,19 @@ ecryptfs_setxattr(struct dentry *dentry, struct inode 
> *inode,
>  {
>   int rc;
>   struct dentry *lower_dentry;
> + struct inode *lower_inode;
>  
>   lower_dentry = ecryptfs_dentry_to_lower(dentry);
> - if (!(d_inode(lower_dentry)->i_opflags & IOP_XATTR)) {
> + lower_inode = d_inode(lower_dentry);
> + if (!(lower_inode->i_opflags & IOP_XATTR)) {
>   rc = -EOPNOTSUPP;
>   goto out;
>   }
> - rc = vfs_setxattr(lower_dentry, name, value, size, flags);
> + inode_lock(lower_inode);
> + rc = __vfs_setxattr_locked(lower_dentry, name, value, size, flags, 
> NULL);
> + inode_unlock(lower_inode);
>   if (!rc && inode)
> - fsstack_copy_attr_all(inode, d_inode(lower_dentry));
> + fsstack_copy_attr_all(inode, lower_inode);
>  out:
>   return rc;
>  }

Eric


[PATCH] rename lpfc_sli4_dump_page_a0 to lpfc_sli4_dump_sfp_pagea0

2021-01-19 Thread Eric Curtin
Comment did not match function signature.

Signed-off-by: Eric Curtin 
---
 drivers/scsi/lpfc/lpfc_mbox.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/lpfc/lpfc_mbox.c b/drivers/scsi/lpfc/lpfc_mbox.c
index 3414ffcb26fe..c03a7f12dd65 100644
--- a/drivers/scsi/lpfc/lpfc_mbox.c
+++ b/drivers/scsi/lpfc/lpfc_mbox.c
@@ -2409,7 +2409,7 @@ lpfc_mbx_cmpl_rdp_page_a0(struct lpfc_hba *phba, 
LPFC_MBOXQ_t *mbox)
 
 
 /*
- * lpfc_sli4_dump_sfp_pagea0 - Dump sli4 read SFP Diagnostic.
+ * lpfc_sli4_dump_page_a0 - Dump sli4 read SFP Diagnostic.
  * @phba: pointer to the hba structure containing.
  * @mbox: pointer to lpfc mbox command to initialize.
  *
-- 
2.25.1



Re: [PATCH RFC v1 00/15] iommu/virtio: Nested stage support with Arm

2021-01-19 Thread Auger Eric
Hi Vivek,

On 1/15/21 1:13 PM, Vivek Gautam wrote:
> This patch-series aims at enabling Nested stage translation in guests
> using virtio-iommu as the paravirtualized iommu. The backend is supported
> with Arm SMMU-v3 that provides nested stage-1 and stage-2 translation.
> 
> This series derives its purpose from various efforts happening to add
> support for Shared Virtual Addressing (SVA) in host and guest. On Arm,
> most of the support for SVA has already landed. The support for nested
> stage translation and fault reporting to guest has been proposed [1].
> The related changes required in VFIO [2] framework have also been put
> forward.
> 
> This series proposes changes in virtio-iommu to program PASID tables
> and related stage-1 page tables. A simple iommu-pasid-table library
> is added for this purpose that interacts with vendor drivers to
> allocate and populate PASID tables.
> In Arm SMMUv3 we propose to pull the Context Descriptor (CD) management
> code out of the arm-smmu-v3 driver and add that as a glue vendor layer
> to support allocating CD tables, and populating them with right values.
> These CD tables are essentially the PASID tables and contain stage-1
> page table configurations too.
> A request to setup these CD tables come from virtio-iommu driver using
> the iommu-pasid-table library when running on Arm. The virtio-iommu
> then pass these PASID tables to the host using the right virtio backend
> and support in VMM.
> 
> For testing we have added necessary support in kvmtool. The changes in
> kvmtool are based on virtio-iommu development branch by Jean-Philippe
> Brucker [3].
> 
> The tested kernel branch contains following in the order bottom to top
> on the git hash -
> a) v5.11-rc3
> b) arm-smmu-v3 [1] and vfio [2] changes from Eric to add nested page
>table support for Arm.
> c) Smmu test engine patches from Jean-Philippe's branch [4]
> d) This series
> e) Domain nesting info patches [5][6][7].
> f) Changes to add arm-smmu-v3 specific nesting info (to be sent to
>the list).
> 
> This kernel is tested on Neoverse reference software stack with
> Fixed virtual platform. Public version of the software stack and
> FVP is available here[8][9].
> 
> A big thanks to Jean-Philippe for his contributions towards this work
> and for his valuable guidance.
> 
> [1] 
> https://lore.kernel.org/linux-iommu/20201118112151.25412-1-eric.au...@redhat.com/T/
> [2] 
> https://lore.kernel.org/kvmarm/20201116110030.32335-12-eric.au...@redhat.com/T/
> [3] https://jpbrucker.net/git/kvmtool/log/?h=virtio-iommu/devel
> [4] https://jpbrucker.net/git/linux/log/?h=sva/smmute
> [5] 
> https://lore.kernel.org/kvm/1599734733-6431-2-git-send-email-yi.l@intel.com/
> [6] 
> https://lore.kernel.org/kvm/1599734733-6431-3-git-send-email-yi.l@intel.com/
> [7] 
> https://lore.kernel.org/kvm/1599734733-6431-4-git-send-email-yi.l@intel.com/
> [8] 
> https://developer.arm.com/tools-and-software/open-source-software/arm-platforms-software/arm-ecosystem-fvps
> [9] 
> https://git.linaro.org/landing-teams/working/arm/arm-reference-platforms.git/about/docs/rdn1edge/user-guide.rst

Could you share a public branch where we could find all the kernel pieces?

Thank you in advance

Best Regards

Eric
> 
> Jean-Philippe Brucker (6):
>   iommu/virtio: Add headers for table format probing
>   iommu/virtio: Add table format probing
>   iommu/virtio: Add headers for binding pasid table in iommu
>   iommu/virtio: Add support for INVALIDATE request
>   iommu/virtio: Attach Arm PASID tables when available
>   iommu/virtio: Add support for Arm LPAE page table format
> 
> Vivek Gautam (9):
>   iommu/arm-smmu-v3: Create a Context Descriptor library
>   iommu: Add a simple PASID table library
>   iommu/arm-smmu-v3: Update drivers to work with iommu-pasid-table
>   iommu/arm-smmu-v3: Update CD base address info for user-space
>   iommu/arm-smmu-v3: Set sync op from consumer driver of cd-lib
>   iommu: Add asid_bits to arm smmu-v3 stage1 table info
>   iommu/virtio: Update table format probing header
>   iommu/virtio: Prepare to add attach pasid table infrastructure
>   iommu/virtio: Update fault type and reason info for viommu fault
> 
>  drivers/iommu/arm/arm-smmu-v3/Makefile|   2 +-
>  .../arm/arm-smmu-v3/arm-smmu-v3-cd-lib.c  | 283 +++
>  .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   |  16 +-
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   | 268 +--
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |   4 +-
>  drivers/iommu/iommu-pasid-table.h | 140 
>  drivers/iommu/virtio-iommu.c  | 692 +-
>  include/uapi/linux/iommu.h|   2 +-
>  include/uapi/linux/virtio_iommu.h | 158 +++-
>  9 files changed, 1303 insertions(+), 262 deletions(-)
>  create mode 100644 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-cd-lib.c
>  create mode 100644 drivers/iommu/iommu-pasid-table.h
> 



Re: [RFC V1 3/7] crypto: ghash - Optimized GHASH computations

2021-01-15 Thread Eric Biggers
On Fri, Jan 15, 2021 at 04:20:44PM -0800, Dave Hansen wrote:
> On 1/15/21 4:14 PM, Dey, Megha wrote:
> > Also, I do not know of any cores that implement PCLMULQDQ and not AES-NI.
> 
> That's true, but it's also possible that a hypervisor could enumerate
> support for PCLMULQDQ and not AES-NI.  In general, we've tried to
> implement x86 CPU features independently, even if they never show up in
> a real CPU independently.

We only add optimized implementations of crypto algorithms if they are actually
useful, though.  If they would never be used in practice, that's not useful.

- Eric


Re: [RFC V1 3/7] crypto: ghash - Optimized GHASH computations

2021-01-15 Thread Eric Biggers
On Fri, Jan 15, 2021 at 04:14:40PM -0800, Dey, Megha wrote:
> > Hello Megha,
> > 
> > What is the purpose of this separate GHASH module? GHASH is only used
> > in combination with AES-CTR to produce GCM, and this series already
> > contains a GCM driver.
> > 
> > Do cores exist that implement PCLMULQDQ but not AES-NI?
> > 
> > If not, I think we should be able to drop this patch (and remove the
> > existing PCLMULQDQ GHASH driver as well)
> 
> AFAIK, dm-verity (authenticated but not encrypted file system) is one use
> case for authentication only.
> 
> Although I am not sure if GHASH is specifically used for this or SHA?
> 
> Also, I do not know of any cores that implement PCLMULQDQ and not AES-NI.
> 

dm-verity only uses unkeyed hash algorithms.  So no, it doesn't use GHASH.

- Eric


Re: [PATCH v4] certs: Add EFI_CERT_X509_GUID support for dbx entries

2021-01-15 Thread Eric Snowberg


> On Jan 15, 2021, at 10:21 AM, James Bottomley 
>  wrote:
> 
> On Tue, 2020-09-15 at 20:49 -0400, Eric Snowberg wrote:
>> The Secure Boot Forbidden Signature Database, dbx, contains a list of
>> now revoked signatures and keys previously approved to boot with UEFI
>> Secure Boot enabled.  The dbx is capable of containing any number of
>> EFI_CERT_X509_SHA256_GUID, EFI_CERT_SHA256_GUID, and
>> EFI_CERT_X509_GUID entries.
>> 
>> Currently when EFI_CERT_X509_GUID are contained in the dbx, the
>> entries are skipped.
>> 
>> Add support for EFI_CERT_X509_GUID dbx entries. When a
>> EFI_CERT_X509_GUID is found, it is added as an asymmetrical key to
>> the .blacklist keyring. Anytime the .platform keyring is used, the
>> keys in the .blacklist keyring are referenced, if a matching key is
>> found, the key will be rejected.
>> 
>> Signed-off-by: Eric Snowberg 
> 
> If you're using shim, as most of our users are, you have no access to
> dbx to blacklist certificates.  Plus our security envelope includes the
> Mok variables, so you should also be paying attestion to MokListX (or
> it's RT equivalent: MokListXRT).
> 
> If you add this to the patch, we get something that is mechanistically
> complete and which also allows users to add certs to their Mok
> blacklist.

That makes sense. I’ll work on a patch to add this ability.



Re: [PATCH v4] certs: Add EFI_CERT_X509_GUID support for dbx entries

2021-01-15 Thread Eric Snowberg


> On Jan 15, 2021, at 2:15 AM, Jarkko Sakkinen  wrote:
> 
> On Wed, Jan 13, 2021 at 05:11:10PM -0700, Eric Snowberg wrote:
>> 
>>> On Jan 13, 2021, at 1:41 PM, Jarkko Sakkinen 
>>>  wrote:
>>> 
>>> On Tue, Jan 12, 2021 at 02:57:39PM +, David Howells wrote:
>>>> Eric Snowberg  wrote:
>>>> 
>>>>>> On Dec 10, 2020, at 2:49 AM, David Howells  wrote:
>>>>>> 
>>>>>> Eric Snowberg  wrote:
>>>>>> 
>>>>>>> Add support for EFI_CERT_X509_GUID dbx entries. When a 
>>>>>>> EFI_CERT_X509_GUID
>>>>>>> is found, it is added as an asymmetrical key to the .blacklist keyring.
>>>>>>> Anytime the .platform keyring is used, the keys in the .blacklist 
>>>>>>> keyring
>>>>>>> are referenced, if a matching key is found, the key will be rejected.
>>>>>> 
>>>>>> Ummm...  Why this way and not as a blacklist key which takes up less 
>>>>>> space?
>>>>>> I'm guessing that you're using the key chain matching logic.  We really 
>>>>>> only
>>>>>> need to blacklist the key IDs.
>>>>> 
>>>>> I implemented it this way so that certs in the dbx would only impact 
>>>>> the .platform keyring. I was under the impression we didn’t want to have 
>>>>> Secure Boot UEFI db/dbx certs dictate keyring functionality within the 
>>>>> kernel
>>>>> itself. Meaning if we have a matching dbx cert in any other keyring 
>>>>> (builtin,
>>>>> secondary, ima, etc.), it would be allowed. If that is not how you’d like 
>>>>> to 
>>>>> see it done, let me know and I’ll make the change.
>>>> 
>>>> I wonder if that is that the right thing to do.  I guess this is a policy
>>>> decision and may depend on the particular user.
>>> 
>>> Why would you want to allow dbx entry in any keyring?
>> 
>> Today, DB and MOK certs go into the platform keyring.  These certs are only
>> referenced during kexec.  They can’t be used for other things like validating
>> kernel module signatures.  If we follow the same pattern, the DBX and MOKX 
>> entries
>> in the blacklist keyring should only impact kexec. 
>> 
>> Currently, Mickaël Salaün has another outstanding series to allow root to 
>> update 
>> the blacklist keyring.  I assume the use case for this is around 
>> certificates used 
>> within the kernel, for example revoking kernel module signatures.  The 
>> question I have
>> is, should another keyring be introduced?  One that carries DBX and MOKX, 
>> which just
>> correspond to certs/hashes in the platform keyring; this keyring would only 
>> be
>> referenced for kexec, just like the platform keyring is today. Then, the 
>> current
>> blacklist keyring would be used for everything internal to the kernel.
> 
> Right, I'm following actively that series.
> 
> Why couldn't user space drive this process and use that feature to do it?

I could see where the user would want to use both. With Mickaël Salaün’s
series, the blacklist keyring is updated immediately.  However it does
not survive a reboot.  With my patch, the blacklist keyring is updated
during boot, based on what is in the dbx. Neither approach needs a new 
kernel build.



Re: [PATCH net] skbuff: back tiny skbs with kmalloc() in __netdev_alloc_skb() too

2021-01-15 Thread Eric Dumazet
On Fri, Jan 15, 2021 at 12:55 AM Alexander Lobakin  wrote:
>
> Commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for
> tiny skbs") ensured that skbs with data size lower than 1025 bytes
> will be kmalloc'ed to avoid excessive page cache fragmentation and
> memory consumption.
> However, the same issue can still be achieved manually via
> __netdev_alloc_skb(), where the check for size hasn't been changed.
> Mirror the condition from __napi_alloc_skb() to prevent from that.
>
> Fixes: 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny 
> skbs")

No, this tag is wrong: if you fix a bug, the bug is much older than linux-5.11

My fix was about GRO head and virtio_net heads, both using pre-sized
small buffers.

You want to fix something else, and this is fine, because some drivers
are unfortunately doing copy break (at the cost of an additional copy,
even for packets that might be consumed right away).
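For context, the check being mirrored into __netdev_alloc_skb() is
roughly the one commit 3226b158e67c added to __napi_alloc_skb()
(a simplified sketch of that upstream check):

	if (len <= SKB_WITH_OVERHEAD(1024) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		/* tiny or oversized request: use a kmalloc'ed skb instead
		 * of a page fragment, avoiding truesize under-estimation */
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}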


Re: cBPF socket filters failing - inexplicably?

2021-01-15 Thread Eric Dumazet
On Fri, Jan 15, 2021 at 7:52 AM Alexei Starovoitov
 wrote:
>
> Adding appropriate mailing list to cc...
>
> My wild guess is that as soon as socket got created:
> socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
> the packets were already queued to it.
> So later setsockopt() is too late to filter.
>
> Eric, thoughts?

Exactly, this is what happens.

I do not know how tcpdump and other programs deal with this.

Maybe by setting a small buffer size, or draining the queue.
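A minimal sketch of the draining idea, applied right after the filter
is attached (roughly what capture programs do; error handling omitted):

	char drain[1];

	/* discard anything queued before the filter took effect */
	while (recv(sock, drain, sizeof(drain), MSG_DONTWAIT) >= 0)
		;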

>
> On Wed, Jan 6, 2021 at 6:55 AM Tom Cook  wrote:
> >
> > Another factoid to add to this:  I captured all traffic on an
> > interface while the test program was running using
> >
> > tcpdump -i wlo1 -w capture.pcap
> >
> > observing that multiple packets got through the filter.  I then built
> > the bpf_dbg program from the kernel source tree and ran the same
> > filter and capture file through it:
> >
> > $ tools/bpf_dbg
> > > load bpf 1,6 0 0 0
> > > load pcap capture.pcap
> > > run
> > bpf passes:0 fails:269288
> >
> > So bpf_dbg thinks the filter is correct; it's only when the filter is
> > attached to an actual socket that it fails occasionally.
> >
> > Regards,
> > Tom
> >
> > On Wed, Jan 6, 2021 at 10:07 AM Tom Cook  wrote:
> > >
> > > Just to note I have also reproduced this on a 5.10.0 kernel.
> > >
> > > On Tue, Jan 5, 2021 at 1:42 PM Tom Cook  wrote:
> > > >
> > > > In the course of tracking down a defect in some existing software,
> > > > I've found the failure demonstrated by the short program below.
> > > > Essentially, a cBPF program that just rejects every frame (ie always
> > > > returns zero) and is attached to a socket using setsockopt(SOL_SOCKET,
> > > > SO_ATTACH_FILTER, ...) still occasionally lets frames through to
> > > > userspace.
> > > >
> > > > The code is based on the first example in
> > > > Documentation/networking/filter.txt, except that I've changed the
> > > > content of the filter program and added a timeout on the socket.
> > > >
> > > > To reproduce the problem:
> > > >
> > > > # gcc test.c -o test
> > > > # sudo ./test
> > > > ... and in another console start a large network operation.
> > > >
> > > > In my case, I copied a ~300MB core file I had lying around to another
> > > > host on the LAN.  The test code should print the string "Failed to
> > > > read from socket" 100 times.  In practice, it produces about 10%
> > > > "Received packet with ethertype..." messages.
> > > >
> > > > I've observed the same result on Ubuntu amd64 glibc system running a
> > > > 5.9.0 kernel and also on Alpine arm64v8 muslc system running a 4.9.1
> > > > kernel.  I've written test code in both C and Python.  I'm fairly sure
> > > > this is not something I'm doing wrong - but very keen to have things
> > > > thrown at me if it is.
> > > >
> > > > Regards,
> > > > Tom Cook
> > > >
> > > >
> > > > #include <stdio.h>
> > > > #include <stdint.h>
> > > > #include <unistd.h>
> > > > #include <sys/time.h>
> > > > #include <sys/socket.h>
> > > > #include <arpa/inet.h>
> > > > #include <linux/if_ether.h>
> > > > #include <linux/filter.h>
> > > >
> > > > struct sock_filter code[] = {
> > > > { 0x06,0,0,0x00 }  /* BPF_RET | BPF_K   0   0   0 */
> > > > };
> > > >
> > > > struct sock_fprog bpf = {
> > > > .len = 1,
> > > > .filter = code,
> > > > };
> > > >
> > > > void test() {
> > > > uint8_t buf[2048];
> > > >
> > > > int sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
> > > > if (sock < 0) {
> > > > printf("Failed to open socket\n");
> > > > return;
> > > > }
> > > > int ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf));
> > > > if (ret < 0) {
> > > > printf("Failed to set socket filter\n");
> > > > return;
> > > > }
> > > > struct timeval tv = {
> > > > .tv_sec = 1
> > > > };
> > > >
> > > > ret = setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
> > > > if (ret < 0) {
> > > > printf("Failed to set socket timeout\n");
> > > > return;
> > > > }
> > > >
> > > > ssize_t count = recv(sock, buf, 2048, 0);
> > > > if (count <= 0) {
> > > > printf("Failed to read from socket\n");
> > > > return;
> > > > }
> > > >
> > > > close(sock);
> > > >
> > > > uint16_t *ethertype = (uint16_t *)(buf + 12);
> > > > uint8_t *proto = (unsigned char *)(buf + 23);
> > > > uint16_t *dport = (uint16_t *)(buf + 14 + 20);
> > > >
> > > > printf("Received packet with ethertype 0x%04hu, protocol 0x%02hhu
> > > > and dport 0x%04hu\n", *ethertype, *proto, *dport);
> > > > }
> > > >
> > > > int main() {
> > > > for (size_t ii = 0; ii < 100; ++ii) {
> > > > test();
> > > > }
> > > > }


Re: [RFC v3 1/2] vfio/platform: add support for msi

2021-01-15 Thread Auger Eric
Hi Vikas,

On 1/15/21 7:26 AM, Vikas Gupta wrote:
> Hi Eric,
> 
> On Tue, Jan 12, 2021 at 2:30 PM Auger Eric  wrote:
>>
>> Hi Vikas,
>>
>> On 1/5/21 6:53 AM, Vikas Gupta wrote:
>>> On Tue, Dec 22, 2020 at 10:57 PM Auger Eric  wrote:
>>>>
>>>> Hi Vikas,
>>>>
>>>> On 12/14/20 6:45 PM, Vikas Gupta wrote:
>>>>> MSI support for platform devices.The MSI block
>>>>> is added as an extended IRQ which exports caps
>>>>> VFIO_IRQ_INFO_CAP_TYPE and VFIO_IRQ_INFO_CAP_MSI_DESCS.
>>>>>
>>>>> Signed-off-by: Vikas Gupta 
>>>>> ---
>>>>>  drivers/vfio/platform/vfio_platform_common.c  | 179 +++-
>>>>>  drivers/vfio/platform/vfio_platform_irq.c | 260 +-
>>>>>  drivers/vfio/platform/vfio_platform_private.h |  32 +++
>>>>>  include/uapi/linux/vfio.h |  44 +++
>>>>>  4 files changed, 496 insertions(+), 19 deletions(-)
>>>>>
>>>>> diff --git a/drivers/vfio/platform/vfio_platform_common.c 
>>>>> b/drivers/vfio/platform/vfio_platform_common.c
>>>>> index fb4b385191f2..c936852f35d7 100644
>>>>> --- a/drivers/vfio/platform/vfio_platform_common.c
>>>>> +++ b/drivers/vfio/platform/vfio_platform_common.c
>>>>> @@ -16,6 +16,7 @@
>>>>>  #include 
>>>>>  #include 
>>>>>  #include 
>>>>> +#include 
>>>>>
>>>>>  #include "vfio_platform_private.h"
>>>>>
>>>>> @@ -26,6 +27,8 @@
>>>>>  #define VFIO_PLATFORM_IS_ACPI(vdev) ((vdev)->acpihid != NULL)
>>>>>
>>>>>  static LIST_HEAD(reset_list);
>>>>> +/* devices having MSI support */
>>>> nit: for devices using MSIs?
>>>>> +static LIST_HEAD(msi_list);
>>>>>  static DEFINE_MUTEX(driver_lock);
>>>>>
>>>>>  static vfio_platform_reset_fn_t vfio_platform_lookup_reset(const char 
>>>>> *compat,
>>>>> @@ -47,6 +50,25 @@ static vfio_platform_reset_fn_t 
>>>>> vfio_platform_lookup_reset(const char *compat,
>>>>>   return reset_fn;
>>>>>  }
>>>>>
>>>>> +static bool vfio_platform_lookup_msi(struct vfio_platform_device *vdev)
>>>>> +{
>>>>> + bool has_msi = false;
>>>>> + struct vfio_platform_msi_node *iter;
>>>>> +
>>>>> + mutex_lock(&driver_lock);
>>>>> + list_for_each_entry(iter, &msi_list, link) {
>>>>> + if (!strcmp(iter->compat, vdev->compat) &&
>>>>> + try_module_get(iter->owner)) {
>>>>> + vdev->msi_module = iter->owner;
>>>>> + vdev->of_get_msi = iter->of_get_msi;
>>>>> + has_msi = true;
>>>>> + break;
>>>>> + }
>>>>> + }
>>>>> + mutex_unlock(&driver_lock);
>>>>> + return has_msi;
>>>>> +}
>>>>> +
>>>>>  static int vfio_platform_acpi_probe(struct vfio_platform_device *vdev,
>>>>>   struct device *dev)
>>>>>  {
>>>>> @@ -126,6 +148,19 @@ static int vfio_platform_get_reset(struct 
>>>>> vfio_platform_device *vdev)
>>>>>   return vdev->of_reset ? 0 : -ENOENT;
>>>>>  }
>>>>>
>>>>> +static int vfio_platform_get_msi(struct vfio_platform_device *vdev)
>>>>> +{
>>>>> + bool has_msi;
>>>>> +
>>>>> + has_msi = vfio_platform_lookup_msi(vdev);
>>>>> + if (!has_msi) {
>>>>> + request_module("vfio-msi:%s", vdev->compat);
>>>>> + has_msi = vfio_platform_lookup_msi(vdev);
>>>>> + }
>>>>> +
>>>>> + return has_msi ? 0 : -ENOENT;
>>>>> +}
>>>>> +
>>>>>  static void vfio_platform_put_reset(struct vfio_platform_device *vdev)
>>>>>  {
>>>>>   if (VFIO_PLATFORM_IS_ACPI(vdev))
>>>>> @@ -135,6 +170,12 @@ static void vfio_platform_put_reset(struct 
>>>>> vfio_platform_device *vdev)
>>>>>   module_put(vdev->reset_module);
>

Re: [RFC v3 2/2] vfio/platform: msi: add Broadcom platform devices

2021-01-15 Thread Auger Eric
Hi Vikas,
On 1/15/21 7:35 AM, Vikas Gupta wrote:
> Hi Eric,
> 
> On Tue, Jan 12, 2021 at 2:52 PM Auger Eric  wrote:
>>
>> Hi Vikas,
>>
>> On 12/14/20 6:45 PM, Vikas Gupta wrote:
>>> Add msi support for Broadcom platform devices
>>>
>>> Signed-off-by: Vikas Gupta 
>>> ---
>>>  drivers/vfio/platform/Kconfig |  1 +
>>>  drivers/vfio/platform/Makefile|  1 +
>>>  drivers/vfio/platform/msi/Kconfig |  9 
>>>  drivers/vfio/platform/msi/Makefile|  2 +
>>>  .../vfio/platform/msi/vfio_platform_bcmplt.c  | 49 +++
>>>  5 files changed, 62 insertions(+)
>>>  create mode 100644 drivers/vfio/platform/msi/Kconfig
>>>  create mode 100644 drivers/vfio/platform/msi/Makefile
>>>  create mode 100644 drivers/vfio/platform/msi/vfio_platform_bcmplt.c
>> what does plt mean?
> This (plt) is a generic name for the Broadcom platform devices we
> plan to add in this file. Currently we have only one in this file.
> Do you think this name does not sound good here?

we have the VFIO_PLATFORM_BCMFLEXRM_RESET config which also applies to
the vfio flex-rm platform device.

I think it would be more homogeneous to have VFIO_PLATFORM_BCMFLEXRM_MSI
in case we keep a separate msi module.

also in reset dir we have vfio_platform_bcmflexrm.c


>>>
>>> diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig
>>> index dc1a3c44f2c6..7b8696febe61 100644
>>> --- a/drivers/vfio/platform/Kconfig
>>> +++ b/drivers/vfio/platform/Kconfig
>>> @@ -21,3 +21,4 @@ config VFIO_AMBA
>>> If you don't know what to do here, say N.
>>>
>>>  source "drivers/vfio/platform/reset/Kconfig"
>>> +source "drivers/vfio/platform/msi/Kconfig"
>>> diff --git a/drivers/vfio/platform/Makefile b/drivers/vfio/platform/Makefile
>>> index 3f3a24e7c4ef..9ccdcdbf0e7e 100644
>>> --- a/drivers/vfio/platform/Makefile
>>> +++ b/drivers/vfio/platform/Makefile
>>> @@ -5,6 +5,7 @@ vfio-platform-y := vfio_platform.o
>>>  obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o
>>>  obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform-base.o
>>>  obj-$(CONFIG_VFIO_PLATFORM) += reset/
>>> +obj-$(CONFIG_VFIO_PLATFORM) += msi/
>>>
>>>  vfio-amba-y := vfio_amba.o
>>>
>>> diff --git a/drivers/vfio/platform/msi/Kconfig 
>>> b/drivers/vfio/platform/msi/Kconfig
>>> new file mode 100644
>>> index ..54d6b70e1e32
>>> --- /dev/null
>>> +++ b/drivers/vfio/platform/msi/Kconfig
>>> @@ -0,0 +1,9 @@
>>> +# SPDX-License-Identifier: GPL-2.0-only
>>> +config VFIO_PLATFORM_BCMPLT_MSI
>>> + tristate "MSI support for Broadcom platform devices"
>>> + depends on VFIO_PLATFORM && (ARCH_BCM_IPROC || COMPILE_TEST)
>>> + default ARCH_BCM_IPROC
>>> + help
>>> +   Enables the VFIO platform driver to handle msi for Broadcom devices
>>> +
>>> +   If you don't know what to do here, say N.
>>> diff --git a/drivers/vfio/platform/msi/Makefile 
>>> b/drivers/vfio/platform/msi/Makefile
>>> new file mode 100644
>>> index ..27422d45cecb
>>> --- /dev/null
>>> +++ b/drivers/vfio/platform/msi/Makefile
>>> @@ -0,0 +1,2 @@
>>> +# SPDX-License-Identifier: GPL-2.0
>>> +obj-$(CONFIG_VFIO_PLATFORM_BCMPLT_MSI) += vfio_platform_bcmplt.o
>>> diff --git a/drivers/vfio/platform/msi/vfio_platform_bcmplt.c 
>>> b/drivers/vfio/platform/msi/vfio_platform_bcmplt.c
>>> new file mode 100644
>>> index ..a074b5e92d77
>>> --- /dev/null
>>> +++ b/drivers/vfio/platform/msi/vfio_platform_bcmplt.c
>>> @@ -0,0 +1,49 @@
>>> +// SPDX-License-Identifier: GPL-2.0
>>> +/*
>>> + * Copyright 2020 Broadcom.
>>> + */
>>> +
>>> +#include 
>>> +#include 
>>> +#include 
>>> +#include 
>>> +#include 
>>> +
>>> +#include "../vfio_platform_private.h"
>>> +
>>> +#define RING_SIZE	(64 << 10)
>>> +
>>> +#define RING_MSI_ADDR_LS 0x03c
>>> +#define RING_MSI_ADDR_MS 0x040
>>> +#define RING_MSI_DATA_VALUE  0x064
>> Those 3 defines would not be needed anymore with that implementation option.
>>> +
>>> +static u32 bcm_num_msi(struct vfio_platform_device *vdev)
>>> +{
>>> + struct vfio_platform_region *reg = &vdev->regions[0];
>>

Re: [PATCH v13 07/15] iommu/smmuv3: Allow stage 1 invalidation with unmanaged ASIDs

2021-01-14 Thread Auger Eric
Hi Jean,

On 1/14/21 6:33 PM, Jean-Philippe Brucker wrote:
> Hi Eric,
> 
> On Thu, Jan 14, 2021 at 05:58:27PM +0100, Auger Eric wrote:
>>>>  The uacce-devel branches from
>>>>> https://github.com/Linaro/linux-kernel-uadk do provide this at the moment
>>>>> (they track the latest sva/zip-devel branch
>>>>> https://jpbrucker.net/git/linux/ which is roughly based on mainline.)
>> As I plan to respin shortly, please could you confirm the best branch to
>> rebase on still is that one (uacce-devel from the linux-kernel-uadk git
>> repo). Is it up to date? Commits seem to be quite old there.
> 
> Right I meant the uacce-devel-X branches. The uacce-devel-5.11 branch
> currently has the latest patches

OK thanks!

Eric
> 
> Thanks,
> Jean
> 



Re: [PATCH v13 07/15] iommu/smmuv3: Allow stage 1 invalidation with unmanaged ASIDs

2021-01-14 Thread Auger Eric
Hi Shameer, Jean-Philippe,

On 12/4/20 11:23 AM, Auger Eric wrote:
> Hi Shameer, Jean-Philippe,
> 
> On 12/4/20 11:20 AM, Shameerali Kolothum Thodi wrote:
>> Hi Jean,
>>
>>> -Original Message-
>>> From: Jean-Philippe Brucker [mailto:jean-phili...@linaro.org]
>>> Sent: 04 December 2020 09:54
>>> To: Shameerali Kolothum Thodi 
>>> Cc: Auger Eric ; wangxingang
>>> ; Xieyingtai ;
>>> k...@vger.kernel.org; m...@kernel.org; j...@8bytes.org; w...@kernel.org;
>>> io...@lists.linux-foundation.org; linux-kernel@vger.kernel.org;
>>> vivek.gau...@arm.com; alex.william...@redhat.com;
>>> zhangfei@linaro.org; robin.mur...@arm.com;
>>> kvm...@lists.cs.columbia.edu; eric.auger@gmail.com; Zengtao (B)
>>> ; qubingbing 
>>> Subject: Re: [PATCH v13 07/15] iommu/smmuv3: Allow stage 1 invalidation with
>>> unmanaged ASIDs
>>>
>>> Hi Shameer,
>>>
>>> On Thu, Dec 03, 2020 at 06:42:57PM +, Shameerali Kolothum Thodi wrote:
>>>> Hi Jean/zhangfei,
>>>> Is it possible to have a branch with minimum required SVA/UACCE related
>>> patches
>>>> that are already public and can be a "stable" candidate for future respin 
>>>> of
>>> Eric's series?
>>>> Please share your thoughts.
>>>
>>> By "stable" you mean a fixed branch with the latest SVA/UACCE patches
>>> based on mainline? 
>>
>> Yes. 
>>
>>  The uacce-devel branches from
>>> https://github.com/Linaro/linux-kernel-uadk do provide this at the moment
>>> (they track the latest sva/zip-devel branch
>>> https://jpbrucker.net/git/linux/ which is roughly based on mainline.)
As I plan to respin shortly, please could you confirm the best branch to
rebase on still is that one (uacce-devel from the linux-kernel-uadk git
repo). Is it up to date? Commits seem to be quite old there.

Thanks

Eric
>>
>> Thanks. 
>>
>> Hi Eric,
>>
>> Could you please take a look at the above branches and see whether it
>> makes sense to rebase on top of either of those?
>>
>> From vSVA point of view, it will be less rebase hassle if we can do that.
> 
> Sure. I will rebase on top of this ;-)
> 
> Thanks
> 
> Eric
>>
>> Thanks,
>> Shameer
>>
>>> Thanks,
>>> Jean
>>
> 
> ___
> iommu mailing list
> io...@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu
> 



[PATCH v2 5/9] KVM: arm: move has_run_once after the map_resources

2021-01-14 Thread Eric Auger
has_run_once is set to true at the beginning of
kvm_vcpu_first_run_init(). This generally is not an issue
except when exercising the code with KVM selftests. For instance,
if kvm_vgic_map_resources() fails due to erroneous user settings,
has_run_once is set and this prevents the test from continuing
execution. This patch moves the assignment after the
kvm_vgic_map_resources() call.

Signed-off-by: Eric Auger 

---

v1 -> v2:
- slight reword of the commit msg (for instance)
---
 arch/arm64/kvm/arm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 04c44853b103..580760e58980 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -573,8 +573,6 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
if (!kvm_arm_vcpu_is_finalized(vcpu))
return -EPERM;
 
-   vcpu->arch.has_run_once = true;
-
if (likely(irqchip_in_kernel(kvm))) {
/*
 * Map the VGIC hardware resources before running a vcpu the
@@ -591,6 +589,8 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
		static_branch_inc(&userspace_irqchip_in_use);
}
 
+   vcpu->arch.has_run_once = true;
+
ret = kvm_timer_enable(vcpu);
if (ret)
return ret;
-- 
2.21.3



[PATCH v2 8/9] KVM: arm64: vgic-v3: Expose GICR_TYPER.Last for userspace

2021-01-14 Thread Eric Auger
Commit 23bde34771f1 ("KVM: arm64: vgic-v3: Drop the
reporting of GICR_TYPER.Last for userspace") temporarily fixed
a bug identified when attempting to access the GICR_TYPER
register before the redistributor region setting but dropped
the support of the LAST bit. Emulating the GICR_TYPER.Last bit
still makes sense for architecture compliance.

This patch restores its support (if the redistributor region
was set) while keeping the code safe.

Signed-off-by: Eric Auger 
---
 arch/arm64/kvm/vgic/vgic-mmio-v3.c | 7 ++-
 include/kvm/arm_vgic.h | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c 
b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index 987e366c8000..7ff23c153128 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -277,6 +277,8 @@ static unsigned long vgic_uaccess_read_v3r_typer(struct 
kvm_vcpu *vcpu,
 gpa_t addr, unsigned int len)
 {
unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+   struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+   struct vgic_redist_region *rdreg = vgic_cpu->rdreg;
int target_vcpu_id = vcpu->vcpu_id;
u64 value;
 
@@ -286,7 +288,9 @@ static unsigned long vgic_uaccess_read_v3r_typer(struct 
kvm_vcpu *vcpu,
if (vgic_has_its(vcpu->kvm))
value |= GICR_TYPER_PLPIS;
 
-   /* reporting of the Last bit is not supported for userspace */
+   if (rdreg && (vgic_cpu->rdreg_index == (rdreg->free_index - 1)))
+   value |= GICR_TYPER_LAST;
+
return extract_bytes(value, addr & 7, len);
 }
 
@@ -714,6 +718,7 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
return -EINVAL;
 
vgic_cpu->rdreg = rdreg;
+   vgic_cpu->rdreg_index = rdreg->free_index;
 
rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE;
 
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 3d74f1060bd1..ec621180ef09 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -322,6 +322,7 @@ struct vgic_cpu {
 */
struct vgic_io_device   rd_iodev;
struct vgic_redist_region *rdreg;
+   u32 rdreg_index;
 
/* Contains the attributes and gpa of the LPI pending tables. */
u64 pendbaser;
-- 
2.21.3



[PATCH v2 9/9] KVM: selftests: aarch64/vgic-v3 init sequence tests

2021-01-14 Thread Eric Auger
The tests exercise the VGIC_V3 device creation including the
associated KVM_DEV_ARM_VGIC_GRP_ADDR group attributes:

- KVM_VGIC_V3_ADDR_TYPE_DIST/REDIST (vcpu_first and vgic_first)
- KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION (redist_regions).

Another test is dedicated to the KVM_DEV_ARM_VGIC_GRP_REDIST_REGS group
and especially the GICR_TYPER read. The goal was to test the case
recently fixed by commit 23bde34771f1
("KVM: arm64: vgic-v3: Drop the reporting of GICR_TYPER.Last for userspace").

The API under test can be found at
Documentation/virt/kvm/devices/arm-vgic-v3.rst

Signed-off-by: Eric Auger 
---
 tools/testing/selftests/kvm/Makefile  |   1 +
 .../testing/selftests/kvm/aarch64/vgic_init.c | 453 ++
 .../testing/selftests/kvm/include/kvm_util.h  |   5 +
 tools/testing/selftests/kvm/lib/kvm_util.c|  51 ++
 4 files changed, 510 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/aarch64/vgic_init.c

diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index fe41c6a0fa67..c3c2cb6fe8a1 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -68,6 +68,7 @@ TEST_GEN_PROGS_x86_64 += steal_time
 
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve
+TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
 TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c 
b/tools/testing/selftests/kvm/aarch64/vgic_init.c
new file mode 100644
index ..e8caa64c0395
--- /dev/null
+++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c
@@ -0,0 +1,453 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * vgic init sequence tests
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define NR_VCPUS   4
+
+#define REDIST_REGION_ATTR_ADDR(count, base, flags, index) (((uint64_t)(count) 
<< 52) | \
+   ((uint64_t)((base) >> 16) << 16) | ((uint64_t)(flags) << 12) | index)
+#define REG_OFFSET(vcpu, offset) (((uint64_t)vcpu << 32) | offset)
+
+#define GICR_TYPER 0x8
+
+static int access_redist_reg(int gicv3_fd, int vcpu, int offset,
+uint32_t *val, bool write)
+{
+   uint64_t attr = REG_OFFSET(vcpu, offset);
+
+   return kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS,
+attr, val, write);
+}
+
+static void guest_code(int cpu)
+{
+   GUEST_SYNC(0);
+   GUEST_SYNC(1);
+   GUEST_SYNC(2);
+   GUEST_DONE();
+}
+
+static int run_vcpu(struct kvm_vm *vm, uint32_t vcpuid)
+{
+   static int run;
+   struct ucall uc;
+   int ret;
+
+   vcpu_args_set(vm, vcpuid, 1, vcpuid);
+   ret = _vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL);
+   get_ucall(vm, vcpuid, &uc);
+   run++;
+
+   if (ret)
+   return -errno;
+   return 0;
+}
+
+int dist_rdist_tests(struct kvm_vm *vm)
+{
+   int ret, gicv3_fd, max_ipa_bits;
+   uint64_t addr;
+
+   max_ipa_bits = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
+
+   ret = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, true);
+   if (ret) {
+   print_skip("GICv3 not supported");
+   exit(KSFT_SKIP);
+   }
+
+   ret = kvm_create_device(vm, 0, true);
+   TEST_ASSERT(ret == -ENODEV, "unsupported device");
+
+   /* Create the device */
+
+   gicv3_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+   TEST_ASSERT(gicv3_fd > 0, "GICv3 device created");
+
+   /* Check attributes */
+
+   ret = kvm_device_check_attr(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+   KVM_VGIC_V3_ADDR_TYPE_DIST);
+   TEST_ASSERT(!ret, "KVM_DEV_ARM_VGIC_GRP_ADDR/KVM_VGIC_V3_ADDR_TYPE_DIST 
supported");
+
+   ret = kvm_device_check_attr(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+   KVM_VGIC_V3_ADDR_TYPE_REDIST);
+   TEST_ASSERT(!ret, 
"KVM_DEV_ARM_VGIC_GRP_ADDR/KVM_VGIC_V3_ADDR_TYPE_REDIST supported");
+
+   ret = kvm_device_check_attr(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 0);
+   TEST_ASSERT(ret == -ENXIO, "attribute not supported");
+
+   /* misaligned DIST and REDIST addresses */
+
+   addr = 0x1000;
+   ret = kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+   KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true);
+   TEST_ASSERT(ret == -EINVAL, "GICv3 dist base not 64kB aligned");
+
+   ret = kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+   KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+   TEST_ASSERT(ret == -EINVAL, "GICv3 redist base not 64kB aligned");

[PATCH v2 7/9] KVM: arm64: Simplify argument passing to vgic_uaccess_[read|write]

2021-01-14 Thread Eric Auger
vgic_uaccess() takes a struct vgic_io_device argument, converts it
to a struct kvm_io_device and passes it to the read/write accessor
functions, which convert it back to a struct vgic_io_device.
Avoid the indirection by passing the struct vgic_io_device argument
directly to vgic_uaccess_{read,write}.

Signed-off-by: Eric Auger 

---

v1 -> v2:
- reworded the commit message as suggested by Alexandru
---
 arch/arm64/kvm/vgic/vgic-mmio.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
index b2d73fc0d1ef..48c6067fc5ec 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio.c
@@ -938,10 +938,9 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct 
vgic_io_device *iodev,
return region;
 }
 
-static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct vgic_io_device 
*iodev,
 gpa_t addr, u32 *val)
 {
-   struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
const struct vgic_register_region *region;
struct kvm_vcpu *r_vcpu;
 
@@ -960,10 +959,9 @@ static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct 
kvm_io_device *dev,
return 0;
 }
 
-static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct vgic_io_device 
*iodev,
  gpa_t addr, const u32 *val)
 {
-   struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
const struct vgic_register_region *region;
struct kvm_vcpu *r_vcpu;
 
@@ -986,9 +984,9 @@ int vgic_uaccess(struct kvm_vcpu *vcpu, struct 
vgic_io_device *dev,
 bool is_write, int offset, u32 *val)
 {
if (is_write)
-   return vgic_uaccess_write(vcpu, >dev, offset, val);
+   return vgic_uaccess_write(vcpu, dev, offset, val);
else
-   return vgic_uaccess_read(vcpu, >dev, offset, val);
+   return vgic_uaccess_read(vcpu, dev, offset, val);
 }
 
 static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
-- 
2.21.3



[PATCH v2 6/9] docs: kvm: devices/arm-vgic-v3: enhance KVM_DEV_ARM_VGIC_CTRL_INIT doc

2021-01-14 Thread Eric Auger
kvm_arch_vcpu_precreate() returns -EBUSY if the vgic is
already initialized. So let's document that KVM_DEV_ARM_VGIC_CTRL_INIT
must be called after all vcpu creations.

Signed-off-by: Eric Auger 

---

v1 -> v2:
- Must be called after all vcpu creations ->
  Must be called after all VCPUs have been created as per
  Alexandru's suggestion
---
 Documentation/virt/kvm/devices/arm-vgic-v3.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/devices/arm-vgic-v3.rst 
b/Documentation/virt/kvm/devices/arm-vgic-v3.rst
index 5dd3bff51978..51e5e5762571 100644
--- a/Documentation/virt/kvm/devices/arm-vgic-v3.rst
+++ b/Documentation/virt/kvm/devices/arm-vgic-v3.rst
@@ -228,7 +228,7 @@ Groups:
 
 KVM_DEV_ARM_VGIC_CTRL_INIT
   request the initialization of the VGIC, no additional parameter in
-  kvm_device_attr.addr.
+  kvm_device_attr.addr. Must be called after all VCPUs have been created.
 KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES
   save all LPI pending bits into guest RAM pending tables.
 
-- 
2.21.3



[PATCH v2 3/9] KVM: arm64: vgic-v3: Fix error handling in vgic_v3_set_redist_base()

2021-01-14 Thread Eric Auger
vgic_v3_insert_redist_region() may succeed while
vgic_register_all_redist_iodevs() fails. For example, this happens
when adding a redistributor region that overlaps a dist region. The
failure is only detected in vgic_register_all_redist_iodevs(), when
vgic_v3_check_base() gets called in vgic_register_redist_iodev().

In such a case, remove the newly added redistributor region and free
it.

Signed-off-by: Eric Auger 

---

v1 -> v2:
- fix the commit message and split declaration/assignment of rdreg
---
 arch/arm64/kvm/vgic/vgic-mmio-v3.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c 
b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index 013b737b658f..987e366c8000 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -860,8 +860,14 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, 
u64 addr, u32 count)
 * afterwards will register the iodevs when needed.
 */
ret = vgic_register_all_redist_iodevs(kvm);
-   if (ret)
+   if (ret) {
+   struct vgic_redist_region *rdreg;
+
+   rdreg = vgic_v3_rdist_region_from_index(kvm, index);
+   list_del(&rdreg->list);
+   kfree(rdreg);
return ret;
+   }
 
return 0;
 }
-- 
2.21.3



[PATCH v2 1/9] KVM: arm64: vgic-v3: Fix some error codes when setting RDIST base

2021-01-14 Thread Eric Auger
KVM_DEV_ARM_VGIC_GRP_ADDR group doc says we should return
-EEXIST in case the base address of the redist is already set.
We currently return -EINVAL.

However we need to return -EINVAL in case a legacy REDIST address
is attempted to be set while REDIST_REGIONS were set. This case
is discriminated by looking at the count field.

Signed-off-by: Eric Auger 

---

v1 -> v2:
- simplify the check sequence
---
 arch/arm64/kvm/vgic/vgic-mmio-v3.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c 
b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index 15a6c98ee92f..013b737b658f 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -791,10 +791,6 @@ static int vgic_v3_insert_redist_region(struct kvm *kvm, 
uint32_t index,
size_t size = count * KVM_VGIC_V3_REDIST_SIZE;
int ret;
 
-   /* single rdist region already set ?*/
-   if (!count && !list_empty(rd_regions))
-   return -EINVAL;
-
/* cross the end of memory ? */
if (base + size < base)
return -EINVAL;
@@ -805,11 +801,14 @@ static int vgic_v3_insert_redist_region(struct kvm *kvm, 
uint32_t index,
} else {
rdreg = list_last_entry(rd_regions,
struct vgic_redist_region, list);
-   if (index != rdreg->index + 1)
-   return -EINVAL;
 
-   /* Cannot add an explicitly sized regions after legacy region */
-   if (!rdreg->count)
+   if ((!count) != (!rdreg->count))
+   return -EINVAL; /* Mix REDIST and REDIST_REGION */
+
+   if (!count)
+   return -EEXIST;
+
+   if (index != rdreg->index + 1)
return -EINVAL;
}
 
-- 
2.21.3



[PATCH v2 2/9] KVM: arm64: Fix KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION read

2021-01-14 Thread Eric Auger
The doc says:
"The characteristics of a specific redistributor region can
 be read by presetting the index field in the attr data.
 Only valid for KVM_DEV_TYPE_ARM_VGIC_V3"

Unfortunately the existing code fails to read the input attr data.

Fixes: 04c110932225 ("KVM: arm/arm64: Implement 
KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION")
Cc: sta...@vger.kernel.org#v4.17+
Signed-off-by: Eric Auger 
Reviewed-by: Alexandru Elisei 

---

v1 -> v2:
- in the commit message, remove the statement that the index always is 0
- add Alexandru's R-b
---
 arch/arm64/kvm/vgic/vgic-kvm-device.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c 
b/arch/arm64/kvm/vgic/vgic-kvm-device.c
index 44419679f91a..2f66cf247282 100644
--- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
+++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
@@ -226,6 +226,9 @@ static int vgic_get_common_attr(struct kvm_device *dev,
u64 addr;
unsigned long type = (unsigned long)attr->attr;
 
+   if (copy_from_user(&addr, uaddr, sizeof(addr)))
+   return -EFAULT;
+
r = kvm_vgic_addr(dev->kvm, type, , false);
if (r)
return (r == -ENODEV) ? -ENXIO : r;
-- 
2.21.3



[PATCH v2 4/9] KVM: arm/arm64: vgic: Reset base address on kvm_vgic_dist_destroy()

2021-01-14 Thread Eric Auger
On vgic_dist_destroy(), the base addresses are not reset. However,
for KVM selftest purposes, resetting them allows test execution to
continue even after a failure when running KVM_RUN. So let's reset
the base addresses.

Signed-off-by: Eric Auger 

---

v1 -> v2:
- use dist-> in the else and add braces
---
 arch/arm64/kvm/vgic/vgic-init.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 052917deb149..cf6faa0aeddb 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -335,13 +335,16 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
kfree(dist->spis);
dist->spis = NULL;
dist->nr_spis = 0;
+   dist->vgic_dist_base = VGIC_ADDR_UNDEF;
 
-   if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+   if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
		list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) {
list_del(>list);
kfree(rdreg);
}
		INIT_LIST_HEAD(&dist->rd_regions);
+   } else {
+   dist->vgic_cpu_base = VGIC_ADDR_UNDEF;
}
 
if (vgic_has_its(kvm))
@@ -362,6 +365,7 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
vgic_flush_pending_lpis(vcpu);
 
	INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+   vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
 }
 
 /* To be called with kvm->lock held */
-- 
2.21.3



[PATCH v2 0/9] KVM/ARM: Some vgic fixes and init sequence KVM selftests

2021-01-14 Thread Eric Auger
While writing vgic v3 init sequence KVM selftests I noticed some
relatively minor issues. This was also the opportunity to try to
fix the issue later reported by Zenghui, related to the RDIST_TYPER
last bit emulation. The final patch is a first batch of VGIC init
sequence selftests. Of course they can be augmented with a lot more
register access tests, but let's try to move forward incrementally ...

Best Regards

Eric

This series can be found at:
https://github.com/eauger/linux/tree/vgic_kvmselftests_v2

History:
- Took into account all comments from Marc and Alexandru, except that
  has_run_once is still set after the map_resources (changing this would
  oblige me to revisit the selftests in depth)

Eric Auger (9):
  KVM: arm64: vgic-v3: Fix some error codes when setting RDIST base
  KVM: arm64: Fix KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION read
  KVM: arm64: vgic-v3: Fix error handling in vgic_v3_set_redist_base()
  KVM: arm/arm64: vgic: Reset base address on kvm_vgic_dist_destroy()
  KVM: arm: move has_run_once after the map_resources
  docs: kvm: devices/arm-vgic-v3: enhance KVM_DEV_ARM_VGIC_CTRL_INIT doc
  KVM: arm64: Simplify argument passing to vgic_uaccess_[read|write]
  KVM: arm64: vgic-v3: Expose GICR_TYPER.Last for userspace
  KVM: selftests: aarch64/vgic-v3 init sequence tests

 .../virt/kvm/devices/arm-vgic-v3.rst  |   2 +-
 arch/arm64/kvm/arm.c  |   4 +-
 arch/arm64/kvm/vgic/vgic-init.c   |   6 +-
 arch/arm64/kvm/vgic/vgic-kvm-device.c |   3 +
 arch/arm64/kvm/vgic/vgic-mmio-v3.c|  30 +-
 arch/arm64/kvm/vgic/vgic-mmio.c   |  10 +-
 include/kvm/arm_vgic.h|   1 +
 tools/testing/selftests/kvm/Makefile  |   1 +
 .../testing/selftests/kvm/aarch64/vgic_init.c | 453 ++
 .../testing/selftests/kvm/include/kvm_util.h  |   5 +
 tools/testing/selftests/kvm/lib/kvm_util.c|  51 ++
 11 files changed, 546 insertions(+), 20 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/aarch64/vgic_init.c

-- 
2.21.3



Re: [PATCH 8/9] KVM: arm64: vgic-v3: Expose GICR_TYPER.Last for userspace

2021-01-14 Thread Auger Eric
Hi Alexandru,

On 1/12/21 6:02 PM, Alexandru Elisei wrote:
> Hi Eric,
> 
> On 12/12/20 6:50 PM, Eric Auger wrote:
>> Commit 23bde34771f1 ("KVM: arm64: vgic-v3: Drop the
>> reporting of GICR_TYPER.Last for userspace") temporarily fixed
>> a bug identified when attempting to access the GICR_TYPER
>> register before the redistributor region setting but dropped
>> the support of the LAST bit. This patch restores its
>> support (if the redistributor region was set) while keeping the
>> code safe.
> 
> I suppose the reason for emulating GICR_TYPER.Last is for architecture 
> compliance,
> right? I think that should be in the commit message.
OK added this in the commit msg.
> 
>>
>> Signed-off-by: Eric Auger 
>> ---
>>  arch/arm64/kvm/vgic/vgic-mmio-v3.c | 7 ++-
>>  include/kvm/arm_vgic.h | 1 +
>>  2 files changed, 7 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c 
>> b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> index 581f0f49..2f9ef6058f6e 100644
>> --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> @@ -277,6 +277,8 @@ static unsigned long vgic_uaccess_read_v3r_typer(struct 
>> kvm_vcpu *vcpu,
>>   gpa_t addr, unsigned int len)
>>  {
>>  unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
>> +struct vgic_cpu *vgic_cpu = >arch.vgic_cpu;
>> +struct vgic_redist_region *rdreg = vgic_cpu->rdreg;
>>  int target_vcpu_id = vcpu->vcpu_id;
>>  u64 value;
>>  
>> @@ -286,7 +288,9 @@ static unsigned long vgic_uaccess_read_v3r_typer(struct 
>> kvm_vcpu *vcpu,
>>  if (vgic_has_its(vcpu->kvm))
>>  value |= GICR_TYPER_PLPIS;
>>  
>> -/* reporting of the Last bit is not supported for userspace */
>> +if (rdreg && (vgic_cpu->rdreg_index == (rdreg->free_index - 1)))
>> +value |= GICR_TYPER_LAST;
>> +
>>  return extract_bytes(value, addr & 7, len);
>>  }
>>  
>> @@ -714,6 +718,7 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
>>  return -EINVAL;
>>  
>>  vgic_cpu->rdreg = rdreg;
>> +vgic_cpu->rdreg_index = rdreg->free_index;
> 
> What happens if the next redistributor region we register has the base address
> adjacent to this one?
> 
> I'm really not familiar with the code, but is it not possible to create two
> Redistributor regions (via
> KVM_DEV_ARM_VGIC_GRP_ADDR(KVM_VGIC_V3_ADDR_TYPE_REDIST)) where the second
> Redistributor region start address is immediately after the last 
> Redistributor in
> the preceding region?
KVM_VGIC_V3_ADDR_TYPE_REDIST only allows creating a single rdist
region. Only KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION allows registering
several of them.

With KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, it is possible to register
adjacent rdist regions. vgic_v3_rdist_free_slot() returns the first
rdist region where enough space remains for inserting the new
redistributor, and we put the rdist at that region's free index
(rough sketch below).
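
A rough sketch of that slot-selection logic, shown as an assumed shape
for illustration rather than the exact kernel implementation
(rdist_free_slot() is a made-up name):

	struct vgic_redist_region *rdist_free_slot(struct list_head *rd_regions)
	{
		struct vgic_redist_region *rdreg;

		/* Walk regions in registration order; a region is full once
		 * free_index reaches its fixed count (count == 0 being the
		 * legacy, implicitly sized region).
		 */
		list_for_each_entry(rdreg, rd_regions, list) {
			if (!rdreg->count || rdreg->free_index < rdreg->count)
				return rdreg;
		}
		return NULL;	/* no room left in any registered region */
	}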

But maybe I misunderstood your question?

Thanks

Eric
> 
> Thanks,
> Alex
>>  
>>  rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE;
>>  
>> diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
>> index a8d8fdcd3723..596c069263a7 100644
>> --- a/include/kvm/arm_vgic.h
>> +++ b/include/kvm/arm_vgic.h
>> @@ -322,6 +322,7 @@ struct vgic_cpu {
>>   */
>>  struct vgic_io_device   rd_iodev;
>>  struct vgic_redist_region *rdreg;
>> +u32 rdreg_index;
>>  
>>  /* Contains the attributes and gpa of the LPI pending tables. */
>>  u64 pendbaser;
> 



Re: [PATCH 5/9] KVM: arm: move has_run_once after the map_resources

2021-01-14 Thread Auger Eric
Hi Alexandru,

On 1/12/21 3:55 PM, Alexandru Elisei wrote:
> Hi Eric,
> 
> On 12/12/20 6:50 PM, Eric Auger wrote:
>> has_run_once is set to true at the beginning of
>> kvm_vcpu_first_run_init(). This generally is not an issue
>> except when exercising the code with KVM selftests. Indeed,
>> if kvm_vgic_map_resources() fails due to erroneous user settings,
>> has_run_once is set and this prevents from continuing
>> executing the test. This patch moves the assignment after the
>> kvm_vgic_map_resources().
>>
>> Signed-off-by: Eric Auger 
>> ---
>>  arch/arm64/kvm/arm.c | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
>> index c0ffb019ca8b..331fae6bff31 100644
>> --- a/arch/arm64/kvm/arm.c
>> +++ b/arch/arm64/kvm/arm.c
>> @@ -540,8 +540,6 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
>>  if (!kvm_arm_vcpu_is_finalized(vcpu))
>>  return -EPERM;
>>  
>> -vcpu->arch.has_run_once = true;
>> -
>>  if (likely(irqchip_in_kernel(kvm))) {
>>  /*
>>   * Map the VGIC hardware resources before running a vcpu the
>> @@ -560,6 +558,8 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
>>  static_branch_inc(_irqchip_in_use);
>>  }
>>  
>> +vcpu->arch.has_run_once = true;
> 
> I have a few concerns regarding this:
> 
> 1. Moving has_run_once = true here seems very arbitrary to me - 
> kvm_timer_enable()
> and kvm_arm_pmu_v3_enable(), below it, can both fail because of erroneous user
> values. If there's a reason why the assignment cannot be moved at the end of 
> the
> function, I think it should be clearly stated in a comment for the people who
> might be tempted to write similar tests for the timer or pmu.

Setting has_run_once = true at the entry of the function looks even
more arbitrary to me. I agree with you that eventually has_run_once may
be moved to the very end, but maybe this can be done later once the
timer and pmu tests have been written.
> 
> 2. There are many ways that kvm_vgic_map_resources() can fail, other than
> incorrect user settings. I started digging into how
> kvm_vgic_map_resources()->vgic_v2_map_resources() can fail for a VGIC V2 and 
> this
> is what I managed to find before I gave up:
> 
> * vgic_init() can fail in:
>     - kvm_vgic_dist_init()
>     - vgic_v3_init()
>     - kvm_vgic_setup_default_irq_routing()
> * vgic_register_dist_iodev() can fail in:
>     - vgic_v3_init_dist_iodev()
>     - kvm_io_bus_register_dev()(*)
> * kvm_phys_addr_ioremap() can fail in:
>     - kvm_mmu_topup_memory_cache()
>     - kvm_pgtable_stage2_map()

I changed the commit msg so that "incorrect user settings" reads as an
example.
> 
> So if any of the functions below fail, are we 100% sure it is safe to allow 
> the
> user to execute kvm_vgic_map_resources() again?

I think additional tests will confirm this. However, at the moment,
moving the assignment, which does not look wrong to me, allows the
tests to be greatly simplified, so I would tend to say that it is
worth it.
> 
> (*) It looks to me like kvm_io_bus_register_dev() doesn't take into account a
> caller that tries to register the same device address range and it will create
> another identical range. Is this intentional? Is it a bug that should be 
> fixed? Or
> am I misunderstanding the function?

doesn't kvm_io_bus_cmp() do the check?

Thanks

Eric
> 
> Thanks,
> Alex
>> +
>>  ret = kvm_timer_enable(vcpu);
>>  if (ret)
>>  return ret;
> 



Re: [PATCH 1/9] KVM: arm64: vgic-v3: Fix some error codes when setting RDIST base

2021-01-14 Thread Auger Eric
Hi Alexandru,

On 1/6/21 5:32 PM, Alexandru Elisei wrote:
> Hi Eric,
> 
> On 12/12/20 6:50 PM, Eric Auger wrote:
>> KVM_DEV_ARM_VGIC_GRP_ADDR group doc says we should return
>> -EEXIST in case the base address of the redist is already set.
>> We currently return -EINVAL.
>>
>> However we need to return -EINVAL in case a legacy REDIST address
>> is attempted to be set while REDIST_REGIONS were set. This case
>> is discriminated by looking at the count field.
>>
>> Signed-off-by: Eric Auger 
>> ---
>>  arch/arm64/kvm/vgic/vgic-mmio-v3.c | 9 +++--
>>  1 file changed, 7 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c 
>> b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> index 15a6c98ee92f..8e8a862def76 100644
>> --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> @@ -792,8 +792,13 @@ static int vgic_v3_insert_redist_region(struct kvm 
>> *kvm, uint32_t index,
>>  int ret;
>>  
>>  /* single rdist region already set ?*/
>> -if (!count && !list_empty(rd_regions))
>> -return -EINVAL;
>> +if (!count && !list_empty(rd_regions)) {
>> +rdreg = list_last_entry(rd_regions,
>> +   struct vgic_redist_region, list);
>> +if (rdreg->count)
>> +return -EINVAL; /* Mixing REDIST and REDIST_REGION API 
>> */
>> +return -EEXIST;
>> +}
> 
> A few instructions below:
> 
>     if (list_empty(rd_regions)) {
>         [..]
>     } else {
>         rdreg = list_last_entry(rd_regions,
>                     struct vgic_redist_region, list);
>         [..]
> 
>         /* Cannot add an explicitly sized regions after legacy region */
>         if (!rdreg->count)
>             return -EINVAL;
>     }
> 
> Isn't this testing for the same thing, but using the opposite condition? Or 
> am I
> misunderstanding the code (quite likely)?
The 1st test sequence handles the case where the legacy
KVM_VGIC_V3_ADDR_TYPE_REDIST is used (!count), while the second handles
the case where the REDIST_REGION API is used. Nevertheless, I think
this can be simplified into:

if (list_empty(rd_regions)) {
	if (index != 0)
		return -EINVAL;
} else {
	rdreg = list_last_entry(rd_regions,
				struct vgic_redist_region, list);

	if ((!count) != (!rdreg->count))
		return -EINVAL; /* Mix REDIST and REDIST_REGION */

	if (!count)
		return -EEXIST;

	if (index != rdreg->index + 1)
		return -EINVAL;
}
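
(Note the (!count) != (!rdreg->count) test above: it rejects mixing the
REDIST and REDIST_REGION APIs in either direction with a single
comparison.)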

> 
> Looks to me like 
> KVM_DEV_ARM_VGIC_GRP_ADDR(KVM_VGIC_V3_ADDR_TYPE_REDIST{,_REGION})
> used to return -EEXIST (from vgic_check_ioaddr()) before commit ccc27bf5be7b7
> ("KVM: arm/arm64: Helper to register a new redistributor region") which added 
> the
> vgic_v3_insert_redist_region() function, so bringing back the -EEXIST return 
> code
> looks the right thing to me.

OK thank you for the detailed study.

Eric
> 
> Thanks,
> Alex
>>  
>>  /* cross the end of memory ? */
>>  if (base + size < base)
> 



Re: [PATCH v4] certs: Add EFI_CERT_X509_GUID support for dbx entries

2021-01-13 Thread Eric Snowberg


> On Jan 13, 2021, at 1:41 PM, Jarkko Sakkinen 
>  wrote:
> 
> On Tue, Jan 12, 2021 at 02:57:39PM +, David Howells wrote:
>> Eric Snowberg  wrote:
>> 
>>>> On Dec 10, 2020, at 2:49 AM, David Howells  wrote:
>>>> 
>>>> Eric Snowberg  wrote:
>>>> 
>>>>> Add support for EFI_CERT_X509_GUID dbx entries. When a EFI_CERT_X509_GUID
>>>>> is found, it is added as an asymmetrical key to the .blacklist keyring.
>>>>> Anytime the .platform keyring is used, the keys in the .blacklist keyring
>>>>> are referenced, if a matching key is found, the key will be rejected.
>>>> 
>>>> Ummm...  Why this way and not as a blacklist key which takes up less space?
>>>> I'm guessing that you're using the key chain matching logic.  We really 
>>>> only
>>>> need to blacklist the key IDs.
>>> 
>>> I implemented it this way so that certs in the dbx would only impact 
>>> the .platform keyring. I was under the impression we didn’t want to have 
>>> Secure Boot UEFI db/dbx certs dictate keyring functionality within the 
>>> kernel
>>> itself. Meaning if we have a matching dbx cert in any other keyring 
>>> (builtin,
>>> secondary, ima, etc.), it would be allowed. If that is not how you’d like 
>>> to 
>>> see it done, let me know and I’ll make the change.
>> 
>> I wonder if that is that the right thing to do.  I guess this is a policy
>> decision and may depend on the particular user.
> 
> Why would you want to allow dbx entry in any keyring?

Today, DB and MOK certs go into the platform keyring.  These certs are only
referenced during kexec.  They can’t be used for other things like validating
kernel module signatures.  If we follow the same pattern, the DBX and MOKX
entries in the blacklist keyring should only impact kexec.

Currently, Mickaël Salaün has another outstanding series to allow root to
update the blacklist keyring.  I assume the use case for this is around
certificates used within the kernel, for example revoking kernel module
signatures.  The question I have is, should another keyring be introduced?
One that carries DBX and MOKX, which just correspond to certs/hashes in the
platform keyring; this keyring would only be referenced for kexec, just like
the platform keyring is today.  Then, the current blacklist keyring would be
used for everything internal to the kernel.



Re: [PATCH] tcp: fix TCP_USER_TIMEOUT with zero window

2021-01-13 Thread Eric Dumazet
On Wed, Jan 13, 2021 at 9:12 PM Enke Chen  wrote:
>
> From: Enke Chen 
>
> The TCP session does not terminate with TCP_USER_TIMEOUT when data
> remain untransmitted due to zero window.
>
> The number of unanswered zero-window probes (tcp_probes_out) is
> reset to zero with incoming acks irrespective of the window size,
> as described in tcp_probe_timer():
>
> RFC 1122 4.2.2.17 requires the sender to stay open indefinitely
> as long as the receiver continues to respond probes. We support
> this by default and reset icsk_probes_out with incoming ACKs.
>
> This counter, however, is the wrong one to be used in calculating the
> duration that the window remains closed and data remain untransmitted.
> Thanks to Jonathan Maxwell  for diagnosing the
> actual issue.
>
> In this patch a separate counter is introduced to track the number of
> zero-window probes that are not answered with any non-zero window ack.
> This new counter is used in determining when to abort the session with
> TCP_USER_TIMEOUT.
>

I think one possible issue would be that local congestion (full qdisc)
would abort early,
because tcp_model_timeout() assumes linear backoff.

Neal or Yuchung can further comment on that, it is late for me in France.

packetdrill test would be :

   0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
   +0 bind(3, ..., ...) = 0
   +0 listen(3, 1) = 0


   +0 < S 0:0(0) win 0 
   +0 > S. 0:0(0) ack 1 

  +.1 < . 1:1(0) ack 1 win 65530
   +0 accept(3, ..., ...) = 4

   +0 setsockopt(4, SOL_TCP, TCP_USER_TIMEOUT, [3000], 4) = 0
   +0 write(4, ..., 24) = 24
   +0 > P. 1:25(24) ack 1
   +.1 < . 1:1(0) ack 25 win 65530
   +0 %{ assert tcpi_probes == 0, tcpi_probes; \
 assert tcpi_backoff == 0, tcpi_backoff }%

// install a qdisc dropping all packets
   +0 `tc qdisc delete dev tun0 root 2>/dev/null ; tc qdisc add dev
tun0 root pfifo limit 0`
   +0 write(4, ..., 24) = 24
   // When qdisc is congested we retry every 500ms therefore in theory
   // we'd retry 6 times before hitting 3s timeout. However, since we
   // estimate the elapsed time based on exp backoff of actual RTO (300ms),
   // we'd bail earlier with only 3 probes.
   +2.1 write(4, ..., 24) = -1
   +0 %{ assert tcpi_probes == 3, tcpi_probes; \
 assert tcpi_backoff == 0, tcpi_backoff }%
   +0 close(4) = 0

> Cc: sta...@vger.kernel.org
> Fixes: 9721e709fa68 ("tcp: simplify window probe aborting on USER_TIMEOUT")
> Reported-by: William McCall 
> Signed-off-by: Enke Chen 
> ---
>  include/linux/tcp.h   | 5 +
>  net/ipv4/tcp.c| 1 +
>  net/ipv4/tcp_input.c  | 3 ++-
>  net/ipv4/tcp_output.c | 2 ++
>  net/ipv4/tcp_timer.c  | 5 +++--
>  5 files changed, 13 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index 2f87377e9af7..c9415b30fa67 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -352,6 +352,11 @@ struct tcp_sock {
>
> int linger2;
>
> +   /* While icsk_probes_out is for unanswered 0 window probes, this
> +* counter is for 0-window probes that are not answered with any
> +* non-zero window (nzw) acks.
> +*/
> +   u8  probes_nzw;
>
>  /* Sock_ops bpf program related variables */
>  #ifdef CONFIG_BPF
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index ed42d2193c5c..af6a41a5a5ac 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -2940,6 +2940,7 @@ int tcp_disconnect(struct sock *sk, int flags)
> icsk->icsk_rto = TCP_TIMEOUT_INIT;
> icsk->icsk_rto_min = TCP_RTO_MIN;
> icsk->icsk_delack_max = TCP_DELACK_MAX;
> +   tp->probes_nzw = 0;
> tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
> tp->snd_cwnd = TCP_INIT_CWND;
> tp->snd_cwnd_cnt = 0;
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index c7e16b0ed791..4812a969c18a 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -3377,13 +3377,14 @@ static void tcp_ack_probe(struct sock *sk)
>  {
> struct inet_connection_sock *icsk = inet_csk(sk);
> struct sk_buff *head = tcp_send_head(sk);
> -   const struct tcp_sock *tp = tcp_sk(sk);
> +   struct tcp_sock *tp = tcp_sk(sk);
>
> /* Was it a usable window open? */
> if (!head)
> return;
> if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
> icsk->icsk_backoff = 0;
> +   tp->probes_nzw = 0;
> inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
> /* Socket must be waked up by subsequent tcp_data_snd_check().
>  * This function is not for random using!
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index f322e798a351..1b64cdabc299 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -4084,10 +4084,12 @@ void tcp_send_probe0(struct sock *sk)
> /* Cancel probe timer, if it is not required. */
> 

Re: [PATCH 3/9] KVM: arm64: vgic-v3: Fix error handling in vgic_v3_set_redist_base()

2021-01-13 Thread Auger Eric
Hi Marc,

On 12/28/20 4:35 PM, Marc Zyngier wrote:
> Hi Eric,
> 
> On Sat, 12 Dec 2020 18:50:04 +0000,
> Eric Auger  wrote:
>>
>> vgic_register_all_redist_iodevs may succeed while
>> vgic_register_all_redist_iodevs fails. For example this can happen
> 
> The same function cannot both fail and succeed ;-) Can you shed some
> light on what you had in mind?

Damn, I meant vgic_v3_insert_redist_region() can be successful and then
vgic_register_all_redist_iodevs() fails due to detection of overlap.
> 
>> while adding a redistributor region overlapping a dist region. The
>> failure only is detected on vgic_register_all_redist_iodevs when
>> vgic_v3_check_base() gets called.
>>
>> In such a case, remove the newly added redistributor region and free
>> it.
>>
>> Signed-off-by: Eric Auger 
>> ---
>>  arch/arm64/kvm/vgic/vgic-mmio-v3.c | 8 +++-
>>  1 file changed, 7 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c 
>> b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> index 8e8a862def76..581f0f49 100644
>> --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> @@ -866,8 +866,14 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, 
>> u64 addr, u32 count)
>>   * afterwards will register the iodevs when needed.
>>   */
>>  ret = vgic_register_all_redist_iodevs(kvm);
>> -if (ret)
>> +if (ret) {
>> +struct vgic_redist_region *rdreg =
>> +    vgic_v3_rdist_region_from_index(kvm, index);
>> +
> 
> nit: consider splitting declaration and assignment so that we avoid
> the line split if you insist on the 80 character limit.
Sure

Thanks

Eric
> 
>> +list_del(>list);
>> +kfree(rdreg);
>>  return ret;
>> +}
>>  
>>  return 0;
>>  }
>> -- 
>> 2.21.3
>>
>>
> 
> Thanks,
> 
>   M.
> 



Re: [PATCH 6/9] docs: kvm: devices/arm-vgic-v3: enhance KVM_DEV_ARM_VGIC_CTRL_INIT doc

2021-01-13 Thread Auger Eric
Hi Alexandru,

On 1/12/21 4:39 PM, Alexandru Elisei wrote:
> Hi Eric,
> 
> On 12/12/20 6:50 PM, Eric Auger wrote:
>> kvm_arch_vcpu_precreate() returns -EBUSY if the vgic is
>> already initialized. So let's document that KVM_DEV_ARM_VGIC_CTRL_INIT
>> must be called after all vcpu creations.
> 
> Checked and this is indeed the case,
> kvm_vm_ioctl_create_vcpu()->kvm_arch_vcpu_precreate() returns -EBUSY if
> vgic_initialized() is true.
thanks!
> 
>>
>> Signed-off-by: Eric Auger 
>> ---
>>  Documentation/virt/kvm/devices/arm-vgic-v3.rst | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/Documentation/virt/kvm/devices/arm-vgic-v3.rst 
>> b/Documentation/virt/kvm/devices/arm-vgic-v3.rst
>> index 5dd3bff51978..322de6aebdec 100644
>> --- a/Documentation/virt/kvm/devices/arm-vgic-v3.rst
>> +++ b/Documentation/virt/kvm/devices/arm-vgic-v3.rst
>> @@ -228,7 +228,7 @@ Groups:
>>  
>>  KVM_DEV_ARM_VGIC_CTRL_INIT
>>request the initialization of the VGIC, no additional parameter in
>> -  kvm_device_attr.addr.
>> +  kvm_device_attr.addr. Must be called after all vcpu creations.
> 
> Nitpick here: the document writes VCPU with all caps. This also sounds a bit
> weird, I think something like "Must be called after all VCPUs have been 
> created"
> is clearer.
I took your suggestion.

Thanks

Eric
> 
> Thanks,
> Alex
>>  KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES
>>save all LPI pending bits into guest RAM pending tables.
>>  
> 



Re: [PATCH 4/9] KVM: arm/arm64: vgic: Reset base address on kvm_vgic_dist_destroy()

2021-01-13 Thread Auger Eric
Hi Marc,

On 12/28/20 4:41 PM, Marc Zyngier wrote:
> On Sat, 12 Dec 2020 18:50:05 +,
> Eric Auger  wrote:
>>
>> On vgic_dist_destroy(), the addresses are not reset. However for
>> kvm selftest purpose this would allow to continue the test execution
>> even after a failure when running KVM_RUN. So let's reset the
>> base addresses.
>>
>> Signed-off-by: Eric Auger 
>> ---
>>  arch/arm64/kvm/vgic/vgic-init.c | 7 +--
>>  1 file changed, 5 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-init.c 
>> b/arch/arm64/kvm/vgic/vgic-init.c
>> index 32e32d67a127..6147bed56b1b 100644
>> --- a/arch/arm64/kvm/vgic/vgic-init.c
>> +++ b/arch/arm64/kvm/vgic/vgic-init.c
>> @@ -335,14 +335,16 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
>>  kfree(dist->spis);
>>  dist->spis = NULL;
>>  dist->nr_spis = 0;
>> +dist->vgic_dist_base = VGIC_ADDR_UNDEF;
>>  
>> -if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
>> +if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
>>  list_for_each_entry_safe(rdreg, next, >rd_regions, list) {
>>  list_del(>list);
>>  kfree(rdreg);
>>  }
>>  INIT_LIST_HEAD(>rd_regions);
>> -}
>> +} else
>> +kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
> 
> Since you have converted the hunk above to use dist->, you could do
> the same thing here. And the coding style dictates that you need {} on
> the else side as well.
sure

Thanks

Eric
> 
>>
>>  if (vgic_has_its(kvm))
>>  vgic_lpi_translation_cache_destroy(kvm);
>> @@ -362,6 +364,7 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
>>  vgic_flush_pending_lpis(vcpu);
>>  
>>  INIT_LIST_HEAD(_cpu->ap_list_head);
>> +vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
>>  }
>>  
>>  /* To be called with kvm->lock held */
>> -- 
>> 2.21.3
>>
>>
> 
> Thanks,
> 
>   M.
> 



Re: [PATCH 2/9] KVM: arm64: Fix KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION read

2021-01-13 Thread Auger Eric
Hi Alexandru,

On 1/6/21 6:12 PM, Alexandru Elisei wrote:
> Hi Eric,
> 
> The patch looks correct to me. kvm_vgic_addr() masks out all the bits except 
> index
> from addr, so we don't need to do it in vgic_get_common_attr():
> 
> Reviewed-by: Alexandru Elisei 
> 
> One nitpick below.
> 
> On 12/12/20 6:50 PM, Eric Auger wrote:
>> The doc says:
>> "The characteristics of a specific redistributor region can
>>  be read by presetting the index field in the attr data.
>>  Only valid for KVM_DEV_TYPE_ARM_VGIC_V3"
>>
>> Unfortunately the existing code fails to read the input attr data
>> and thus the index always is 0.
> 
> addr is allocated on the stack, I don't think it will always be 0.
I removed this statement in the commit message. Thanks!

Eric
> 
> Thanks,
> Alex
>>
>> Fixes: 04c110932225 ("KVM: arm/arm64: Implement 
>> KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION")
>> Cc: sta...@vger.kernel.org#v4.17+
>> Signed-off-by: Eric Auger 
>> ---
>>  arch/arm64/kvm/vgic/vgic-kvm-device.c | 3 +++
>>  1 file changed, 3 insertions(+)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c 
>> b/arch/arm64/kvm/vgic/vgic-kvm-device.c
>> index 44419679f91a..2f66cf247282 100644
>> --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
>> +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
>> @@ -226,6 +226,9 @@ static int vgic_get_common_attr(struct kvm_device *dev,
>>  u64 addr;
>>  unsigned long type = (unsigned long)attr->attr;
>>  
>> +if (copy_from_user(, uaddr, sizeof(addr)))
>> +return -EFAULT;
>> +
>>  r = kvm_vgic_addr(dev->kvm, type, , false);
>>  if (r)
>>  return (r == -ENODEV) ? -ENXIO : r;
> 



Re: [PATCH 7/9] KVM: arm64: Simplify argument passing to vgic_uaccess_[read|write]

2021-01-13 Thread Auger Eric
Hi Alexandru,
On 1/12/21 5:16 PM, Alexandru Elisei wrote:
> Hi Eric,
> 
> On 1/12/21 4:04 PM, Alexandru Elisei wrote:
>> Hi Eric,
>>
>> On 12/12/20 6:50 PM, Eric Auger wrote:
>>> Instead of converting the vgic_io_device handle to a kvm_io_device
>>> handled and then do the oppositive, pass a vgic_io_device pointer all
>>> along the call chain.
>> To me, it looks like the commit message describes what the patch does 
>> instead of
>> why it does it.
>>
>> What are "vgic_io_device handle" and "kvm_io_device handled"?
Yes unfortunate typo, sorry.
> 
> Sorry, I think I got it now. You were referring to the argument types struct
> vgic_io_device and struct kvm_io_device. The patch looks like a very good 
> cleanup.
> 
> How about changing the commit message to sound something like this (feel free
> to ignore/change it if you think of something else):
> 
> vgic_uaccess() takes a struct vgic_io_device argument, converts it to a struct
> kvm_io_device and passes it to the read/write accessor functions, which 
> convert it
> back to a struct vgic_io_device. Avoid the indirection by passing the struct
> vgic_io_device argument directly to vgic_uaccess_{read,write).
I reworded the commit message as you suggested.

Thanks

Eric
> 
> Thanks,
> Alex
>>
>> Thanks,
>> Alex
>>> Signed-off-by: Eric Auger 
>>> ---
>>>  arch/arm64/kvm/vgic/vgic-mmio.c | 10 --
>>>  1 file changed, 4 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c 
>>> b/arch/arm64/kvm/vgic/vgic-mmio.c
>>> index b2d73fc0d1ef..48c6067fc5ec 100644
>>> --- a/arch/arm64/kvm/vgic/vgic-mmio.c
>>> +++ b/arch/arm64/kvm/vgic/vgic-mmio.c
>>> @@ -938,10 +938,9 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct 
>>> vgic_io_device *iodev,
>>> return region;
>>>  }
>>>  
>>> -static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device 
>>> *dev,
>>> +static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct vgic_io_device 
>>> *iodev,
>>>  gpa_t addr, u32 *val)
>>>  {
>>> -   struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
>>> const struct vgic_register_region *region;
>>> struct kvm_vcpu *r_vcpu;
>>>  
>>> @@ -960,10 +959,9 @@ static int vgic_uaccess_read(struct kvm_vcpu *vcpu, 
>>> struct kvm_io_device *dev,
>>> return 0;
>>>  }
>>>  
>>> -static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device 
>>> *dev,
>>> +static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct vgic_io_device 
>>> *iodev,
>>>   gpa_t addr, const u32 *val)
>>>  {
>>> -   struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
>>> const struct vgic_register_region *region;
>>> struct kvm_vcpu *r_vcpu;
>>>  
>>> @@ -986,9 +984,9 @@ int vgic_uaccess(struct kvm_vcpu *vcpu, struct 
>>> vgic_io_device *dev,
>>>  bool is_write, int offset, u32 *val)
>>>  {
>>> if (is_write)
>>> -   return vgic_uaccess_write(vcpu, >dev, offset, val);
>>> +   return vgic_uaccess_write(vcpu, dev, offset, val);
>>> else
>>> -   return vgic_uaccess_read(vcpu, >dev, offset, val);
>>> +   return vgic_uaccess_read(vcpu, dev, offset, val);
>>>  }
>>>  
>>>  static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device 
>>> *dev,
> 



Re: [PATCH net-next 0/5] skbuff: introduce skbuff_heads bulking and reusing

2021-01-13 Thread Eric Dumazet
On Wed, Jan 13, 2021 at 6:03 PM Jakub Kicinski  wrote:
>
> On Wed, 13 Jan 2021 05:46:05 +0100 Eric Dumazet wrote:
> > On Wed, Jan 13, 2021 at 2:02 AM Jakub Kicinski  wrote:
> > >
> > > On Tue, 12 Jan 2021 13:23:16 +0100 Eric Dumazet wrote:
> > > > On Tue, Jan 12, 2021 at 12:08 PM Alexander Lobakin  
> > > > wrote:
> > > > >
> > > > > From: Edward Cree 
> > > > > Date: Tue, 12 Jan 2021 09:54:04 +
> > > > >
> > > > > > Without wishing to weigh in on whether this caching is a good 
> > > > > > idea...
> > > > >
> > > > > Well, we already have a cache to bulk flush "consumed" skbs, although
> > > > > kmem_cache_free() is generally lighter than kmem_cache_alloc(), and
> > > > > a page frag cache to allocate skb->head that is also bulking the
> > > > > operations, since it contains a (compound) page with the size of
> > > > > min(SZ_32K, PAGE_SIZE).
> > > > > If they wouldn't give any visible boosts, I think they wouldn't hit
> > > > > mainline.
> > > > >
> > > > > > Wouldn't it be simpler, rather than having two separate "alloc" and 
> > > > > > "flush"
> > > > > >  caches, to have a single larger cache, such that whenever it 
> > > > > > becomes full
> > > > > >  we bulk flush the top half, and when it's empty we bulk alloc the 
> > > > > > bottom
> > > > > >  half?  That should mean fewer branches, fewer instructions etc. 
> > > > > > than
> > > > > >  having to decide which cache to act upon every time.
> > > > >
> > > > > I though about a unified cache, but couldn't decide whether to flush
> > > > > or to allocate heads and how much to process. Your suggestion answers
> > > > > these questions and generally seems great. I'll try that one, thanks!
> > > >
> > > > The thing is : kmalloc() is supposed to have batches already, and nice
> > > > per-cpu caches.
> > > >
> > > > This looks like an mm issue, are we sure we want to get over it ?
> > > >
> > > > I would like a full analysis of why SLAB/SLUB does not work well for
> > > > your test workload.
> > >
> > > +1, it does feel like we're getting into mm territory
> >
> > I read the existing code, and with Edward Cree idea of reusing the
> > existing cache (storage of pointers),
> > ths now all makes sense, since there will be not much added code (and
> > new storage of 64 pointers)
> >
> > The remaining issue is to make sure KASAN will still work, we need
> > this to detect old and new bugs.
>
> IDK much about MM, but we already have a kmem_cache for skbs and now
> we're building a cache on top of a cache.  Shouldn't MM take care of
> providing a per-CPU BH-only lockless cache?

I think part of the improvement comes from bulk operations, which are
provided by the mm layer.

I also note Alexander made no provision for NUMA awareness.
Reusing an skb located on a remote node will probably not be ideal
(see the sketch below).
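
A possible shape for a NUMA-aware get path, purely an assumption for
illustration and not something this series implements
(napi_skb_cache_get_local() is a made-up name):

	static struct sk_buff *napi_skb_cache_get_local(struct napi_alloc_cache *nc)
	{
		struct sk_buff *skb;

		/* Only recycle skbuff_heads living on the local node; hand
		 * remote ones back to the slab instead of reusing them.
		 */
		while (nc->skb_count) {
			skb = nc->skb_cache[--nc->skb_count];
			if (page_to_nid(virt_to_page(skb)) == numa_node_id())
				return skb;
			kmem_cache_free(skbuff_head_cache, skb);
		}
		return kmem_cache_alloc_node(skbuff_head_cache, GFP_ATOMIC,
					     numa_node_id());
	}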


Re: [RFC PATCH v2 1/8] Use atomic type for ucounts reference counting

2021-01-13 Thread Eric W. Biederman
Alexey Gladkov  writes:

We might want to use refcount_t instead of atomic_t.  Not a big deal
either way.

> Signed-off-by: Alexey Gladkov 
> ---
>  include/linux/user_namespace.h |  2 +-
>  kernel/ucount.c| 10 +-
>  2 files changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> index 64cf8ebdc4ec..84fefa9247c4 100644
> --- a/include/linux/user_namespace.h
> +++ b/include/linux/user_namespace.h
> @@ -92,7 +92,7 @@ struct ucounts {
>   struct hlist_node node;
>   struct user_namespace *ns;
>   kuid_t uid;
> - int count;
> + atomic_t count;
>   atomic_t ucount[UCOUNT_COUNTS];
>  };
>  
> diff --git a/kernel/ucount.c b/kernel/ucount.c
> index 11b1596e2542..0f2c7c11df19 100644
> --- a/kernel/ucount.c
> +++ b/kernel/ucount.c
> @@ -141,7 +141,8 @@ static struct ucounts *get_ucounts(struct user_namespace 
> *ns, kuid_t uid)
>  
>   new->ns = ns;
>   new->uid = uid;
> - new->count = 0;
> +
> + atomic_set(>count, 0);
>  
>   spin_lock_irq(_lock);
>   ucounts = find_ucounts(ns, uid, hashent);
> @@ -152,10 +153,10 @@ static struct ucounts *get_ucounts(struct 
> user_namespace *ns, kuid_t uid)
>   ucounts = new;
>   }
>   }
> - if (ucounts->count == INT_MAX)
> + if (atomic_read(>count) == INT_MAX)
>   ucounts = NULL;
>   else
> - ucounts->count += 1;
> + atomic_inc(>count);
>   spin_unlock_irq(_lock);
>   return ucounts;
>  }
> @@ -165,8 +166,7 @@ static void put_ucounts(struct ucounts *ucounts)
>   unsigned long flags;
>  
>   spin_lock_irqsave(_lock, flags);
> - ucounts->count -= 1;
> - if (!ucounts->count)
> + if (atomic_dec_and_test(>count))
>   hlist_del_init(>node);
>   else
>   ucounts = NULL;


This can become:
static void put_ucounts(struct ucounts *ucounts)
{
	unsigned long flags;

	if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) {
		hlist_del_init(&ucounts->node);
		spin_unlock_irqrestore(&ucounts_lock, flags);
		kfree(ucounts);
	}
}
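
(atomic_dec_and_lock_irqsave() only takes ucounts_lock when the count
actually drops to zero, so the common-case put stays lock-free, while
the final put unhashes and frees the entry under the same lock the
lookup path holds.)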



Re: [RFC PATCH v2 2/8] Add a reference to ucounts for each user

2021-01-13 Thread Eric W. Biederman


The subject is wrong.  This should be:
[RFC PATCH v2 2/8] Add a reference to ucounts for each cred.

Further, the explanation could use a little work.  Something along the
lines of:

For RLIMIT_NPROC and some other rlimits the user_struct that holds the
global limit is kept alive for the lifetime of a process by keeping it
in struct cred.  Add a ucounts reference to struct cred, so that
RLIMIT_NPROC can switch from using a per-user limit to using a per-user,
per-user-namespace limit.

Nits about the code below.

Alexey Gladkov  writes:

> Before this, only the owner of the user namespace had an entry in ucounts.
> This entry addressed the user in the given user namespace.
>
> Now we create such an entry in ucounts for all users in the user namespace.
> Each user has only one entry for each user namespace.
>
> This commit is in preparation for migrating rlimits to ucounts.
>
> Signed-off-by: Alexey Gladkov 
> ---
>  include/linux/cred.h   |  1 +
>  include/linux/user_namespace.h |  2 ++
>  kernel/cred.c  | 17 +++--
>  kernel/ucount.c| 12 +++-
>  kernel/user_namespace.c|  1 +
>  5 files changed, 30 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/cred.h b/include/linux/cred.h
> index 18639c069263..307744fcc387 100644
> --- a/include/linux/cred.h
> +++ b/include/linux/cred.h
> @@ -144,6 +144,7 @@ struct cred {
>  #endif
>   struct user_struct *user;   /* real user ID subscription */
>   struct user_namespace *user_ns; /* user_ns the caps and keyrings are 
> relative to. */
> + struct ucounts *ucounts;
>   struct group_info *group_info;  /* supplementary groups for euid/fsgid 
> */
>   /* RCU deletion */
>   union {
> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> index 84fefa9247c4..483568a56f7f 100644
> --- a/include/linux/user_namespace.h
> +++ b/include/linux/user_namespace.h
> @@ -102,6 +102,8 @@ bool setup_userns_sysctls(struct user_namespace *ns);
>  void retire_userns_sysctls(struct user_namespace *ns);
>  struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum 
> ucount_type type);
>  void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
> +void put_ucounts(struct ucounts *ucounts);
> +void set_cred_ucounts(const struct cred *cred, struct user_namespace *ns, 
> kuid_t uid);
>  
>  #ifdef CONFIG_USER_NS
>  
> diff --git a/kernel/cred.c b/kernel/cred.c
> index 421b1149c651..d19e2e97092c 100644
> --- a/kernel/cred.c
> +++ b/kernel/cred.c
> @@ -119,6 +119,7 @@ static void put_cred_rcu(struct rcu_head *rcu)
>   if (cred->group_info)
>   put_group_info(cred->group_info);
>   free_uid(cred->user);
> + put_ucounts(cred->ucounts);
>   put_user_ns(cred->user_ns);
>   kmem_cache_free(cred_jar, cred);
>  }
> @@ -144,6 +145,9 @@ void __put_cred(struct cred *cred)
>   BUG_ON(cred == current->cred);
>   BUG_ON(cred == current->real_cred);
>  
> + BUG_ON(cred->ucounts == NULL);
> + BUG_ON(cred->ucounts->ns != cred->user_ns);
> +
>   if (cred->non_rcu)
>   put_cred_rcu(>rcu);
>   else
> @@ -271,6 +275,9 @@ struct cred *prepare_creds(void)
>   get_uid(new->user);
>   get_user_ns(new->user_ns);
>  
> + new->ucounts = NULL;
> + set_cred_ucounts(new, new->user_ns, new->euid);
> +
This hunk should be:
atomic_inc(>count);

That means you get to skip the lookup by uid and user_ns which, while it
should be cheap, is completely unnecessary in this case.

>  #ifdef CONFIG_KEYS
>   key_get(new->session_keyring);
>   key_get(new->process_keyring);
> @@ -363,6 +370,7 @@ int copy_creds(struct task_struct *p, unsigned long 
> clone_flags)
>   ret = create_user_ns(new);
>   if (ret < 0)
>   goto error_put;
> + set_cred_ucounts(new, new->user_ns, new->euid);
>   }
>  
>  #ifdef CONFIG_KEYS
> @@ -485,8 +493,11 @@ int commit_creds(struct cred *new)
>* in set_user().
>*/
>   alter_cred_subscribers(new, 2);
> - if (new->user != old->user)
> - atomic_inc(>user->processes);
> + if (new->user != old->user || new->user_ns != old->user_ns) {
> + if (new->user != old->user)
> + atomic_inc(>user->processes);
> + set_cred_ucounts(new, new->user_ns, new->euid);
> + }
>   rcu_assign_pointer(task->real_cred, new);
>   rcu_assign_pointer(task->cred, new);
>   if (new->user != old->user)
> @@ -661,6 +672,7 @@ void __init cred_init(void)
>   /* allocate a slab in which we can store credentials */
>   cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
>   SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
> + set_cred_ucounts(_cred, _user_ns, GLOBAL_ROOT_UID);
Unfortuantely this is needed here because this is the first cred
and there is no ucount reference to copy.
>  }
>  
> 

Re: [PATCH v13 00/15] SMMUv3 Nested Stage Setup (IOMMU part)

2021-01-13 Thread Auger Eric
Hi Shameer,

On 1/8/21 6:05 PM, Shameerali Kolothum Thodi wrote:
> Hi Eric,
> 
>> -Original Message-----
>> From: Eric Auger [mailto:eric.au...@redhat.com]
>> Sent: 18 November 2020 11:22
>> To: eric.auger@gmail.com; eric.au...@redhat.com;
>> io...@lists.linux-foundation.org; linux-kernel@vger.kernel.org;
>> k...@vger.kernel.org; kvm...@lists.cs.columbia.edu; w...@kernel.org;
>> j...@8bytes.org; m...@kernel.org; robin.mur...@arm.com;
>> alex.william...@redhat.com
>> Cc: jean-phili...@linaro.org; zhangfei@linaro.org;
>> zhangfei@gmail.com; vivek.gau...@arm.com; Shameerali Kolothum
>> Thodi ;
>> jacob.jun@linux.intel.com; yi.l@intel.com; t...@semihalf.com;
>> nicoleots...@gmail.com; yuzenghui 
>> Subject: [PATCH v13 00/15] SMMUv3 Nested Stage Setup (IOMMU part)
>>
>> This series brings the IOMMU part of HW nested paging support
>> in the SMMUv3. The VFIO part is submitted separately.
>>
>> The IOMMU API is extended to support 2 new API functionalities:
>> 1) pass the guest stage 1 configuration
>> 2) pass stage 1 MSI bindings
>>
>> Then those capabilities gets implemented in the SMMUv3 driver.
>>
>> The virtualizer passes information through the VFIO user API
>> which cascades them to the iommu subsystem. This allows the guest
>> to own stage 1 tables and context descriptors (so-called PASID
>> table) while the host owns stage 2 tables and main configuration
>> structures (STE).
> 
> I am seeing an issue with Guest testpmd run with this series.
> I have two different setups and testpmd works fine with the
> first one but not with the second.
> 
> 1). Guest doesn't have kernel driver built-in for pass-through dev.
> 
> root@ubuntu:/# lspci -v
> ...
> 00:02.0 Ethernet controller: Huawei Technologies Co., Ltd. Device a22e (rev 
> 21)
> Subsystem: Huawei Technologies Co., Ltd. Device 
> Flags: fast devsel
> Memory at 800010 (64-bit, prefetchable) [disabled] [size=64K]
> Memory at 80 (64-bit, prefetchable) [disabled] [size=1M]
> Capabilities: [40] Express Root Complex Integrated Endpoint, MSI 00
> Capabilities: [a0] MSI-X: Enable- Count=67 Masked-
> Capabilities: [b0] Power Management version 3
> Capabilities: [100] Access Control Services
> Capabilities: [300] Transaction Processing Hints
> 
> root@ubuntu:/# echo vfio-pci > 
> /sys/bus/pci/devices/:00:02.0/driver_override
> root@ubuntu:/# echo :00:02.0 > /sys/bus/pci/drivers_probe
> 
> root@ubuntu:/mnt/dpdk/build/app# ./testpmd -w :00:02.0 --file-prefix 
> socket0  -l 0-1 -n 2 -- -i
> EAL: Detected 8 lcore(s)
> EAL: Detected 1 NUMA nodes
> EAL: Multi-process socket /var/run/dpdk/socket0/mp_socket
> EAL: Selected IOVA mode 'VA'
> EAL: No available hugepages reported in hugepages-32768kB
> EAL: No available hugepages reported in hugepages-64kB
> EAL: No available hugepages reported in hugepages-1048576kB
> EAL: Probing VFIO support...
> EAL: VFIO support initialized
> EAL:   Invalid NUMA socket, default to 0
> EAL:   using IOMMU type 1 (Type 1)
> EAL: Probe PCI driver: net_hns3_vf (19e5:a22e) device: :00:02.0 (socket 0)
> EAL: No legacy callbacks, legacy socket not created
> Interactive-mode selected
> testpmd: create a new mbuf pool : n=155456, size=2176, 
> socket=0
> testpmd: preferred mempool ops selected: ring_mp_mc
> 
> Warning! port-topology=paired and odd forward ports number, the last port 
> will pair with itself.
> 
> Configuring Port 0 (socket 0)
> Port 0: 8E:A6:8C:43:43:45
> Checking link statuses...
> Done
> testpmd>
> 
> 2). Guest have kernel driver built-in for pass-through dev.
> 
> root@ubuntu:/# lspci -v
> ...
> 00:02.0 Ethernet controller: Huawei Technologies Co., Ltd. Device a22e (rev 
> 21)
> Subsystem: Huawei Technologies Co., Ltd. Device 
> Flags: bus master, fast devsel, latency 0
> Memory at 800010 (64-bit, prefetchable) [size=64K]
> Memory at 80 (64-bit, prefetchable) [size=1M]
> Capabilities: [40] Express Root Complex Integrated Endpoint, MSI 00
> Capabilities: [a0] MSI-X: Enable+ Count=67 Masked-
> Capabilities: [b0] Power Management version 3
> Capabilities: [100] Access Control Services
> Capabilities: [300] Transaction Processing Hints
> Kernel driver in use: hns3
> 
> root@ubuntu:/# echo vfio-pci > 
> /sys/bus/pci/devices/:00:02.0/driver_override
> root@ubuntu:/# echo :00:02.0 > /sys/bus/pci/drivers/hns3/unbind
> root@ubuntu:/# echo :00:02.0 > /sys/bus/pci/drivers_probe
> 
> root@ubuntu:/mnt/dpdk/build/app# ./testpmd -w :00:02.0 --file-prefix 
> socket0 -l 0-1 -n 2 -- -i
> EAL: Detected 8 lcore(s)
> EAL: Detected 

Re: [PATCH v2 net-next 2/3] skbuff: (re)use NAPI skb cache on allocation path

2021-01-13 Thread Eric Dumazet
On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin  wrote:
>
> Instead of calling kmem_cache_alloc() every time when building a NAPI
> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously
> this cache was only used for bulk-freeing skbuff_heads consumed via
> napi_consume_skb() or __kfree_skb_defer().
>
> Typical path is:
>  - skb is queued for freeing from driver or stack, its skbuff_head
>goes into the cache instead of immediate freeing;
>  - driver or stack requests NAPI skb allocation, an skbuff_head is
>taken from the cache instead of allocation.
>
> Corner cases:
>  - if it's empty on skb allocation, bulk-allocate the first half;
>  - if it's full on skb consuming, bulk-wipe the second half.
>
> Also try to balance its size after completing network softirqs
> (__kfree_skb_flush()).

I do not see the point of doing this rebalance (especially if we do not
change its name to describe its purpose more accurately).

For moderate load, we will have a reduced bulk size (typically one or two).
The number of skbs in the cache is in [0, 64[; there is really no risk of
leaving skbs there for a long period of time.
(32 * sizeof(sk_buff) = 8192)
I would personally get rid of this function completely.


Also, it seems you missed my KASAN support request?
I guess this is a matter of using kasan_unpoison_range(); we can ask for help.




>
> prefetchw() on CONFIG_SLUB is dropped since it makes no sense anymore.
>
> Suggested-by: Edward Cree 
> Signed-off-by: Alexander Lobakin 
> ---
>  net/core/skbuff.c | 54 ++-
>  1 file changed, 35 insertions(+), 19 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index dc3300dc2ac4..f42a3a04b918 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -364,6 +364,7 @@ struct sk_buff *build_skb_around(struct sk_buff *skb,
>  EXPORT_SYMBOL(build_skb_around);
>
>  #define NAPI_SKB_CACHE_SIZE64
> +#define NAPI_SKB_CACHE_HALF(NAPI_SKB_CACHE_SIZE / 2)
>
>  struct napi_alloc_cache {
> struct page_frag_cache page;
> @@ -487,7 +488,15 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
>
>  static struct sk_buff *napi_skb_cache_get(struct napi_alloc_cache *nc)
>  {
> -   return kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
> +   if (unlikely(!nc->skb_count))
> +   nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
> + GFP_ATOMIC,
> + NAPI_SKB_CACHE_HALF,
> + nc->skb_cache);
> +   if (unlikely(!nc->skb_count))
> +   return NULL;
> +
> +   return nc->skb_cache[--nc->skb_count];
>  }
>
>  /**
> @@ -867,40 +876,47 @@ void __consume_stateless_skb(struct sk_buff *skb)
>  void __kfree_skb_flush(void)
>  {
> struct napi_alloc_cache *nc = this_cpu_ptr(_alloc_cache);
> +   size_t count;
> +   void **ptr;
> +
> +   if (unlikely(nc->skb_count == NAPI_SKB_CACHE_HALF))
> +   return;
> +
> +   if (nc->skb_count > NAPI_SKB_CACHE_HALF) {
> +   count = nc->skb_count - NAPI_SKB_CACHE_HALF;
> +   ptr = nc->skb_cache + NAPI_SKB_CACHE_HALF;
>
> -   /* flush skb_cache if containing objects */
> -   if (nc->skb_count) {
> -   kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
> -nc->skb_cache);
> -   nc->skb_count = 0;
> +   kmem_cache_free_bulk(skbuff_head_cache, count, ptr);
> +   nc->skb_count = NAPI_SKB_CACHE_HALF;
> +   } else {
> +   count = NAPI_SKB_CACHE_HALF - nc->skb_count;
> +   ptr = nc->skb_cache + nc->skb_count;
> +
> +   nc->skb_count += kmem_cache_alloc_bulk(skbuff_head_cache,
> +  GFP_ATOMIC, count,
> +  ptr);
> }
>  }
>


Re: [PATCH net-next 0/5] skbuff: introduce skbuff_heads bulking and reusing

2021-01-12 Thread Eric Dumazet
On Wed, Jan 13, 2021 at 2:02 AM Jakub Kicinski  wrote:
>
> On Tue, 12 Jan 2021 13:23:16 +0100 Eric Dumazet wrote:
> > On Tue, Jan 12, 2021 at 12:08 PM Alexander Lobakin  wrote:
> > >
> > > From: Edward Cree 
> > > Date: Tue, 12 Jan 2021 09:54:04 +
> > >
> > > > Without wishing to weigh in on whether this caching is a good idea...
> > >
> > > Well, we already have a cache to bulk flush "consumed" skbs, although
> > > kmem_cache_free() is generally lighter than kmem_cache_alloc(), and
> > > a page frag cache to allocate skb->head that is also bulking the
> > > operations, since it contains a (compound) page with the size of
> > > min(SZ_32K, PAGE_SIZE).
> > > If they wouldn't give any visible boosts, I think they wouldn't hit
> > > mainline.
> > >
> > > > Wouldn't it be simpler, rather than having two separate "alloc" and 
> > > > "flush"
> > > >  caches, to have a single larger cache, such that whenever it becomes 
> > > > full
> > > >  we bulk flush the top half, and when it's empty we bulk alloc the 
> > > > bottom
> > > >  half?  That should mean fewer branches, fewer instructions etc. than
> > > >  having to decide which cache to act upon every time.
> > >
> > > I though about a unified cache, but couldn't decide whether to flush
> > > or to allocate heads and how much to process. Your suggestion answers
> > > these questions and generally seems great. I'll try that one, thanks!
> >
> > The thing is : kmalloc() is supposed to have batches already, and nice
> > per-cpu caches.
> >
> > This looks like an mm issue, are we sure we want to get over it ?
> >
> > I would like a full analysis of why SLAB/SLUB does not work well for
> > your test workload.
>
> +1, it does feel like we're getting into mm territory

I read the existing code, and with Edward Cree's idea of reusing the
existing cache (storage of pointers), this now all makes sense, since
there will be not much added code (and new storage of 64 pointers).

The remaining issue is to make sure KASAN will still work, we need
this to detect old and new bugs.

Thanks!


Re: [PATCH] tcp: keepalive fixes

2021-01-12 Thread Eric Dumazet
On Tue, Jan 12, 2021 at 11:48 PM Yuchung Cheng  wrote:
>
> On Tue, Jan 12, 2021 at 2:31 PM Enke Chen  wrote:
> >
> > From: Enke Chen 
> >
> > In this patch two issues with TCP keepalives are fixed:
> >
> > 1) TCP keepalive does not timeout when there are data waiting to be
> >delivered and then the connection got broken. The TCP keepalive
> >timeout is not evaluated in that condition.
> hi enke
> Do you have an example to demonstrate this issue -- in theory when
> there is data inflight, an RTO timer should be pending (which
> considers user-timeout setting). based on the user-timeout description
> (man tcp), the user timeout should abort the socket per the specified
> time after data commences. some data would help to understand the
> issue.
>

+1

A packetdrill test would be ideal.

Also, given that there is this ongoing issue with TCP_USER_TIMEOUT,
let's not mix things or risk added work for backports to stable
versions.


[PATCH RESEND] random: fix the RNDRESEEDCRNG ioctl

2021-01-12 Thread Eric Biggers
From: Eric Biggers 

The RNDRESEEDCRNG ioctl reseeds the primary_crng from itself, which
doesn't make sense.  Reseed it from the input_pool instead.

Fixes: d848e5f8e1eb ("random: add new ioctl RNDRESEEDCRNG")
Cc: sta...@vger.kernel.org
Cc: linux-cry...@vger.kernel.org
Cc: Andy Lutomirski 
Cc: Jann Horn 
Cc: Theodore Ts'o 
Reviewed-by: Jann Horn 
Signed-off-by: Eric Biggers 
---

Andrew, please consider taking this patch since the maintainer has been
ignoring it for 4 months
(https://lkml.kernel.org/lkml/20200916041908.66649-1-ebigg...@kernel.org/T/#u).


 drivers/char/random.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 5f3b8ac9d97b0..a894c0559a8cf 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1972,7 +1972,7 @@ static long random_ioctl(struct file *f, unsigned int 
cmd, unsigned long arg)
return -EPERM;
if (crng_init < 2)
return -ENODATA;
-   crng_reseed(_crng, NULL);
+   crng_reseed(_crng, _pool);
crng_global_init_time = jiffies - 1;
return 0;
default:
-- 
2.30.0



[PATCH RESEND] random: remove dead code left over from blocking pool

2021-01-12 Thread Eric Biggers
From: Eric Biggers 

Remove some dead code that was left over following commit 90ea1c6436d2
("random: remove the blocking pool").

Cc: linux-cry...@vger.kernel.org
Cc: Andy Lutomirski 
Cc: Jann Horn 
Cc: Theodore Ts'o 
Reviewed-by: Andy Lutomirski 
Signed-off-by: Eric Biggers 
---

Andrew, please consider taking this patch since the maintainer has been
ignoring it for 4 months
(https://lkml.kernel.org/lkml/20200916043652.96640-1-ebigg...@kernel.org/T/#u).


 drivers/char/random.c | 17 ++-
 include/trace/events/random.h | 83 ---
 2 files changed, 3 insertions(+), 97 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index a894c0559a8cf..bbc5098b1a81f 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -500,7 +500,6 @@ struct entropy_store {
unsigned short add_ptr;
unsigned short input_rotate;
int entropy_count;
-   unsigned int initialized:1;
unsigned int last_data_init:1;
__u8 last_data[EXTRACT_SIZE];
 };
@@ -660,7 +659,7 @@ static void process_random_ready_list(void)
  */
 static void credit_entropy_bits(struct entropy_store *r, int nbits)
 {
-   int entropy_count, orig, has_initialized = 0;
+   int entropy_count, orig;
const int pool_size = r->poolinfo->poolfracbits;
int nfrac = nbits << ENTROPY_SHIFT;
 
@@ -717,23 +716,14 @@ static void credit_entropy_bits(struct entropy_store *r, 
int nbits)
if (cmpxchg(>entropy_count, orig, entropy_count) != orig)
goto retry;
 
-   if (has_initialized) {
-   r->initialized = 1;
-   kill_fasync(, SIGIO, POLL_IN);
-   }
-
trace_credit_entropy_bits(r->name, nbits,
  entropy_count >> ENTROPY_SHIFT, _RET_IP_);
 
if (r == _pool) {
int entropy_bits = entropy_count >> ENTROPY_SHIFT;
 
-   if (crng_init < 2) {
-   if (entropy_bits < 128)
-   return;
+   if (crng_init < 2 && entropy_bits >= 128)
crng_reseed(_crng, r);
-   entropy_bits = ENTROPY_BITS(r);
-   }
}
 }
 
@@ -1385,8 +1375,7 @@ static size_t account(struct entropy_store *r, size_t 
nbytes, int min,
 }
 
 /*
- * This function does the actual extraction for extract_entropy and
- * extract_entropy_user.
+ * This function does the actual extraction for extract_entropy.
  *
  * Note: we assume that .poolwords is a multiple of 16 words.
  */
diff --git a/include/trace/events/random.h b/include/trace/events/random.h
index 9570a10cb949b..3d7b432ca5f31 100644
--- a/include/trace/events/random.h
+++ b/include/trace/events/random.h
@@ -85,28 +85,6 @@ TRACE_EVENT(credit_entropy_bits,
  __entry->entropy_count, (void *)__entry->IP)
 );
 
-TRACE_EVENT(push_to_pool,
-   TP_PROTO(const char *pool_name, int pool_bits, int input_bits),
-
-   TP_ARGS(pool_name, pool_bits, input_bits),
-
-   TP_STRUCT__entry(
-   __field( const char *,  pool_name   )
-   __field(  int,  pool_bits   )
-   __field(  int,  input_bits  )
-   ),
-
-   TP_fast_assign(
-   __entry->pool_name  = pool_name;
-   __entry->pool_bits  = pool_bits;
-   __entry->input_bits = input_bits;
-   ),
-
-   TP_printk("%s: pool_bits %d input_pool_bits %d",
- __entry->pool_name, __entry->pool_bits,
- __entry->input_bits)
-);
-
 TRACE_EVENT(debit_entropy,
TP_PROTO(const char *pool_name, int debit_bits),
 
@@ -161,35 +139,6 @@ TRACE_EVENT(add_disk_randomness,
  MINOR(__entry->dev), __entry->input_bits)
 );
 
-TRACE_EVENT(xfer_secondary_pool,
-   TP_PROTO(const char *pool_name, int xfer_bits, int request_bits,
-int pool_entropy, int input_entropy),
-
-   TP_ARGS(pool_name, xfer_bits, request_bits, pool_entropy,
-   input_entropy),
-
-   TP_STRUCT__entry(
-   __field( const char *,  pool_name   )
-   __field(  int,  xfer_bits   )
-   __field(  int,  request_bits)
-   __field(  int,  pool_entropy)
-   __field(  int,  input_entropy   )
-   ),
-
-   TP_fast_assign(
-   __entry->pool_name  = pool_name;
-   __entry->xfer_bits  = xfer_bits;
-   __entry->request_bits   = request_bits;
-   __entry->pool_entropy   = pool_entropy;
-   __entry->input_entropy  = input_entropy;
-   ),
-
-   TP_printk("pool %s xfer_bits %d request_bits %d pool_entropy %d "
-   

[PATCH RESEND] random: initialize ChaCha20 constants with correct endianness

2021-01-12 Thread Eric Biggers
From: Eric Biggers 

On big endian CPUs, the ChaCha20-based CRNG is using the wrong
endianness for the ChaCha20 constants.

This doesn't matter cryptographically, but technically it means it's not
ChaCha20 anymore.  Fix it to always use the standard constants.

Cc: linux-cry...@vger.kernel.org
Cc: Andy Lutomirski 
Cc: Jann Horn 
Cc: Theodore Ts'o 
Acked-by: Herbert Xu 
Signed-off-by: Eric Biggers 
---

Andrew, please consider taking this patch since the maintainer has been
ignoring it for 4 months
(https://lkml.kernel.org/lkml/20200916045013.142179-1-ebigg...@kernel.org/T/#u).


 drivers/char/random.c   | 4 ++--
 include/crypto/chacha.h | 9 +++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index bbc5098b1a81f..4037a1e0fb748 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -809,7 +809,7 @@ static bool __init crng_init_try_arch_early(struct 
crng_state *crng)
 
 static void __maybe_unused crng_initialize_secondary(struct crng_state *crng)
 {
-   memcpy(>state[0], "expand 32-byte k", 16);
+   chacha_init_consts(crng->state);
_get_random_bytes(>state[4], sizeof(__u32) * 12);
crng_init_try_arch(crng);
crng->init_time = jiffies - CRNG_RESEED_INTERVAL - 1;
@@ -817,7 +817,7 @@ static void __maybe_unused crng_initialize_secondary(struct 
crng_state *crng)
 
 static void __init crng_initialize_primary(struct crng_state *crng)
 {
-   memcpy(>state[0], "expand 32-byte k", 16);
+   chacha_init_consts(crng->state);
_extract_entropy(_pool, >state[4], sizeof(__u32) * 12, 0);
if (crng_init_try_arch_early(crng) && trust_cpu) {
invalidate_batched_entropy();
diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h
index 3a1c72fdb7cf5..dabaee6987186 100644
--- a/include/crypto/chacha.h
+++ b/include/crypto/chacha.h
@@ -47,13 +47,18 @@ static inline void hchacha_block(const u32 *state, u32 *out, int nrounds)
    hchacha_block_generic(state, out, nrounds);
 }
 
-void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv);
-static inline void chacha_init_generic(u32 *state, const u32 *key, const u8 *iv)
+static inline void chacha_init_consts(u32 *state)
 {
state[0]  = 0x61707865; /* "expa" */
state[1]  = 0x3320646e; /* "nd 3" */
state[2]  = 0x79622d32; /* "2-by" */
state[3]  = 0x6b206574; /* "te k" */
+}
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv);
+static inline void chacha_init_generic(u32 *state, const u32 *key, const u8 *iv)
+{
+   chacha_init_consts(state);
state[4]  = key[0];
state[5]  = key[1];
state[6]  = key[2];
-- 
2.30.0



Re: [PATCH net-next 0/5] skbuff: introduce skbuff_heads bulking and reusing

2021-01-12 Thread Eric Dumazet
On Tue, Jan 12, 2021 at 7:26 PM Alexander Lobakin  wrote:
>
> From: Eric Dumazet 
> Date: Tue, 12 Jan 2021 13:32:56 +0100
>
> > On Tue, Jan 12, 2021 at 11:56 AM Alexander Lobakin  wrote:
> >>
> >
> >>
> >> Ah, I should've mentioned that I use UDP GRO Fraglists, so these
> >> numbers are for GRO.
> >>
> >
> > Right, this suggests UDP GRO fraglist is a pathological case of GRO,
> > not saving memory.
> >
> > Real GRO (TCP in most cases) will consume one skb, and have page
> > fragments for each segment.
> >
> > Having skbs linked together is not cache friendly.
>
> OK, so I rebased test setup a bit to clarify the things out.
>
> I disabled fraglists and GRO/GSO fraglists support advertisement
> in driver to exclude any "pathological" cases and switched it
> from napi_get_frags() + napi_gro_frags() to napi_alloc_skb() +
> napi_gro_receive() to disable local skb reusing (napi_reuse_skb()).
> I also enabled GSO UDP L4 ("classic" one: one skbuff_head + frags)
> for forwarding, not only local traffic, and disabled NF flow offload
> to increase CPU loading and drop performance below link speed so I
> could see the changes.
>
> So, the traffic flows looked like:
>  - TCP GRO (one head + frags) -> NAT -> hardware TSO;
>  - UDP GRO (one head + frags) -> NAT -> driver-side GSO.
>
> Baseline 5.11-rc3:
>  - 865 Mbps TCP, 866 Mbps UDP.
>
> This patch (both separate caches and Edward's unified cache):
>  - 899 Mbps TCP, 893 Mbps UDP.
>
> So that's clearly *not* only "pathological" UDP GRO Fraglists
> "problem" as TCP also got ~35 Mbps from this, as well as
> non-fraglisted UDP.
>
> Regarding latencies: I remember there were talks about latencies when
> Edward introduced batched GRO (using linked lists to pass skbs from
> GRO layer to core stack instead of passing one by one), so I think
> it's a perennial question when it comes to batching/caching.
>
> Thanks for the feedback, will post v2 soon.
> The question about if this caching is reasonable isn't closed anyway,
> but I don't see significant "cons" for now.
>

Also it would be nice to have KASAN support.

We do not want to unconditionally recycle stuff, since this might
hide use-after-free.


Re: [PATCH v4] certs: Add EFI_CERT_X509_GUID support for dbx entries

2021-01-12 Thread Eric Snowberg


> On Jan 12, 2021, at 10:10 AM, David Howells  wrote:
> 
> How about the attached?

This looks good to me.

> I've changed the function names to something that I
> think reads better, but otherwise it's the same.

I agree, the function name changes you made sound better.

We are starting to see platforms with KEK signed DBX updates containing
certs like this following boothole, so it would be great if we could get
something like this in.  I also had a follow on series that allowed these
certs to be compiled into the kernel.

https://lkml.org/lkml/2020/9/30/1301

I’d appreciate any feedback on that series as well.

Thanks

> David
> ---
> commit 8913866babb96fcfe452aac6042ca8862d4c0b53
> Author: Eric Snowberg 
> Date:   Tue Sep 15 20:49:27 2020 -0400
> 
>certs: Add EFI_CERT_X509_GUID support for dbx entries
> 
>The Secure Boot Forbidden Signature Database, dbx, contains a list of now
>revoked signatures and keys previously approved to boot with UEFI Secure
>Boot enabled.  The dbx is capable of containing any number of
>EFI_CERT_X509_SHA256_GUID, EFI_CERT_SHA256_GUID, and EFI_CERT_X509_GUID
>entries.
> 
>Currently when EFI_CERT_X509_GUID are contained in the dbx, the entries are
>skipped.
> 
>Add support for EFI_CERT_X509_GUID dbx entries. When a EFI_CERT_X509_GUID
>is found, it is added as an asymmetrical key to the .blacklist keyring.
>Anytime the .platform keyring is used, the keys in the .blacklist keyring
>are referenced, if a matching key is found, the key will be rejected.
> 
>Signed-off-by: Eric Snowberg 
>Reviewed-by: Jarkko Sakkinen 
>Signed-off-by: David Howells 
> 
> diff --git a/certs/blacklist.c b/certs/blacklist.c
> index 6514f9ebc943..a7f021878a4b 100644
> --- a/certs/blacklist.c
> +++ b/certs/blacklist.c
> @@ -100,6 +100,38 @@ int mark_hash_blacklisted(const char *hash)
>   return 0;
> }
> 
> +int add_key_to_revocation_list(const char *data, size_t size)
> +{
> + key_ref_t key;
> +
> + key = key_create_or_update(make_key_ref(blacklist_keyring, true),
> +"asymmetric",
> +NULL,
> +data,
> +size,
> +((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW),
> +KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN);
> +
> + if (IS_ERR(key)) {
> + pr_err("Problem with revocation key (%ld)\n", PTR_ERR(key));
> + return PTR_ERR(key);
> + }
> +
> + return 0;
> +}
> +
> +int is_key_on_revocation_list(struct pkcs7_message *pkcs7)
> +{
> + int ret;
> +
> + ret = validate_trust(pkcs7, blacklist_keyring);
> +
> + if (ret == 0)
> + return -EKEYREJECTED;
> +
> + return -ENOKEY;
> +}
> +
> /**
>  * is_hash_blacklisted - Determine if a hash is blacklisted
>  * @hash: The hash to be checked as a binary blob
> diff --git a/certs/blacklist.h b/certs/blacklist.h
> index 1efd6fa0dc60..420bb7c86e07 100644
> --- a/certs/blacklist.h
> +++ b/certs/blacklist.h
> @@ -1,3 +1,15 @@
> #include 
> +#include 
> +#include 
> 
> extern const char __initconst *const blacklist_hashes[];
> +
> +#ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING
> +#define validate_trust pkcs7_validate_trust
> +#else
> +static inline int validate_trust(struct pkcs7_message *pkcs7,
> +  struct key *trust_keyring)
> +{
> + return -ENOKEY;
> +}
> +#endif
> diff --git a/certs/system_keyring.c b/certs/system_keyring.c
> index 798291177186..cc165b359ea3 100644
> --- a/certs/system_keyring.c
> +++ b/certs/system_keyring.c
> @@ -241,6 +241,12 @@ int verify_pkcs7_message_sig(const void *data, size_t len,
>   pr_devel("PKCS#7 platform keyring is not available\n");
>   goto error;
>   }
> +
> + ret = is_key_on_revocation_list(pkcs7);
> + if (ret != -ENOKEY) {
> + pr_devel("PKCS#7 platform key is on revocation list\n");
> + goto error;
> + }
>   }
>   ret = pkcs7_validate_trust(pkcs7, trusted_keys);
>   if (ret < 0) {
> diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h
> index fb8b07daa9d1..61f98739e8b1 100644
> --- a/include/keys/system_keyring.h
> +++ b/include/keys/system_keyring.h
> @@ -31,11 +31,14 @@ extern int restrict_link_by_builtin_and_secondary_trusted(
> #define restrict_link_by_builtin_and_secondary_trusted restrict_link_by_builtin_trusted
> #

Re: [PATCH v2 01/10] vfs: move cap_convert_nscap() call into vfs_setxattr()

2021-01-12 Thread Eric W. Biederman
ebied...@xmission.com (Eric W. Biederman) writes:

> So there is the basic question do we want to read the raw bytes on disk
> or do we want to return something meaningful to the reader.  As the
> existing tools use the xattr interface to set/clear fscaps returning
> data to user space rather than raw bytes seems the preferred interface.
>
> My ideal semantics would be:
>
> - If current_user_ns() == sb->s_user_ns return the raw data.
>
>   I don't know how to implement this first scenario while permitting
>   stacked filesystems.

After a little more thought I do.

In getxattr if the get_cap method is not implemented by the
filesystem if current_user_ns() == sb->s_user_ns simply treat it as
an ordinary xattr read/write.

Otherwise call vfs_get_cap and translate the result as described
below.

The key point of this is it allows for seeing what is actually on
disk (when it is not confusing).
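
A minimal sketch of that bypass (assuming the get_cap method discussed
in this thread; nothing here is committed code):

    /* in getxattr(), for XATTR_NAME_CAPS only */
    if (!inode->i_op->cap_get &&
        current_user_ns() == inode->i_sb->s_user_ns)
            return __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS,
                                  buffer, size);
    /* otherwise call vfs_get_cap() and translate the result */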

> - Calculate the cpu_vfs_cap_data as get_vfs_caps_from_disk does.
>   That gives the meaning of the xattr.
>
> - If "from_kuid(current_userns(), krootid) == 0" return a v2 cap.
>
> - If "rootid_owns_currentns()" return a v2 cap.
>
> - Else return an error.  Probably a permission error.
>
>   The fscap simply can not make sense to the user if the rootid does not
>   map.  Return a v2 cap would imply that the caps are present on the
>   executable (in the current context) which they are not.


Eric


Re: [PATCH v2 01/10] vfs: move cap_convert_nscap() call into vfs_setxattr()

2021-01-12 Thread Eric W. Biederman
Miklos Szeredi  writes:

> On Tue, Jan 12, 2021 at 1:15 AM Eric W. Biederman  
> wrote:
>>
>> Miklos Szeredi  writes:
>>
>> > On Fri, Jan 01, 2021 at 11:35:16AM -0600, Eric W. Biederman wrote:
>
>> > For one: a v2 fscap is supposed to be equivalent to a v3 fscap with a 
>> > rootid of
>> > zero, right?
>>
>> Yes.  This assumes that everything is translated into the uids of the
>> target filesystem.
>>
>> > If so, why does cap_inode_getsecurity() treat them differently (v2 fscap
>> > succeeding unconditionally while v3 one being either converted to v2, 
>> > rejected
>> > or left as v3 depending on current_user_ns())?
>>
>> As I understand it v2 fscaps have always succeeded unconditionally.  The
>> only case I can see for a v2 fscap might not succeed when read is if the
>> filesystem is outside of the initial user namespace.
>
> Looking again, it's rather confusing.  cap_inode_getsecurity()
> currently handles the following cases:
>
> v1: -> fails with -EINVAL
>
> v2: -> returns unconverted xattr
>
> v3:
>  a) rootid is mapped in the current namespace to non-zero:
>  -> convert rootid
>
>  b) rootid owns the current or ancestor namespace:
>  -> convert to v2
>
>  c) rootid is not mapped and is not owner:
>  -> return -EOPNOTSUPP -> falls back to unconverted v3
>
> So lets take the example, where a tmpfs is created in a private user
> namespace and one file has a v2 cap and the other an equivalent v3 cap
> with a zero rootid.  This is the result when looking at it from
>
> 1) the namespace of the fs:
> ---
> t = cap_dac_override+eip
> tt = cap_dac_override+eip
>
> 2) the initial namespace:
> ---
> t = cap_dac_override+eip
> tt = cap_dac_override+eip [rootid=1000]
>
> 3) an unrelated namespace:
> ---
> t = cap_dac_override+eip
> tt = cap_dac_override+eip
>
> Note: in this last case getxattr will actually return a v3 cap with
> zero rootid for "tt" which getcap does not display due to being zero.
> I could do a setup with a nested namespaces that better demonstrate
> the confusing nature of this, but I think this also proves the point.

Yes.  There is real confusion on the reading case when the namespaces
do not match.

> At this point userspace simply cannot determine whether the returned
> cap is in any way valid or not.
>
> The following semantics would make a ton more sense, since getting a
> v2 would indicate that rootid is unknown:

> - if cap is v2 convert to v3 with zero rootid
> - after this, check if rootid needs to be translated, if not return v3
> - if yes, try to translate to current ns, if succeeds return translated v3
> - if not mappable, return v2
>
> Hmm?

So there is the basic question do we want to read the raw bytes on disk
or do we want to return something meaningful to the reader.  As the
existing tools use the xattr interface to set/clear fscaps returning
data to user space rather than raw bytes seems the preferred interface.

My ideal semantics would be:

- If current_user_ns() == sb->s_user_ns return the raw data.

  I don't know how to implement this first scenario while permitting
  stacked filesystems.
  
- Calculate the cpu_vfs_cap_data as get_vfs_caps_from_disk does.
  That gives the meaning of the xattr.

- If "from_kuid(current_userns(), krootid) == 0" return a v2 cap.

- If "rootid_owns_currentns()" return a v2 cap.

- Else return an error.  Probably a permission error.

  The fscap simply can not make sense to the user if the rootid does not
  map.  Return a v2 cap would imply that the caps are present on the
  executable (in the current context) which they are not.
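
A rough sketch of those read-side semantics (fill_v2_cap() is a made-up
helper; rootid_owns_currentns() is today's check in commoncap.c):

    static int fscap_to_user(struct cpu_vfs_cap_data *cap,
                             void *buffer, size_t size)
    {
            /* rootid maps to uid 0 here, or owns this namespace: v2 */
            if (from_kuid(current_user_ns(), cap->rootid) == 0 ||
                rootid_owns_currentns(cap->rootid))
                    return fill_v2_cap(buffer, size, cap);

            /* otherwise the fscap makes no sense to this reader */
            return -EPERM;
    }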


>> > Anyway, here's a patch that I think fixes getxattr() layering for
>> > security.capability.  Does basically what you suggested.  Slight change of
>> > semantics vs. v1 caps, not sure if that is still needed, 
>> > getxattr()/setxattr()
>> > hasn't worked for these since the introduction of v3 in 4.14.
>> > Untested.
>>
>> Taking a look.  The goal of change how these operate is to make it so
>> that layered filesystems can just pass through the data if they don't
>> want to change anything (even with the user namespaces of the
>> filesystems in question are different).
>>
>> Feedback on the code below:
>> - cap_get should be in inode_operations like get_acl and set_acl.
>
> So it's not clear to me why xattr ops are per-sb and acl ops are per-inode.

I don't know why either.  What I do see is everything except
inode->i_sb->

Re: [PATCH net-next 0/5] skbuff: introduce skbuff_heads bulking and reusing

2021-01-12 Thread Eric Dumazet
On Tue, Jan 12, 2021 at 11:56 AM Alexander Lobakin  wrote:
>

>
> Ah, I should've mentioned that I use UDP GRO Fraglists, so these
> numbers are for GRO.
>

Right, this suggests UDP GRO fraglist is a pathological case of GRO,
not saving memory.

Real GRO (TCP in most cases) will consume one skb, and have page
fragments for each segment.

Having skbs linked together is not cache friendly.

So I would try first to make this case better, instead of trying to
work around the real issue.


Re: [PATCH net-next 0/5] skbuff: introduce skbuff_heads bulking and reusing

2021-01-12 Thread Eric Dumazet
On Tue, Jan 12, 2021 at 12:08 PM Alexander Lobakin  wrote:
>
> From: Edward Cree 
> Date: Tue, 12 Jan 2021 09:54:04 +
>
> > Without wishing to weigh in on whether this caching is a good idea...
>
> Well, we already have a cache to bulk flush "consumed" skbs, although
> kmem_cache_free() is generally lighter than kmem_cache_alloc(), and
> a page frag cache to allocate skb->head that is also bulking the
> operations, since it contains a (compound) page with the size of
> min(SZ_32K, PAGE_SIZE).
> If they wouldn't give any visible boosts, I think they wouldn't hit
> mainline.
>
> > Wouldn't it be simpler, rather than having two separate "alloc" and "flush"
> >  caches, to have a single larger cache, such that whenever it becomes full
> >  we bulk flush the top half, and when it's empty we bulk alloc the bottom
> >  half?  That should mean fewer branches, fewer instructions etc. than
> >  having to decide which cache to act upon every time.
>
> I thought about a unified cache, but couldn't decide whether to flush
> or to allocate heads and how much to process. Your suggestion answers
> these questions and generally seems great. I'll try that one, thanks!
>


The thing is : kmalloc() is supposed to have batches already, and nice
per-cpu caches.

This looks like an mm issue, are we sure we want to get over it ?

I would like a full analysis of why SLAB/SLUB does not work well for
your test workload.

More details, more numbers before we accept yet another
'networking optimization' adding more code to the 'fast' path.

More code means more latencies when all code needs to be brought up in
cpu caches.
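
For reference, the unified cache suggested upthread would look roughly
like this (sizes and names are illustrative, not the posted series):

    #define NAPI_SKB_CACHE_SIZE 64
    #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)

    struct napi_skb_cache {
            unsigned int count;
            void *objs[NAPI_SKB_CACHE_SIZE];
    };

    /* take one head, bulk-refilling the bottom half when empty */
    static struct sk_buff *napi_skb_cache_get(struct napi_skb_cache *nc)
    {
            if (unlikely(!nc->count))
                    nc->count = kmem_cache_alloc_bulk(skbuff_head_cache,
                                                      GFP_ATOMIC,
                                                      NAPI_SKB_CACHE_HALF,
                                                      nc->objs);
            return nc->count ? nc->objs[--nc->count] : NULL;
    }

    /* return one head, bulk-flushing the top half when full */
    static void napi_skb_cache_put(struct napi_skb_cache *nc,
                                   struct sk_buff *skb)
    {
            nc->objs[nc->count++] = skb;
            if (unlikely(nc->count == NAPI_SKB_CACHE_SIZE)) {
                    kmem_cache_free_bulk(skbuff_head_cache,
                                         NAPI_SKB_CACHE_HALF,
                                         &nc->objs[NAPI_SKB_CACHE_HALF]);
                    nc->count = NAPI_SKB_CACHE_HALF;
            }
    }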


Re: [RFC v3 2/2] vfio/platform: msi: add Broadcom platform devices

2021-01-12 Thread Auger Eric
Hi Vikas,

On 12/14/20 6:45 PM, Vikas Gupta wrote:
> Add msi support for Broadcom platform devices
> 
> Signed-off-by: Vikas Gupta 
> ---
>  drivers/vfio/platform/Kconfig |  1 +
>  drivers/vfio/platform/Makefile|  1 +
>  drivers/vfio/platform/msi/Kconfig |  9 
>  drivers/vfio/platform/msi/Makefile|  2 +
>  .../vfio/platform/msi/vfio_platform_bcmplt.c  | 49 +++
>  5 files changed, 62 insertions(+)
>  create mode 100644 drivers/vfio/platform/msi/Kconfig
>  create mode 100644 drivers/vfio/platform/msi/Makefile
>  create mode 100644 drivers/vfio/platform/msi/vfio_platform_bcmplt.c
what does plt mean?
> 
> diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig
> index dc1a3c44f2c6..7b8696febe61 100644
> --- a/drivers/vfio/platform/Kconfig
> +++ b/drivers/vfio/platform/Kconfig
> @@ -21,3 +21,4 @@ config VFIO_AMBA
> If you don't know what to do here, say N.
>  
>  source "drivers/vfio/platform/reset/Kconfig"
> +source "drivers/vfio/platform/msi/Kconfig"
> diff --git a/drivers/vfio/platform/Makefile b/drivers/vfio/platform/Makefile
> index 3f3a24e7c4ef..9ccdcdbf0e7e 100644
> --- a/drivers/vfio/platform/Makefile
> +++ b/drivers/vfio/platform/Makefile
> @@ -5,6 +5,7 @@ vfio-platform-y := vfio_platform.o
>  obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o
>  obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform-base.o
>  obj-$(CONFIG_VFIO_PLATFORM) += reset/
> +obj-$(CONFIG_VFIO_PLATFORM) += msi/
>  
>  vfio-amba-y := vfio_amba.o
>  
> diff --git a/drivers/vfio/platform/msi/Kconfig b/drivers/vfio/platform/msi/Kconfig
> new file mode 100644
> index ..54d6b70e1e32
> --- /dev/null
> +++ b/drivers/vfio/platform/msi/Kconfig
> @@ -0,0 +1,9 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +config VFIO_PLATFORM_BCMPLT_MSI
> + tristate "MSI support for Broadcom platform devices"
> + depends on VFIO_PLATFORM && (ARCH_BCM_IPROC || COMPILE_TEST)
> + default ARCH_BCM_IPROC
> + help
> +   Enables the VFIO platform driver to handle msi for Broadcom devices
> +
> +   If you don't know what to do here, say N.
> diff --git a/drivers/vfio/platform/msi/Makefile b/drivers/vfio/platform/msi/Makefile
> new file mode 100644
> index ..27422d45cecb
> --- /dev/null
> +++ b/drivers/vfio/platform/msi/Makefile
> @@ -0,0 +1,2 @@
> +# SPDX-License-Identifier: GPL-2.0
> +obj-$(CONFIG_VFIO_PLATFORM_BCMPLT_MSI) += vfio_platform_bcmplt.o
> diff --git a/drivers/vfio/platform/msi/vfio_platform_bcmplt.c b/drivers/vfio/platform/msi/vfio_platform_bcmplt.c
> new file mode 100644
> index ..a074b5e92d77
> --- /dev/null
> +++ b/drivers/vfio/platform/msi/vfio_platform_bcmplt.c
> @@ -0,0 +1,49 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright 2020 Broadcom.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "../vfio_platform_private.h"
> +
> +#define RING_SIZE (64 << 10)
> +
> +#define RING_MSI_ADDR_LS 0x03c
> +#define RING_MSI_ADDR_MS 0x040
> +#define RING_MSI_DATA_VALUE  0x064
Those 3 defines would not be needed anymore with that implementation option.
> +
> +static u32 bcm_num_msi(struct vfio_platform_device *vdev)
> +{
> + struct vfio_platform_region *reg = &vdev->regions[0];
> +
> + return (reg->size / RING_SIZE);
> +}
> +
> +static struct vfio_platform_msi_node vfio_platform_bcmflexrm_msi_node = {
> + .owner = THIS_MODULE,
> + .compat = "brcm,iproc-flexrm-mbox",
> + .of_get_msi = bcm_num_msi,
> +};
> +
> +static int __init vfio_platform_bcmflexrm_msi_module_init(void)
> +{
> + __vfio_platform_register_msi(&vfio_platform_bcmflexrm_msi_node);
> +
> + return 0;
> +}
> +
> +static void __exit vfio_platform_bcmflexrm_msi_module_exit(void)
> +{
> + vfio_platform_unregister_msi("brcm,iproc-flexrm-mbox");
> +}
> +
> +module_init(vfio_platform_bcmflexrm_msi_module_init);
> +module_exit(vfio_platform_bcmflexrm_msi_module_exit);
One thing I would like to discuss with Alex.

As the reset module is mandated (except if reset_required is forced to
0), I am wondering if we shouldn't try to turn the reset module into a
"specialization" module and put the msi hooks there. I am afraid we may
end up having modules for each and every vfio platform feature
specialization. At the moment that's fully bearable but I can't predict
what's next.

As the mandated feature is the reset capability maybe we could just keep
the config/module name terminology, tune the kconfig help message to
mention the msi support in case of flex-rm?

What do you think?

Thanks

Eric




> +
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR("Broadcom");
> 



Re: [RFC v3 1/2] vfio/platform: add support for msi

2021-01-12 Thread Auger Eric
Hi Vikas,

On 1/5/21 6:53 AM, Vikas Gupta wrote:
> On Tue, Dec 22, 2020 at 10:57 PM Auger Eric  wrote:
>>
>> Hi Vikas,
>>
>> On 12/14/20 6:45 PM, Vikas Gupta wrote:
>>> MSI support for platform devices.  The MSI block
>>> is added as an extended IRQ which exports caps
>>> VFIO_IRQ_INFO_CAP_TYPE and VFIO_IRQ_INFO_CAP_MSI_DESCS.
>>>
>>> Signed-off-by: Vikas Gupta 
>>> ---
>>>  drivers/vfio/platform/vfio_platform_common.c  | 179 +++-
>>>  drivers/vfio/platform/vfio_platform_irq.c | 260 +-
>>>  drivers/vfio/platform/vfio_platform_private.h |  32 +++
>>>  include/uapi/linux/vfio.h |  44 +++
>>>  4 files changed, 496 insertions(+), 19 deletions(-)
>>>
>>> diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c
>>> index fb4b385191f2..c936852f35d7 100644
>>> --- a/drivers/vfio/platform/vfio_platform_common.c
>>> +++ b/drivers/vfio/platform/vfio_platform_common.c
>>> @@ -16,6 +16,7 @@
>>>  #include 
>>>  #include 
>>>  #include 
>>> +#include 
>>>
>>>  #include "vfio_platform_private.h"
>>>
>>> @@ -26,6 +27,8 @@
>>>  #define VFIO_PLATFORM_IS_ACPI(vdev) ((vdev)->acpihid != NULL)
>>>
>>>  static LIST_HEAD(reset_list);
>>> +/* devices having MSI support */
>> nit: for devices using MSIs?
>>> +static LIST_HEAD(msi_list);
>>>  static DEFINE_MUTEX(driver_lock);
>>>
>>>  static vfio_platform_reset_fn_t vfio_platform_lookup_reset(const char 
>>> *compat,
>>> @@ -47,6 +50,25 @@ static vfio_platform_reset_fn_t 
>>> vfio_platform_lookup_reset(const char *compat,
>>>   return reset_fn;
>>>  }
>>>
>>> +static bool vfio_platform_lookup_msi(struct vfio_platform_device *vdev)
>>> +{
>>> + bool has_msi = false;
>>> + struct vfio_platform_msi_node *iter;
>>> +
>>> + mutex_lock(&driver_lock);
>>> + list_for_each_entry(iter, &msi_list, link) {
>>> + if (!strcmp(iter->compat, vdev->compat) &&
>>> + try_module_get(iter->owner)) {
>>> + vdev->msi_module = iter->owner;
>>> + vdev->of_get_msi = iter->of_get_msi;
>>> + has_msi = true;
>>> + break;
>>> + }
>>> + }
>>> + mutex_unlock(&driver_lock);
>>> + return has_msi;
>>> +}
>>> +
>>>  static int vfio_platform_acpi_probe(struct vfio_platform_device *vdev,
>>>   struct device *dev)
>>>  {
>>> @@ -126,6 +148,19 @@ static int vfio_platform_get_reset(struct vfio_platform_device *vdev)
>>>   return vdev->of_reset ? 0 : -ENOENT;
>>>  }
>>>
>>> +static int vfio_platform_get_msi(struct vfio_platform_device *vdev)
>>> +{
>>> + bool has_msi;
>>> +
>>> + has_msi = vfio_platform_lookup_msi(vdev);
>>> + if (!has_msi) {
>>> + request_module("vfio-msi:%s", vdev->compat);
>>> + has_msi = vfio_platform_lookup_msi(vdev);
>>> + }
>>> +
>>> + return has_msi ? 0 : -ENOENT;
>>> +}
>>> +
>>>  static void vfio_platform_put_reset(struct vfio_platform_device *vdev)
>>>  {
>>>   if (VFIO_PLATFORM_IS_ACPI(vdev))
>>> @@ -135,6 +170,12 @@ static void vfio_platform_put_reset(struct vfio_platform_device *vdev)
>>>   module_put(vdev->reset_module);
>>>  }
>>>
>>> +static void vfio_platform_put_msi(struct vfio_platform_device *vdev)
>>> +{
>>> + if (vdev->of_get_msi)
>>> + module_put(vdev->msi_module);
>>> +}
>>> +
>>>  static int vfio_platform_regions_init(struct vfio_platform_device *vdev)
>>>  {
>>>   int cnt = 0, i;
>>> @@ -343,9 +384,17 @@ static long vfio_platform_ioctl(void *device_data,
>>>
>>>   } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
>>>   struct vfio_irq_info info;
>>> + struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
>>> + struct vfio_irq_info_cap_msi *msi_info = NULL;
>>> + int ext_irq_index = vdev->num_irqs - vdev->num_ext_irqs;
>>> + unsigned 

Re: [PATCH net-next 0/5] skbuff: introduce skbuff_heads bulking and reusing

2021-01-12 Thread Eric Dumazet
On Mon, Jan 11, 2021 at 7:27 PM Alexander Lobakin  wrote:
>
> Inspired by cpu_map_kthread_run() and _kfree_skb_defer() logics.
>
> Currently, all sorts of skb allocation always do allocate
> skbuff_heads one by one via kmem_cache_alloc().
> On the other hand, we have percpu napi_alloc_cache to store
> skbuff_heads queued up for freeing and flush them by bulks.
>
> We can use this struct to cache and bulk not only freeing, but also
> allocation of new skbuff_heads, as well as to reuse cached-to-free
> heads instead of allocating the new ones.
> As accessing napi_alloc_cache implies NAPI softirq context, do this
> only for __napi_alloc_skb() and its derivatives (napi_alloc_skb()
> and napi_get_frags()). The rough amount of their call sites are 69,
> which is quite a number.
>
> iperf3 showed a nice bump from 910 to 935 Mbits while performing
> UDP VLAN NAT on 1.2 GHz MIPS board. The boost is likely to be
> way bigger on more powerful hosts and NICs with tens of Mpps.

What is the latency cost of these bulk allocations, and for TCP traffic
on which GRO is the norm ?

Adding caches is increasing cache foot print when the cache is populated.

I wonder if your iperf3 numbers are simply wrong because of lack of
GRO in this UDP VLAN NAT case.

We are adding a lot of additional code, thus icache pressure, that
iperf3 tests can not really measure.

Most linux devices simply handle one packet at a time (one packet per interrupt)


Re: [PATCH 2/2] scsi: ufs: Remove unnecessary devm_kfree

2021-01-11 Thread Eric Biggers
On Mon, Jan 11, 2021 at 11:32:02PM +0100, Bean Huo wrote:
> From: Bean Huo 
> 
> The memory allocated with devm_kzalloc() is freed automatically
> no need to explicitly call devm_kfree.
> 
> Signed-off-by: Bean Huo 
> ---
>  drivers/scsi/ufs/ufshcd-crypto.c | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
> 
> diff --git a/drivers/scsi/ufs/ufshcd-crypto.c b/drivers/scsi/ufs/ufshcd-crypto.c
> index 07310b12a5dc..ec80ec83cf85 100644
> --- a/drivers/scsi/ufs/ufshcd-crypto.c
> +++ b/drivers/scsi/ufs/ufshcd-crypto.c
> @@ -182,7 +182,7 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba)
>   err = blk_ksm_init(&hba->ksm,
>  hba->crypto_capabilities.config_count + 1);
>   if (err)
> - goto out_free_caps;
> + goto out;
>  
>   hba->ksm.ksm_ll_ops = ufshcd_ksm_ops;
>   /* UFS only supports 8 bytes for any DUN */
> @@ -208,8 +208,6 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba *hba)
>  
>   return 0;
>  
> -out_free_caps:
> - devm_kfree(hba->dev, hba->crypto_cap_array);
>  out:
>   /* Indicate that init failed by clearing UFSHCD_CAP_CRYPTO */
>   hba->caps &= ~UFSHCD_CAP_CRYPTO;

Looks fine, feel free to add:

Reviewed-by: Eric Biggers 

I think this was here to free the memory in the case where the crypto support
gets disabled but the UFS host initialization still continues, so that the space
wouldn't be wasted.  But that's not what happens, as this is only reached on
ENOMEM which is a fatal error.

- Eric


Re: [PATCH v2 01/10] vfs: move cap_convert_nscap() call into vfs_setxattr()

2021-01-11 Thread Eric W. Biederman
Miklos Szeredi  writes:

> On Fri, Jan 01, 2021 at 11:35:16AM -0600, Eric W. Biederman wrote:
>> Miklos Szeredi  writes:
>> 
>> > cap_convert_nscap() does permission checking as well as conversion of the
>> > xattr value conditionally based on fs's user-ns.
>> >
>> > This is needed by overlayfs and probably other layered fs (ecryptfs) and is
>> > what vfs_foo() is supposed to do anyway.
>> 
>> Well crap.
>> 
>> I just noticed this and it turns out this change is wrong.
>> 
>> The problem is that it reads the rootid from the v3 fscap, using
>> current_user_ns() and then writes it using the sb->s_user_ns.
>> 
>> So any time the stacked filesystems sb->s_user_ns do not match or
>> current_user_ns does not match sb->s_user_ns this could be a problem.
>> 
>> In a stacked filesystem a second pass through vfs_setxattr will result
>> in the rootid being translated a second time (with potentially the wrong
>> namespaces).  I think because of the security checks a we won't write
>> something we shouldn't be able to write to the filesystem.  Still we
>> will be writing the wrong v3 fscap which can go quite badly.
>> 
>> This doesn't look terribly difficult to fix.
>> 
>> Probably convert this into a fs independent form using uids in
>> init_user_ns at input and have cap_convert_nscap convert the v3 fscap
>> into the filesystem dependent form.  With some way for stackable
>> filesystems to just skip converting it from the filesystem independent
>> format.
>> 
>> Uids in xattrs that are expected to go directly to disk, but aren't
>> always suitable for going directly to disk are tricky.
>
> I've been looking at this for a couple of days and can't say I clearly
> understand everything yet.
>
> For one: a v2 fscap is supposed to be equivalent to a v3 fscap with a rootid 
> of
> zero, right?

Yes.  This assumes that everything is translated into the uids of the
target filesystem.

> If so, why does cap_inode_getsecurity() treat them differently (v2 fscap
> succeeding unconditionally while v3 one being either converted to v2, rejected
> or left as v3 depending on current_user_ns())?

As I understand it v2 fscaps have always succeeded unconditionally.  The
only case I can see for a v2 fscap might not succeed when read is if the
filesystem is outside of the initial user namespace.


> Anyway, here's a patch that I think fixes getxattr() layering for
> security.capability.  Does basically what you suggested.  Slight change of
> semantics vs. v1 caps, not sure if that is still needed, getxattr()/setxattr()
> hasn't worked for these since the introduction of v3 in 4.14.
> Untested.

Taking a look.  The goal of change how these operate is to make it so
that layered filesystems can just pass through the data if they don't
want to change anything (even with the user namespaces of the
filesystems in question are different).

Feedback on the code below:
- cap_get should be in inode_operations like get_acl and set_acl.

- cap_get should return a cpu_vfs_cap_data.

  Which means that only make_kuid is needed when reading the cap from
  disk.

  Which means that except for the rootid_owns_currentns check (which
  needs to happen elsewhere) default_cap_get should be today's
  get_vfs_cap_from_disk.

- With the introduction of cap_get I believe commoncap should stop
  implementing the security_inode_getsecurity hook, and rather have
  getxattr observe is the file capability xatter and call the new
  vfs_cap_get then translate to a v2 or v3 cap as appropriate when
  returning the cap to userspace.

I think that would put the code on a solid comprehensible foundation.
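
A sketch of the shape being suggested (the .cap_get wiring is
hypothetical; get_vfs_caps_from_disk() is today's helper):

    /* per-filesystem, next to .get_acl/.set_acl */
    static int foofs_cap_get(struct dentry *dentry,
                             struct cpu_vfs_cap_data *cap)
    {
            return get_vfs_caps_from_disk(dentry, cap);
    }

    /* vfs_cap_get() would call ->cap_get if present, else the default,
     * and getxattr() would convert the result to a v2 or v3 xattr for
     * userspace */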

Eric


> I still need to wrap my head around the permission requirements for the
> setxattr() case...
>
> Thanks,
> Miklos
>
> ---
>  fs/overlayfs/super.c   |   15 +++
>  include/linux/capability.h |2 
>  include/linux/fs.h |1 
>  security/commoncap.c   |  210 -
>  4 files changed, 132 insertions(+), 96 deletions(-)
>
> --- a/fs/overlayfs/super.c
> +++ b/fs/overlayfs/super.c
> @@ -395,6 +395,20 @@ static int ovl_remount(struct super_bloc
>   return ret;
>  }
>  
> +static int ovl_cap_get(struct dentry *dentry,
> +struct vfs_ns_cap_data *nscap)
> +{
> + int res;
> + const struct cred *old_cred;
> + struct dentry *realdentry = ovl_dentry_real(dentry);
> +
> + old_cred = ovl_override_creds(dentry->d_sb);
> + res = vfs_cap_get(realdentry, nscap);
> + revert_creds(old_cred);
> +
> + return res;
> +}
> +
>  static const struct super_operations ovl_super_operations = {
>   .a

Re: [RFC PATCH v2 0/8] Count rlimits in each user namespace

2021-01-11 Thread Eric W. Biederman
Linus Torvalds  writes:

> On Sun, Jan 10, 2021 at 9:34 AM Alexey Gladkov  
> wrote:
>>
>> To address the problem, we bind rlimit counters to each user namespace. The
>> result is a tree of rlimit counters with the biggest value at the root (aka
>> init_user_ns). The rlimit counter increment/decrement occurs in the current 
>> and
>> all parent user namespaces.
>
> I'm not seeing why this is necessary.
>
> Maybe it's the right approach, but none of the patches (or this cover
> letter email) really explain it to me.
>
> I understand why you might want the _limits_ themselves would form a
> tree like this - with the "master limit" limiting the limits in the
> user namespaces under it.
>
> But I don't understand why the _counts_ should do that. The 'struct
> user_struct' should be shared across even user namespaces for the same
> user.
>
> IOW, the very example of the problem you quote seems to argue against this:
>
>> For example, there are two containers (A and B) created by one user. The
>> container A sets RLIMIT_NPROC=1 and starts one process. Everything is fine, 
>> but
>> when container B tries to do the same it will fail because the number of
>> processes is counted globally for each user and user has one process already.
>
> Note how the problem was _not_ that the _count_ was global. That part
> was fine and all good.

The problem is fundamentally that the per process RLIMIT_NPROC was
compared against the user_struct->processes.

I have only heard the problem described but I believe it is either the
RLIMIT_NPROC test in fork or at the beginning of do_execveat_common that
is failing.

>From fs/exec.c line 1866:
>   /*
>* We move the actual failure in case of RLIMIT_NPROC excess from
>* set*uid() to execve() because too many poorly written programs
>* don't check setuid() return code.  Here we additionally recheck
>* whether NPROC limit is still exceeded.
>*/
>   if ((current->flags & PF_NPROC_EXCEEDED) &&
>   atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
>   retval = -EAGAIN;
>   goto out_ret;
>   }

>From fs/fork.c line 1966:
>   retval = -EAGAIN;
>   if (atomic_read(&p->real_cred->user->processes) >=
>   task_rlimit(p, RLIMIT_NPROC)) {
>   if (p->real_cred->user != INIT_USER &&
>   !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
>   goto bad_fork_free;
>   }
>   current->flags &= ~PF_NPROC_EXCEEDED;

In both the cases the RLIMIT_NPROC value comes from
task->signal->rlim[RLIMIT_NPROC] and the count of processes
comes from task->cred->user->processes.

> No, the problem was that the _limit_ in container A also ended up
> affecting container B.

The description I have is that both containers run the same service
that set it's RLIMIT_NPROC to 1 in both containers.

> So to me, that says that it would make sense to continue to use the
> resource counts in 'struct user_struct' (because if user A has a hard
> limit of X, then creating a new namespace shouldn't expand that
> limit), but then have the ability to make per-container changes to the
> resource limits (as long as they are within the bounds of the parent
> user namespace resource limit).

I agree that needs to work as well.

> Maybe there is some reason for this ucounts approach, but if so, I
> feel it was not explained at all.

Let me see if I can state the example a little more clearly.

Suppose there is a service never_fork that runs as never_fork_user and
sets RLIMIT_NPROC to 1 in its systemd service file.

Further suppose there is a user bob who has two containers he wants to
run: container1 and container2.  Both containers start the never_fork
service.

Bob first starts container1 and inside it the never_fork service starts.
Bob starts container2 and the never_fork service fails to start.

Does that make it clear that it is the count of the processes that would
exceed 1 if both instances of the never_fork service start that would
be the problem?
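
For the record, "counted in the current and all parent user namespaces"
amounts to a walk like this (field and helper names are made up for
illustration; the actual series uses its own counter structures):

    static int inc_nproc(struct user_namespace *ns)
    {
            struct user_namespace *iter, *over = NULL;

            for (iter = ns; iter; iter = iter->parent) {
                    if (atomic_long_inc_return(&iter->nproc_count) >
                        nproc_limit(iter)) {
                            over = iter;
                            break;
                    }
            }
            if (!over)
                    return 0;

            /* roll back every increment, including the one that failed */
            for (iter = ns; iter != over->parent; iter = iter->parent)
                    atomic_long_dec(&iter->nproc_count);
            return -EAGAIN;
    }

With per-namespace counters, never_fork's count reaches 1 in each
container's namespace but 2 in their common ancestors, so a limit of 1
set inside each container no longer collides across containers.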

Eric


Re: Malicious fs images was Re: ext4 regression in v5.9-rc2 from e7bfb5c9bb3d on ro fs with overlapped bitmaps

2021-01-11 Thread Eric Biggers
On Mon, Jan 11, 2021 at 10:51:20AM -0800, Darrick J. Wong wrote:
> On Sun, Jan 10, 2021 at 07:41:02PM +0100, Pavel Machek wrote:
> > Hi!
> > 
> > On Fri 2020-10-09 10:37:32, Theodore Y. Ts'o wrote:
> > > On Thu, Oct 08, 2020 at 03:22:59PM -0700, Josh Triplett wrote:
> > > > 
> > > > I wasn't trying to make a *new* general principle or policy. I was under
> > > > the impression that this *was* the policy, because it never occurred to
> > > > me that it could be otherwise. It seemed like a natural aspect of the
> > > > kernel/userspace boundary, to the point that the idea of it *not* being
> > > > part of the kernel's stability guarantees didn't cross my mind. 
> > > 
> > > >From our perspective (and Darrick and I discussed this on this week's
> > > ext4 video conference, so it represents the ext4 and xfs maintainer's
> > > position) is that the file system format is different.  First, the
> > > on-disk format is not an ABI, and it is several orders more complex
> > > than a system call interface.  Second, we make no guarantees about
> > > what the file system created by malicious tools will do.  For example,
> > > XFS developers reject bug reports from file system fuzzers, because
> 
> My recollection of this is quite different -- syzbot was sending multiple
> zeroday exploits per week to the public xfs list, and nobody at Google
> was helping us to /fix/ those bugs.  Each report took hours of developer
> time to extract the malicious image (because Dmitry couldn't figure out
> how to add that ability to syzbot) and syscall sequence from the
> reproducer program, plus whatever time it took to craft a patch, test
> it, and push it through review.
> 
> Dave and I complained to Dmitry about how the community had zero input
> into the rate at which syzbot was allowed to look for xfs bugs.  Nobody
> at Google would commit to helping fix any of the XFS bugs, and Dmitry
> would not give us better choices than (a) Google AI continuing to create
> zerodays and leaving the community to clean up the mess, or (b) shutting
> off syzbot entirely.  At the time I said I would accept letting syzbot
> run against xfs until it finds something, and turning it off until we
> resolve the issue.  That wasn't acceptable, because (I guess) nobody at
> Google wants to put /any/ staff time into XFS at all.
> 
> TLDR: XFS /does/ accept bug reports about fuzzed and broken images.
> What we don't want is make-work Google AIs spraying zeroday code in
> public places and the community developers having to clean up the mess.

syzkaller is an open source project that implements a coverage-guided fuzzer for
multiple operating system kernels; it's not "Google AI".

Anyone can run syzkaller (either by itself, or as part of a syzbot instance) and
find the same bugs.

- Eric


Re: Re: [PATCH] evm: Fix memleak in init_desc

2021-01-09 Thread Eric Biggers
On Sun, Jan 10, 2021 at 01:27:09PM +0800, dinghao@zju.edu.cn wrote:
> > On Sat, Jan 09, 2021 at 07:33:05PM +0800, Dinghao Liu wrote:
> > > When kmalloc() fails, tmp_tfm allocated by
> > > crypto_alloc_shash() has not been freed, which
> > > leads to memleak.
> > > 
> > > Fixes: d46eb3699502b ("evm: crypto hash replaced by shash")
> > > Signed-off-by: Dinghao Liu 
> > > ---
> > >  security/integrity/evm/evm_crypto.c | 9 +++--
> > >  1 file changed, 7 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
> > > index 168c3b78ac47..39fb31a638ac 100644
> > > --- a/security/integrity/evm/evm_crypto.c
> > > +++ b/security/integrity/evm/evm_crypto.c
> > > @@ -73,7 +73,7 @@ static struct shash_desc *init_desc(char type, uint8_t hash_algo)
> > >  {
> > >   long rc;
> > >   const char *algo;
> > > - struct crypto_shash **tfm, *tmp_tfm;
> > > + struct crypto_shash **tfm, *tmp_tfm = NULL;
> > >   struct shash_desc *desc;
> > >  
> > >   if (type == EVM_XATTR_HMAC) {
> > > @@ -118,13 +118,18 @@ static struct shash_desc *init_desc(char type, uint8_t hash_algo)
> > >  alloc:
> > >   desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(*tfm),
> > >   GFP_KERNEL);
> > > - if (!desc)
> > > + if (!desc) {
> > > + if (tmp_tfm)
> > > + crypto_free_shash(tmp_tfm);
> > >   return ERR_PTR(-ENOMEM);
> > > + }
> > >  
> > >   desc->tfm = *tfm;
> > >  
> > >   rc = crypto_shash_init(desc);
> > >   if (rc) {
> > > + if (tmp_tfm)
> > > +     crypto_free_shash(tmp_tfm);
> > >   kfree(desc);
> > >   return ERR_PTR(rc);
> > >   }
> > 
> > There's no need to check for NULL before calling crypto_free_shash().
> > 
> 
> I find there is a crypto_shash_tfm() in the definition of 
> crypto_free_shash(). Will this lead to null pointer dereference
> when we use it to free a NULL pointer?
> 

No.  It does &tfm->base, not tfm->base.
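
For context, the helper in include/crypto/hash.h only forms an address:

    static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm)
    {
            return &tfm->base;
    }

Taking the address of a member never dereferences tfm, and
crypto_destroy_tfm() itself returns early when passed a NULL object, so
crypto_free_shash(NULL) is safe.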

- Eric


Re: [PATCH] x86/vm86/32: Remove VM86_SCREEN_BITMAP support

2021-01-09 Thread Eric W. Biederman
Andy Lutomirski  writes:

> The implementation was rather buggy.  It unconditionally marked PTEs
> read-only, even for VM_SHARED mappings.  I'm not sure whether this is
> actually a problem, but it certainly seems unwise.  More importantly, it
> released the mmap lock before flushing the TLB, which could allow a racing
> CoW operation to falsely believe that the underlying memory was not
> writable.
>
> I can't find any users at all of this mechanism, so just remove it.

In another age this was used by dosemu.  Have you looked at dosemu to
see if it still uses this support (on 32bit where dosemu can use vm86)?

It may still be a valid removal target I just wanted to point out what
the original user was.

Eric

> Cc: Andrea Arcangeli 
> Cc: Linux-MM 
> Cc: Jason Gunthorpe 
> Cc: x...@kernel.org
> Cc: Linus Torvalds 
> Cc: Matthew Wilcox 
> Cc: Jann Horn 
> Cc: Jan Kara 
> Cc: Yu Zhao 
> Cc: Peter Xu 
> Signed-off-by: Andy Lutomirski 
> ---
>  arch/x86/include/uapi/asm/vm86.h |  2 +-
>  arch/x86/kernel/vm86_32.c| 55 ++--
>  2 files changed, 10 insertions(+), 47 deletions(-)
>
> diff --git a/arch/x86/include/uapi/asm/vm86.h b/arch/x86/include/uapi/asm/vm86.h
> index d2ee4e307ef8..50004fb4590d 100644
> --- a/arch/x86/include/uapi/asm/vm86.h
> +++ b/arch/x86/include/uapi/asm/vm86.h
> @@ -106,7 +106,7 @@ struct vm86_struct {
>  /*
>   * flags masks
>   */
> -#define VM86_SCREEN_BITMAP   0x0001
> +#define VM86_SCREEN_BITMAP   0x0001/* no longer supported */
>  
>  struct vm86plus_info_struct {
>   unsigned long force_return_for_pic:1;
> diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
> index 764573de3996..28b9e8d511e1 100644
> --- a/arch/x86/kernel/vm86_32.c
> +++ b/arch/x86/kernel/vm86_32.c
> @@ -160,49 +160,6 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
>   do_exit(SIGSEGV);
>  }
>  
> -static void mark_screen_rdonly(struct mm_struct *mm)
> -{
> - struct vm_area_struct *vma;
> - spinlock_t *ptl;
> - pgd_t *pgd;
> - p4d_t *p4d;
> - pud_t *pud;
> - pmd_t *pmd;
> - pte_t *pte;
> - int i;
> -
> - mmap_write_lock(mm);
> - pgd = pgd_offset(mm, 0xA0000);
> - if (pgd_none_or_clear_bad(pgd))
> - goto out;
> - p4d = p4d_offset(pgd, 0xA0000);
> - if (p4d_none_or_clear_bad(p4d))
> - goto out;
> - pud = pud_offset(p4d, 0xA0000);
> - if (pud_none_or_clear_bad(pud))
> - goto out;
> - pmd = pmd_offset(pud, 0xA0000);
> -
> - if (pmd_trans_huge(*pmd)) {
> - vma = find_vma(mm, 0xA0000);
> - split_huge_pmd(vma, pmd, 0xA0000);
> - }
> - if (pmd_none_or_clear_bad(pmd))
> - goto out;
> - pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
> - for (i = 0; i < 32; i++) {
> - if (pte_present(*pte))
> - set_pte(pte, pte_wrprotect(*pte));
> - pte++;
> - }
> - pte_unmap_unlock(pte, ptl);
> -out:
> - mmap_write_unlock(mm);
> - flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
> -}
> -
> -
> -
>  static int do_vm86_irq_handling(int subfunction, int irqnumber);
>  static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
>  
> @@ -282,6 +239,15 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
>   offsetof(struct vm86_struct, int_revectored)))
>   return -EFAULT;
>  
> +
> + /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */
> + if (v.flags & VM86_SCREEN_BITMAP) {
> + char comm[TASK_COMM_LEN];
> +
> + pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current));
> + return -EINVAL;
> + }
> +
>   memset(, 0, sizeof(vm86regs));
>  
>   vm86regs.pt.bx = v.regs.ebx;
> @@ -370,9 +336,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
>   update_task_stack(tsk);
>   preempt_enable();
>  
> - if (vm86->flags & VM86_SCREEN_BITMAP)
> - mark_screen_rdonly(tsk->mm);
> -
>   memcpy((struct kernel_vm86_regs *)regs, , sizeof(vm86regs));
>   return regs->ax;
>  }

