Re: [PATCH net] tcp: make TCP_USER_TIMEOUT accurate for zero window probes

2021-01-22 Thread Eric Dumazet
On Fri, Jan 22, 2021 at 8:13 PM Enke Chen  wrote:
>
> From: Enke Chen 
>
> The TCP_USER_TIMEOUT is checked by the 0-window probe timer. As the
> timer has backoff with a max interval of about two minutes, the
> actual timeout for TCP_USER_TIMEOUT can be off by up to two minutes.
>
> In this patch the TCP_USER_TIMEOUT is made more accurate by taking it
> into account when computing the timer value for the 0-window probes.
>
> This patch is similar to the one that made TCP_USER_TIMEOUT accurate for
> RTOs in commit b701a99e431d ("tcp: Add tcp_clamp_rto_to_user_timeout()
> helper to improve accuracy").
>
> Signed-off-by: Enke Chen 
> Reviewed-by: Neal Cardwell 
> ---

SGTM, thanks !

Signed-off-by: Eric Dumazet 


[PATCH v5 4/4] integrity: Load mokx variables into the blacklist keyring

2021-01-22 Thread Eric Snowberg
During boot the Secure Boot Forbidden Signature Database, dbx,
is loaded into the blacklist keyring.  Systems booted with shim
have an equivalent Forbidden Signature Database called mokx.
Currently mokx is only used by shim and grub, the contents are
ignored by the kernel.

Add the ability to load mokx into the blacklist keyring during boot.

Signed-off-by: Eric Snowberg 
Suggested-by: James Bottomley 
---
 security/integrity/platform_certs/load_uefi.c | 20 +--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/security/integrity/platform_certs/load_uefi.c 
b/security/integrity/platform_certs/load_uefi.c
index ee4b4c666854..f290f78c3f30 100644
--- a/security/integrity/platform_certs/load_uefi.c
+++ b/security/integrity/platform_certs/load_uefi.c
@@ -132,8 +132,9 @@ static int __init load_moklist_certs(void)
 static int __init load_uefi_certs(void)
 {
efi_guid_t secure_var = EFI_IMAGE_SECURITY_DATABASE_GUID;
-   void *db = NULL, *dbx = NULL;
-   unsigned long dbsize = 0, dbxsize = 0;
+   efi_guid_t mok_var = EFI_SHIM_LOCK_GUID;
+   void *db = NULL, *dbx = NULL, *mokx = NULL;
+   unsigned long dbsize = 0, dbxsize = 0, mokxsize = 0;
efi_status_t status;
int rc = 0;
 
@@ -175,6 +176,21 @@ static int __init load_uefi_certs(void)
kfree(dbx);
}
 
+   mokx = get_cert_list(L"MokListXRT", &mok_var, &mokxsize, &status);
+   if (!mokx) {
+   if (status == EFI_NOT_FOUND)
+   pr_debug("mokx variable wasn't found\n");
+   else
+   pr_info("Couldn't get mokx list\n");
+   } else {
+   rc = parse_efi_signature_list("UEFI:MokListXRT",
+ mokx, mokxsize,
+ get_handler_for_dbx);
+   if (rc)
+   pr_err("Couldn't parse mokx signatures %d\n", rc);
+   kfree(mokx);
+   }
+
/* Load the MokListRT certs */
rc = load_moklist_certs();
 
-- 
2.18.4



[PATCH v5 1/4] certs: Add EFI_CERT_X509_GUID support for dbx entries

2021-01-22 Thread Eric Snowberg
This fixes CVE-2020-26541.

The Secure Boot Forbidden Signature Database, dbx, contains a list of now
revoked signatures and keys previously approved to boot with UEFI Secure
Boot enabled.  The dbx is capable of containing any number of
EFI_CERT_X509_SHA256_GUID, EFI_CERT_SHA256_GUID, and EFI_CERT_X509_GUID
entries.

Currently when EFI_CERT_X509_GUID are contained in the dbx, the entries are
skipped.

Add support for EFI_CERT_X509_GUID dbx entries. When a EFI_CERT_X509_GUID
is found, it is added as an asymmetrical key to the .blacklist keyring.
Anytime the .platform keyring is used, the keys in the .blacklist keyring
are referenced, if a matching key is found, the key will be rejected.

Signed-off-by: Eric Snowberg 
Reviewed-by: Jarkko Sakkinen 
Signed-off-by: David Howells 
---
v5: Function name changes done by David Howells
---
 certs/blacklist.c | 32 +++
 certs/blacklist.h | 12 +++
 certs/system_keyring.c|  6 
 include/keys/system_keyring.h | 11 +++
 .../platform_certs/keyring_handler.c  | 11 +++
 5 files changed, 72 insertions(+)

diff --git a/certs/blacklist.c b/certs/blacklist.c
index 6514f9ebc943..a7f021878a4b 100644
--- a/certs/blacklist.c
+++ b/certs/blacklist.c
@@ -100,6 +100,38 @@ int mark_hash_blacklisted(const char *hash)
return 0;
 }
 
+int add_key_to_revocation_list(const char *data, size_t size)
+{
+   key_ref_t key;
+
+   key = key_create_or_update(make_key_ref(blacklist_keyring, true),
+  "asymmetric",
+  NULL,
+  data,
+  size,
+  ((KEY_POS_ALL & ~KEY_POS_SETATTR) | 
KEY_USR_VIEW),
+  KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN);
+
+   if (IS_ERR(key)) {
+   pr_err("Problem with revocation key (%ld)\n", PTR_ERR(key));
+   return PTR_ERR(key);
+   }
+
+   return 0;
+}
+
+int is_key_on_revocation_list(struct pkcs7_message *pkcs7)
+{
+   int ret;
+
+   ret = validate_trust(pkcs7, blacklist_keyring);
+
+   if (ret == 0)
+   return -EKEYREJECTED;
+
+   return -ENOKEY;
+}
+
 /**
  * is_hash_blacklisted - Determine if a hash is blacklisted
  * @hash: The hash to be checked as a binary blob
diff --git a/certs/blacklist.h b/certs/blacklist.h
index 1efd6fa0dc60..420bb7c86e07 100644
--- a/certs/blacklist.h
+++ b/certs/blacklist.h
@@ -1,3 +1,15 @@
 #include 
+#include 
+#include 
 
 extern const char __initconst *const blacklist_hashes[];
+
+#ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING
+#define validate_trust pkcs7_validate_trust
+#else
+static inline int validate_trust(struct pkcs7_message *pkcs7,
+struct key *trust_keyring)
+{
+   return -ENOKEY;
+}
+#endif
diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index 798291177186..cc165b359ea3 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -241,6 +241,12 @@ int verify_pkcs7_message_sig(const void *data, size_t len,
pr_devel("PKCS#7 platform keyring is not available\n");
goto error;
}
+
+   ret = is_key_on_revocation_list(pkcs7);
+   if (ret != -ENOKEY) {
+   pr_devel("PKCS#7 platform key is on revocation list\n");
+   goto error;
+   }
}
ret = pkcs7_validate_trust(pkcs7, trusted_keys);
if (ret < 0) {
diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h
index fb8b07daa9d1..61f98739e8b1 100644
--- a/include/keys/system_keyring.h
+++ b/include/keys/system_keyring.h
@@ -31,11 +31,14 @@ extern int restrict_link_by_builtin_and_secondary_trusted(
 #define restrict_link_by_builtin_and_secondary_trusted 
restrict_link_by_builtin_trusted
 #endif
 
+extern struct pkcs7_message *pkcs7;
 #ifdef CONFIG_SYSTEM_BLACKLIST_KEYRING
 extern int mark_hash_blacklisted(const char *hash);
+extern int add_key_to_revocation_list(const char *data, size_t size);
 extern int is_hash_blacklisted(const u8 *hash, size_t hash_len,
   const char *type);
 extern int is_binary_blacklisted(const u8 *hash, size_t hash_len);
+extern int is_key_on_revocation_list(struct pkcs7_message *pkcs7);
 #else
 static inline int is_hash_blacklisted(const u8 *hash, size_t hash_len,
  const char *type)
@@ -47,6 +50,14 @@ static inline int is_binary_blacklisted(const u8 *hash, 
size_t hash_len)
 {
return 0;
 }
+static inline int add_key_to_revocation_list(const char *data, size_t size)
+{
+   return 0;
+}
+static inline int is_key_on_revocation_list(struct pkcs7_message *pkcs7)
+{
+   return -ENOKEY;
+}
 #endif
 
 #ifdef CONFIG_IMA_BL

[PATCH v5 0/4] Add EFI_CERT_X509_GUID support for dbx/mokx entries

2021-01-22 Thread Eric Snowberg
This is the fifth patch series for adding support for 
EFI_CERT_X509_GUID entries [1].  It has been expanded to not only include
dbx entries but also entries in the mokx.  Additionally my series to
preload these certificate [2] has also been included.

This series is based on v5.11-rc4.

[1] 
https://patchwork.kernel.org/project/linux-security-module/patch/20200916004927.64276-1-eric.snowb...@oracle.com/
[2] https://lore.kernel.org/patchwork/cover/1315485/

Eric Snowberg (4):
  certs: Add EFI_CERT_X509_GUID support for dbx entries
  certs: Move load_system_certificate_list to a common function
  certs: Add ability to preload revocation certs
  integrity: Load mokx variables into the blacklist keyring

 certs/Kconfig |  8 +++
 certs/Makefile| 20 ++-
 certs/blacklist.c | 49 
 certs/blacklist.h | 12 
 certs/common.c| 56 +++
 certs/common.h|  9 +++
 certs/revocation_certificates.S   | 21 +++
 certs/system_keyring.c| 55 +++---
 include/keys/system_keyring.h | 11 
 scripts/Makefile  |  1 +
 .../platform_certs/keyring_handler.c  | 11 
 security/integrity/platform_certs/load_uefi.c | 20 ++-
 12 files changed, 222 insertions(+), 51 deletions(-)
 create mode 100644 certs/common.c
 create mode 100644 certs/common.h
 create mode 100644 certs/revocation_certificates.S


base-commit: 19c329f6808995b142b3966301f217c831e7cf31
-- 
2.18.4



[PATCH v5 3/4] certs: Add ability to preload revocation certs

2021-01-22 Thread Eric Snowberg
Add a new Kconfig option called SYSTEM_REVOCATION_KEYS. If set,
this option should be the filename of a PEM-formated file containing
X.509 certificates to be included in the default blacklist keyring.

Signed-off-by: Eric Snowberg 
Acked-by: Jarkko Sakkinen 
---
 certs/Kconfig   |  8 
 certs/Makefile  | 18 --
 certs/blacklist.c   | 17 +
 certs/revocation_certificates.S | 21 +
 scripts/Makefile|  1 +
 5 files changed, 63 insertions(+), 2 deletions(-)
 create mode 100644 certs/revocation_certificates.S

diff --git a/certs/Kconfig b/certs/Kconfig
index c94e93d8bccf..379a6e198459 100644
--- a/certs/Kconfig
+++ b/certs/Kconfig
@@ -83,4 +83,12 @@ config SYSTEM_BLACKLIST_HASH_LIST
  wrapper to incorporate the list into the kernel.  Each  should
  be a string of hex digits.
 
+config SYSTEM_REVOCATION_KEYS
+   string "X.509 certificates to be preloaded into the system blacklist 
keyring"
+   depends on SYSTEM_BLACKLIST_KEYRING
+   help
+ If set, this option should be the filename of a PEM-formatted file
+ containing X.509 certificates to be included in the default blacklist
+ keyring.
+
 endmenu
diff --git a/certs/Makefile b/certs/Makefile
index f4b90bad8690..e3f4926fd21e 100644
--- a/certs/Makefile
+++ b/certs/Makefile
@@ -4,7 +4,7 @@
 #
 
 obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o 
common.o
-obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist.o
+obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist.o 
revocation_certificates.o common.o
 ifneq ($(CONFIG_SYSTEM_BLACKLIST_HASH_LIST),"")
 obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist_hashes.o
 else
@@ -29,7 +29,7 @@ $(obj)/x509_certificate_list: scripts/extract-cert 
$(SYSTEM_TRUSTED_KEYS_SRCPREF
$(call 
if_changed,extract_certs,$(SYSTEM_TRUSTED_KEYS_SRCPREFIX)$(CONFIG_SYSTEM_TRUSTED_KEYS))
 endif # CONFIG_SYSTEM_TRUSTED_KEYRING
 
-clean-files := x509_certificate_list .x509.list
+clean-files := x509_certificate_list .x509.list x509_revocation_list
 
 ifeq ($(CONFIG_MODULE_SIG),y)
 ###
@@ -104,3 +104,17 @@ targets += signing_key.x509
 $(obj)/signing_key.x509: scripts/extract-cert $(X509_DEP) FORCE
$(call 
if_changed,extract_certs,$(MODULE_SIG_KEY_SRCPREFIX)$(CONFIG_MODULE_SIG_KEY))
 endif # CONFIG_MODULE_SIG
+
+ifeq ($(CONFIG_SYSTEM_BLACKLIST_KEYRING),y)
+
+$(eval $(call config_filename,SYSTEM_REVOCATION_KEYS))
+
+$(obj)/revocation_certificates.o: $(obj)/x509_revocation_list
+
+quiet_cmd_extract_certs  = EXTRACT_CERTS   $(patsubst "%",%,$(2))
+  cmd_extract_certs  = scripts/extract-cert $(2) $@
+
+targets += x509_revocation_list
+$(obj)/x509_revocation_list: scripts/extract-cert 
$(SYSTEM_REVOCATION_KEYS_SRCPREFIX)$(SYSTEM_REVOCATION_KEYS_FILENAME) FORCE
+   $(call 
if_changed,extract_certs,$(SYSTEM_REVOCATION_KEYS_SRCPREFIX)$(CONFIG_SYSTEM_REVOCATION_KEYS))
+endif
diff --git a/certs/blacklist.c b/certs/blacklist.c
index a7f021878a4b..4e8a1068adb2 100644
--- a/certs/blacklist.c
+++ b/certs/blacklist.c
@@ -16,9 +16,13 @@
 #include 
 #include 
 #include "blacklist.h"
+#include "common.h"
 
 static struct key *blacklist_keyring;
 
+extern __initconst const u8 revocation_certificate_list[];
+extern __initconst const unsigned long revocation_certificate_list_size;
+
 /*
  * The description must be a type prefix, a colon and then an even number of
  * hex digits.  The hash is kept in the description.
@@ -209,3 +213,16 @@ static int __init blacklist_init(void)
  * Must be initialised before we try and load the keys into the keyring.
  */
 device_initcall(blacklist_init);
+
+/*
+ * Load the compiled-in list of revocation X.509 certificates.
+ */
+static __init int load_revocation_certificate_list(void)
+{
+   if (revocation_certificate_list_size)
+   pr_notice("Loading compiled-in revocation X.509 
certificates\n");
+
+   return load_certificate_list(revocation_certificate_list, 
revocation_certificate_list_size,
+blacklist_keyring);
+}
+late_initcall(load_revocation_certificate_list);
diff --git a/certs/revocation_certificates.S b/certs/revocation_certificates.S
new file mode 100644
index ..f21aae8a8f0e
--- /dev/null
+++ b/certs/revocation_certificates.S
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include 
+#include 
+
+   __INITRODATA
+
+   .align 8
+   .globl revocation_certificate_list
+revocation_certificate_list:
+__revocation_list_start:
+   .incbin "certs/x509_revocation_list"
+__revocation_list_end:
+
+   .align 8
+   .globl revocation_certificate_list_size
+revocation_certificate_list_size:
+#ifdef CONFIG_64BIT
+   .quad __revocation_list_end - __revocation_list_start
+#else
+   

[PATCH v5 2/4] certs: Move load_system_certificate_list to a common function

2021-01-22 Thread Eric Snowberg
Move functionality within load_system_certificate_list to a common
function, so it can be reused in the future.

Signed-off-by: Eric Snowberg 
Acked-by: Jarkko Sakkinen 
---
 certs/Makefile |  2 +-
 certs/common.c | 56 ++
 certs/common.h |  9 +++
 certs/system_keyring.c | 49 +++-
 4 files changed, 69 insertions(+), 47 deletions(-)
 create mode 100644 certs/common.c
 create mode 100644 certs/common.h

diff --git a/certs/Makefile b/certs/Makefile
index f4c25b67aad9..f4b90bad8690 100644
--- a/certs/Makefile
+++ b/certs/Makefile
@@ -3,7 +3,7 @@
 # Makefile for the linux kernel signature checking certificates.
 #
 
-obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
+obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o 
common.o
 obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist.o
 ifneq ($(CONFIG_SYSTEM_BLACKLIST_HASH_LIST),"")
 obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist_hashes.o
diff --git a/certs/common.c b/certs/common.c
new file mode 100644
index ..83800f51a1a1
--- /dev/null
+++ b/certs/common.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include 
+#include 
+
+int load_certificate_list(const u8 cert_list[],
+ const unsigned long list_size,
+ const struct key *keyring)
+{
+   key_ref_t key;
+   const u8 *p, *end;
+   size_t plen;
+
+   p = cert_list;
+   end = p + list_size;
+   while (p < end) {
+   /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
+* than 256 bytes in size.
+*/
+   if (end - p < 4)
+   goto dodgy_cert;
+   if (p[0] != 0x30 &&
+   p[1] != 0x82)
+   goto dodgy_cert;
+   plen = (p[2] << 8) | p[3];
+   plen += 4;
+   if (plen > end - p)
+   goto dodgy_cert;
+
+   key = key_create_or_update(make_key_ref(keyring, 1),
+  "asymmetric",
+  NULL,
+  p,
+  plen,
+  ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+  KEY_USR_VIEW | KEY_USR_READ),
+  KEY_ALLOC_NOT_IN_QUOTA |
+  KEY_ALLOC_BUILT_IN |
+  KEY_ALLOC_BYPASS_RESTRICTION);
+   if (IS_ERR(key)) {
+   pr_err("Problem loading in-kernel X.509 certificate 
(%ld)\n",
+  PTR_ERR(key));
+   } else {
+   pr_notice("Loaded X.509 cert '%s'\n",
+ key_ref_to_ptr(key)->description);
+   key_ref_put(key);
+   }
+   p += plen;
+   }
+
+   return 0;
+
+dodgy_cert:
+   pr_err("Problem parsing in-kernel X.509 certificate list\n");
+   return 0;
+}
diff --git a/certs/common.h b/certs/common.h
new file mode 100644
index ..abdb5795936b
--- /dev/null
+++ b/certs/common.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _CERT_COMMON_H
+#define _CERT_COMMON_H
+
+int load_certificate_list(const u8 cert_list[], const unsigned long list_size,
+ const struct key *keyring);
+
+#endif
diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index cc165b359ea3..a44a8915c94c 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include "common.h"
 
 static struct key *builtin_trusted_keys;
 #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING
@@ -136,54 +137,10 @@ device_initcall(system_trusted_keyring_init);
  */
 static __init int load_system_certificate_list(void)
 {
-   key_ref_t key;
-   const u8 *p, *end;
-   size_t plen;
-
pr_notice("Loading compiled-in X.509 certificates\n");
 
-   p = system_certificate_list;
-   end = p + system_certificate_list_size;
-   while (p < end) {
-   /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
-* than 256 bytes in size.
-*/
-   if (end - p < 4)
-   goto dodgy_cert;
-   if (p[0] != 0x30 &&
-   p[1] != 0x82)
-   goto dodgy_cert;
-   plen = (p[2] << 8) | p[3];
-   plen += 4;
-   if (plen > end - p)
-   goto dodgy_cert;
-
-   key = key_create_or_update(make_key_re

Re: [RFC PATCH v3 1/8] Use refcount_t for ucounts reference counting

2021-01-21 Thread Eric W. Biederman
Alexey Gladkov  writes:

> On Tue, Jan 19, 2021 at 07:57:36PM -0600, Eric W. Biederman wrote:
>> Alexey Gladkov  writes:
>> 
>> > On Mon, Jan 18, 2021 at 12:34:29PM -0800, Linus Torvalds wrote:
>> >> On Mon, Jan 18, 2021 at 11:46 AM Alexey Gladkov
>> >>  wrote:
>> >> >
>> >> > Sorry about that. I thought that this code is not needed when switching
>> >> > from int to refcount_t. I was wrong.
>> >> 
>> >> Well, you _may_ be right. I personally didn't check how the return
>> >> value is used.
>> >> 
>> >> I only reacted to "it certainly _may_ be used, and there is absolutely
>> >> no comment anywhere about why it wouldn't matter".
>> >
>> > I have not found examples where checked the overflow after calling
>> > refcount_inc/refcount_add.
>> >
>> > For example in kernel/fork.c:2298 :
>> >
>> >current->signal->nr_threads++;   
>> >atomic_inc(¤t->signal->live);  
>> >refcount_inc(¤t->signal->sigcnt);  
>> >
>> > $ semind search signal_struct.sigcnt
>> > def include/linux/sched/signal.h:83refcount_t  
>> > sigcnt;
>> > m-- kernel/fork.c:723 put_signal_structif 
>> > (refcount_dec_and_test(&sig->sigcnt))
>> > m-- kernel/fork.c:1571 copy_signal 
>> > refcount_set(&sig->sigcnt, 1);
>> > m-- kernel/fork.c:2298 copy_process
>> > refcount_inc(¤t->signal->sigcnt);
>> >
>> > It seems to me that the only way is to use __refcount_inc and then compare
>> > the old value with REFCOUNT_MAX
>> >
>> > Since I have not seen examples of such checks, I thought that this is
>> > acceptable. Sorry once again. I have not tried to hide these changes.
>> 
>> The current ucount code does check for overflow and fails the increment
>> in every case.
>> 
>> So arguably it will be a regression and inferior error handling behavior
>> if the code switches to the ``better'' refcount_t data structure.
>> 
>> I originally didn't use refcount_t because silently saturating and not
>> bothering to handle the error makes me uncomfortable.
>> 
>> Not having to acquire the ucounts_lock every time seems nice.  Perhaps
>> the path forward would be to start with stupid/correct code that always
>> takes the ucounts_lock for every increment of ucounts->count, that is
>> later replaced with something more optimal.
>> 
>> Not impacting performance in the non-namespace cases and having good
>> performance in the other cases is a fundamental requirement of merging
>> code like this.
>
> Did I understand your suggestion correctly that you suggest to use
> spin_lock for atomic_read and atomic_inc ?
>
> If so, then we are already incrementing the counter under ucounts_lock.
>
>   ...
>   if (atomic_read(&ucounts->count) == INT_MAX)
>   ucounts = NULL;
>   else
>   atomic_inc(&ucounts->count);
>   spin_unlock_irq(&ucounts_lock);
>   return ucounts;
>
> something like this ?

Yes.  But without atomics.  Something a bit more like:
>   ...
>   if (ucounts->count == INT_MAX)
>   ucounts = NULL;
>   else
>   ucounts->count++;
>   spin_unlock_irq(&ucounts_lock);
>   return ucounts;

I do believe at some point we will want to say using the spin_lock for
ucounts->count is cumbersome, and suboptimal and we want to change it to
get a better performing implementation.

Just for getting the semantics correct we should be able to use just
ucounts_lock for locking.  Then when everything is working we can
profile and optimize the code.

I just don't want figuring out what is needed to get hung up over little
details that we can change later.

Eric



[RFC][PATCH] apparmor: Enforce progressively tighter permissions for no_new_privs

2021-01-20 Thread Eric W. Biederman


The current understanding of apparmor with respect to no_new_privs is at
odds with how no_new_privs is implemented and understood by the rest of
the kernel.

The documentation of no_new_privs states:
> With ``no_new_privs`` set, ``execve()`` promises not to grant the
> privilege to do anything that could not have been done without the
> execve call.

And reading through the kernel except for apparmor that description
matches what is implemented.

There are two major divergences of apparmor from this definition:
- proc_setattr enforces limitations when no_new_privs are set.
- the limitation is enforced from the apparent time when no_new_privs is
  set instead of guaranteeing that each execve has progressively more
  narrow permissions.

The code in apparmor that attempts to discover the apparmor label at the
point where no_new_privs is set is not robust.  The capture happens a
long time after no_new_privs is set.

Capturing the label at the point where no_new_privs is set is
practically impossible to implement robustly.  Today the rule is struct
cred can only be changed by it's current task.  Today
SECCOMP_FILTER_FLAG_TSYNC sets no_new_privs from another thread.  A
robust implementation would require changing something fundamental in
how creds are managed for SECCOMP_FILTER_FLAG_TSYNC to be able to
capture the cred at the point it is set.

Futhermore given the consistent documentation and how everything else
implements no_new_privs, not having the permissions get progressively
tighter is a footgun aimed at userspace.  I fully expect it to break any
security sensitive software that uses no_new_privs and was not
deliberately designed and tested against apparmor.

Avoid the questionable and hard to fix implementation and the
potential to confuse userspace by having no_new_privs enforce
progressinvely tighter permissions.

Fixes: 9fcf78cca198 ("apparmor: update domain transitions that are subsets of 
confinement at nnp")
Signed-off-by: Eric W. Biederman 
---

I came accross this while examining the places cred_guard_mutex is
used and trying to find a way to make those code paths less insane.

If it would be more pallatable I would not mind removing the
task_no_new_privs test entirely from aa_change_hat and aa_change_profile
as those are not part of exec, so arguably no_new_privs should not care
about them at all.

Can we please get rid of the huge semantic wart and pain in the implementation?

 security/apparmor/domain.c   | 39 
 security/apparmor/include/task.h |  4 
 security/apparmor/task.c |  7 --
 3 files changed, 4 insertions(+), 46 deletions(-)

diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index f919ebd042fd..8f77059bf890 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -869,17 +869,6 @@ int apparmor_bprm_creds_for_exec(struct linux_binprm *bprm)
 
label = aa_get_newest_label(cred_label(bprm->cred));
 
-   /*
-* Detect no new privs being set, and store the label it
-* occurred under. Ideally this would happen when nnp
-* is set but there isn't a good way to do that yet.
-*
-* Testing for unconfined must be done before the subset test
-*/
-   if ((bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS) && !unconfined(label) &&
-   !ctx->nnp)
-   ctx->nnp = aa_get_label(label);
-
/* buffer freed below, name is pointer into buffer */
buffer = aa_get_buffer(false);
if (!buffer) {
@@ -915,7 +904,7 @@ int apparmor_bprm_creds_for_exec(struct linux_binprm *bprm)
 */
if ((bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS) &&
!unconfined(label) &&
-   !aa_label_is_unconfined_subset(new, ctx->nnp)) {
+   !aa_label_is_unconfined_subset(new, label)) {
error = -EPERM;
info = "no new privs";
goto audit;
@@ -1158,16 +1147,6 @@ int aa_change_hat(const char *hats[], int count, u64 
token, int flags)
label = aa_get_newest_cred_label(cred);
previous = aa_get_newest_label(ctx->previous);
 
-   /*
-* Detect no new privs being set, and store the label it
-* occurred under. Ideally this would happen when nnp
-* is set but there isn't a good way to do that yet.
-*
-* Testing for unconfined must be done before the subset test
-*/
-   if (task_no_new_privs(current) && !unconfined(label) && !ctx->nnp)
-   ctx->nnp = aa_get_label(label);
-
if (unconfined(label)) {
info = "unconfined can not change_hat";
error = -EPERM;
@@ -1193,7 +1172,7 @@ int aa_change_hat(const char *hats[], int count, u64 
token, int flags)
 * reduce restrictions.
 */

Re: [RFC][PATCH] apparmor: Enforce progressively tighter permissions for no_new_privs

2021-01-20 Thread Eric W. Biederman


TL;DR selinux and apparmor ignore no_new_privs

What?


John Johansen  writes:

> On 1/20/21 1:26 PM, Eric W. Biederman wrote:
>> 
>> The current understanding of apparmor with respect to no_new_privs is at
>> odds with how no_new_privs is implemented and understood by the rest of
>> the kernel.
>> 
>> The documentation of no_new_privs states:
>>> With ``no_new_privs`` set, ``execve()`` promises not to grant the
>>> privilege to do anything that could not have been done without the
>>> execve call.
>> 
>> And reading through the kernel except for apparmor that description
>> matches what is implemented.
>> 
>
> That is not correct.
>
> commit 7b0d0b40cd78 ("selinux: Permit bounded transitions under
> NO_NEW_PRIVS or NOSUID.")
>
> Allows for bound transitions under selinux
> and

As I understand a bound transition it is a transition to a state with
a set of permissions that are a subset of what was previously held.
Which is consistent with the mandate of no_new_privs.

> commit af63f4193f9f selinux: Generalize support for NNP/nosuid SELinux
> domain transitions
>
> goes further and "Decouple NNP/nosuid from SELinux transitions".

Yes.  Looking at that commit I do see that selinux appears to be
deliberately ignoring no_new_privs in specific cases.

WTF.

>> There are two major divergences of apparmor from this definition:
>> - proc_setattr enforces limitations when no_new_privs are set.
>> - the limitation is enforced from the apparent time when no_new_privs is
>>   set instead of guaranteeing that each execve has progressively more
>>   narrow permissions.
>> 
>> The code in apparmor that attempts to discover the apparmor label at the
>> point where no_new_privs is set is not robust.  The capture happens a
>> long time after no_new_privs is set.
>> 
>
> yes, but that shouldn't matter. As apparmor has not changed its label
> at any point between when no_new_privs was set and when the check is
> done. AppArmor is attempting to change it label, and if it finds NNP
> has been set we capture what the confinement was.
>
>> Capturing the label at the point where no_new_privs is set is
>> practically impossible to implement robustly.  Today the rule is struct
>> cred can only be changed by it's current task.  Today
>
> right, and apparmor only ever has the task update its own label.
>
>> SECCOMP_FILTER_FLAG_TSYNC sets no_new_privs from another thread.  A
>> robust implementation would require changing something fundamental in
>> how creds are managed for SECCOMP_FILTER_FLAG_TSYNC to be able to
>> capture the cred at the point it is set.
>> 
> I am open to supporting something like that.

I can't see how it would be possible to be robust without completely
changing the locking.  Locking that right now in a simpler model we have
not figured out how to make obviously correct.

>> Futhermore given the consistent documentation and how everything else
>> implements no_new_privs, not having the permissions get progressively
>
> Again see above

Except where selinux deliberately ignores no_new_privs this is
consitent.

>> tighter is a footgun aimed at userspace.  I fully expect it to break any
>
> tighter is somewhat relative, nor is it only progressively tighter it
> is bounded against the snapshot of the label that was on the task.

Which is the BUG I am reporting.  It should be progressingly tighter.

>> security sensitive software that uses no_new_privs and was not
>> deliberately designed and tested against apparmor.
>> 
>
> Currently the situation has become either an either or choice between
> the LSM and NNP. We are trying to walk a balance. Ideally apparmor
> would like to do something similar to selinux and decouple the label
> transition from NNP and nosuid via an internal capability, but we
> have not gone there yet.

Why do you need to escape no_new_privs.  Why does anyone need to escape
no_new_privs?

>> Avoid the questionable and hard to fix implementation and the
>> potential to confuse userspace by having no_new_privs enforce
>> progressinvely tighter permissions.
>> 
>
> This would completely break several use cases.

Enforcing no_new_privs as documented would break userspace?

Isn't the opposite true that you are breaking people by not enforcing
it?

>> Fixes: 9fcf78cca198 ("apparmor: update domain transitions that are subsets 
>> of confinement at nnp")
>> Signed-off-by: Eric W. Biederman 
>> ---
>> 
>> I came accross this while examining the places cred_guard_mutex is
>> used and trying to find a way to make those code paths less 

Re: [PATCH v4] certs: Add EFI_CERT_X509_GUID support for dbx entries

2021-01-20 Thread Eric Snowberg


> On Jan 20, 2021, at 4:26 AM, Jarkko Sakkinen  wrote:
> 
> On Fri, Jan 15, 2021 at 09:49:02AM -0700, Eric Snowberg wrote:
>> 
>>> On Jan 15, 2021, at 2:15 AM, Jarkko Sakkinen  wrote:
>>> 
>>> On Wed, Jan 13, 2021 at 05:11:10PM -0700, Eric Snowberg wrote:
>>>> 
>>>>> On Jan 13, 2021, at 1:41 PM, Jarkko Sakkinen 
>>>>>  wrote:
>>>>> 
>>>>> On Tue, Jan 12, 2021 at 02:57:39PM +, David Howells wrote:
>>>>>> Eric Snowberg  wrote:
>>>>>> 
>>>>>>>> On Dec 10, 2020, at 2:49 AM, David Howells  wrote:
>>>>>>>> 
>>>>>>>> Eric Snowberg  wrote:
>>>>>>>> 
>>>>>>>>> Add support for EFI_CERT_X509_GUID dbx entries. When a 
>>>>>>>>> EFI_CERT_X509_GUID
>>>>>>>>> is found, it is added as an asymmetrical key to the .blacklist 
>>>>>>>>> keyring.
>>>>>>>>> Anytime the .platform keyring is used, the keys in the .blacklist 
>>>>>>>>> keyring
>>>>>>>>> are referenced, if a matching key is found, the key will be rejected.
>>>>>>>> 
>>>>>>>> Ummm...  Why this way and not as a blacklist key which takes up less 
>>>>>>>> space?
>>>>>>>> I'm guessing that you're using the key chain matching logic.  We 
>>>>>>>> really only
>>>>>>>> need to blacklist the key IDs.
>>>>>>> 
>>>>>>> I implemented it this way so that certs in the dbx would only impact 
>>>>>>> the .platform keyring. I was under the impression we didn’t want to 
>>>>>>> have 
>>>>>>> Secure Boot UEFI db/dbx certs dictate keyring functionality within the 
>>>>>>> kernel
>>>>>>> itself. Meaning if we have a matching dbx cert in any other keyring 
>>>>>>> (builtin,
>>>>>>> secondary, ima, etc.), it would be allowed. If that is not how you’d 
>>>>>>> like to 
>>>>>>> see it done, let me know and I’ll make the change.
>>>>>> 
>>>>>> I wonder if that is that the right thing to do.  I guess this is a policy
>>>>>> decision and may depend on the particular user.
>>>>> 
>>>>> Why would you want to allow dbx entry in any keyring?
>>>> 
>>>> Today, DB and MOK certs go into the platform keyring.  These certs are only
>>>> referenced during kexec.  They can’t be used for other things like 
>>>> validating
>>>> kernel module signatures.  If we follow the same pattern, the DBX and MOKX 
>>>> entries
>>>> in the blacklist keyring should only impact kexec. 
>>>> 
>>>> Currently, Mickaël Salaün has another outstanding series to allow root to 
>>>> update 
>>>> the blacklist keyring.  I assume the use case for this is around 
>>>> certificates used 
>>>> within the kernel, for example revoking kernel module signatures.  The 
>>>> question I have
>>>> is, should another keyring be introduced?  One that carries DBX and MOKX, 
>>>> which just
>>>> correspond to certs/hashes in the platform keyring; this keyring would 
>>>> only be
>>>> referenced for kexec, just like the platform keyring is today. Then, the 
>>>> current
>>>> blacklist keyring would be used for everything internal to the kernel.
>>> 
>>> Right, I'm following actively that series.
>>> 
>>> Why couldn't user space drive this process and use that feature to do it?
>> 
>> I could see where the user would want to use both. With Mickaël Salaün’s
>> series, the blacklist keyring is updated immediately.  However it does
>> not survive a reboot.  With my patch, the blacklist keyring is updated
>> during boot, based on what is in the dbx. Neither approach needs a new 
>> kernel build.
> 
> I don't want to purposely challenge this, but why does it matter
> that it doesn't survive the boot? I'm referring here to the golden
> principle of kernel defining a mechanism, not policy. User space
> can do the population however it wants to for every boot.
> 
> E.g. systemd service could do this.
> 
> What am I missing here?

This change simply adds support for a missing type.  The kernel 
already supports cert and hash entries (EFI_CERT_X509_SHA256_GUID,
EFI_CERT_SHA256_GUID) that originate from the dbx and are loaded 
into the blacklist keyring during boot.  I’m not sure why a cert 
defined with EFI_CERT_X509_GUID should be handled in a different 
manner.

I suppose a user space tool could be created. But wouldn’t what is
currently done in the kernel in this area need to be removed?



Re: [RFC][PATCH] apparmor: Enforce progressively tighter permissions for no_new_privs

2021-01-20 Thread Eric W. Biederman


This should now Cc the correct email address for James Morris.

ebied...@xmission.com (Eric W. Biederman) writes:

> The current understanding of apparmor with respect to no_new_privs is at
> odds with how no_new_privs is implemented and understood by the rest of
> the kernel.
>
> The documentation of no_new_privs states:
>> With ``no_new_privs`` set, ``execve()`` promises not to grant the
>> privilege to do anything that could not have been done without the
>> execve call.
>
> And reading through the kernel except for apparmor that description
> matches what is implemented.
>
> There are two major divergences of apparmor from this definition:
> - proc_setattr enforces limitations when no_new_privs are set.
> - the limitation is enforced from the apparent time when no_new_privs is
>   set instead of guaranteeing that each execve has progressively more
>   narrow permissions.
>
> The code in apparmor that attempts to discover the apparmor label at the
> point where no_new_privs is set is not robust.  The capture happens a
> long time after no_new_privs is set.
>
> Capturing the label at the point where no_new_privs is set is
> practically impossible to implement robustly.  Today the rule is struct
> cred can only be changed by it's current task.  Today
> SECCOMP_FILTER_FLAG_TSYNC sets no_new_privs from another thread.  A
> robust implementation would require changing something fundamental in
> how creds are managed for SECCOMP_FILTER_FLAG_TSYNC to be able to
> capture the cred at the point it is set.
>
> Futhermore given the consistent documentation and how everything else
> implements no_new_privs, not having the permissions get progressively
> tighter is a footgun aimed at userspace.  I fully expect it to break any
> security sensitive software that uses no_new_privs and was not
> deliberately designed and tested against apparmor.
>
> Avoid the questionable and hard to fix implementation and the
> potential to confuse userspace by having no_new_privs enforce
> progressinvely tighter permissions.
>
> Fixes: 9fcf78cca198 ("apparmor: update domain transitions that are subsets of 
> confinement at nnp")
> Signed-off-by: Eric W. Biederman 
> ---
>
> I came accross this while examining the places cred_guard_mutex is
> used and trying to find a way to make those code paths less insane.
>
> If it would be more pallatable I would not mind removing the
> task_no_new_privs test entirely from aa_change_hat and aa_change_profile
> as those are not part of exec, so arguably no_new_privs should not care
> about them at all.
>
> Can we please get rid of the huge semantic wart and pain in the 
> implementation?
>
>  security/apparmor/domain.c   | 39 
>  security/apparmor/include/task.h |  4 
>  security/apparmor/task.c |  7 --
>  3 files changed, 4 insertions(+), 46 deletions(-)
>
> diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
> index f919ebd042fd..8f77059bf890 100644
> --- a/security/apparmor/domain.c
> +++ b/security/apparmor/domain.c
> @@ -869,17 +869,6 @@ int apparmor_bprm_creds_for_exec(struct linux_binprm 
> *bprm)
>  
>   label = aa_get_newest_label(cred_label(bprm->cred));
>  
> - /*
> -  * Detect no new privs being set, and store the label it
> -  * occurred under. Ideally this would happen when nnp
> -  * is set but there isn't a good way to do that yet.
> -  *
> -  * Testing for unconfined must be done before the subset test
> -  */
> - if ((bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS) && !unconfined(label) &&
> - !ctx->nnp)
> - ctx->nnp = aa_get_label(label);
> -
>   /* buffer freed below, name is pointer into buffer */
>   buffer = aa_get_buffer(false);
>   if (!buffer) {
> @@ -915,7 +904,7 @@ int apparmor_bprm_creds_for_exec(struct linux_binprm 
> *bprm)
>*/
>   if ((bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS) &&
>   !unconfined(label) &&
> - !aa_label_is_unconfined_subset(new, ctx->nnp)) {
> + !aa_label_is_unconfined_subset(new, label)) {
>   error = -EPERM;
>   info = "no new privs";
>   goto audit;
> @@ -1158,16 +1147,6 @@ int aa_change_hat(const char *hats[], int count, u64 
> token, int flags)
>   label = aa_get_newest_cred_label(cred);
>   previous = aa_get_newest_label(ctx->previous);
>  
> - /*
> -  * Detect no new privs being set, and store the label it
> -  * occurred under. Ideally this would happen when nnp
> -  * is set but there isn't a good way to do th

Re: [PATCH] Increase limit of max_user_watches from 1/25 to 1/16

2021-01-20 Thread Eric Curtin
On Wed, 20 Jan 2021 at 13:02, Eric Curtin  wrote:
>
> The current default value for  max_user_watches  is the 1/16 (6.25%) of
> the available low memory, divided for the "watch" cost in bytes.
>
> Tools like inotify-tools and visual studio code, seem to hit these
> limits a little to easy.
>
> Also amending the documentation, it referred to an old value for this.
>
> Signed-off-by: Eric Curtin 
> ---
>  Documentation/admin-guide/sysctl/fs.rst | 4 ++--
>  fs/eventpoll.c  | 4 ++--
>  2 files changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/Documentation/admin-guide/sysctl/fs.rst 
> b/Documentation/admin-guide/sysctl/fs.rst
> index f48277a0a850..f7fe45e69c41 100644
> --- a/Documentation/admin-guide/sysctl/fs.rst
> +++ b/Documentation/admin-guide/sysctl/fs.rst
> @@ -380,5 +380,5 @@ This configuration option sets the maximum number of 
> "watches" that are
>  allowed for each user.
>  Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
>  on a 64bit one.
> -The current default value for  max_user_watches  is the 1/32 of the available
> -low memory, divided for the "watch" cost in bytes.
> +The current default value for  max_user_watches  is the 1/16 (6.25%) of the
> +available low memory, divided for the "watch" cost in bytes.
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index a829af074eb5..de9ef8f6d0b2 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -2352,9 +2352,9 @@ static int __init eventpoll_init(void)
>
> si_meminfo(&si);
> /*
> -* Allows top 4% of lomem to be allocated for epoll watches (per 
> user).
> +* Allows top 6.25% of lomem to be allocated for epoll watches (per 
> user).
>  */
> -   max_user_watches = (((si.totalram - si.totalhigh) / 25) << 
> PAGE_SHIFT) /
> +   max_user_watches = (((si.totalram - si.totalhigh) / 16) << 
> PAGE_SHIFT) /
> EP_ITEM_COST;
> BUG_ON(max_user_watches < 0);
>
> --
> 2.25.1
>

Please ignore this, this is the wrong limit (an epoll one), I sent
another patch just
to update the documentation to be correct. Weiman Long already kindly solved the
issue in 92890123749bafc317bbfacbe0a62ce08d78efb7

Separate patch is titled "[PATCH] Update
Documentation/admin-guide/sysctl/fs.rst"


[PATCH] Increase limit of max_user_watches from 1/25 to 1/16

2021-01-20 Thread Eric Curtin
The current default value for  max_user_watches  is the 1/16 (6.25%) of
the available low memory, divided for the "watch" cost in bytes.

Tools like inotify-tools and visual studio code, seem to hit these
limits a little to easy.

Also amending the documentation, it referred to an old value for this.

Signed-off-by: Eric Curtin 
---
 Documentation/admin-guide/sysctl/fs.rst | 4 ++--
 fs/eventpoll.c  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/fs.rst 
b/Documentation/admin-guide/sysctl/fs.rst
index f48277a0a850..f7fe45e69c41 100644
--- a/Documentation/admin-guide/sysctl/fs.rst
+++ b/Documentation/admin-guide/sysctl/fs.rst
@@ -380,5 +380,5 @@ This configuration option sets the maximum number of 
"watches" that are
 allowed for each user.
 Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
 on a 64bit one.
-The current default value for  max_user_watches  is the 1/32 of the available
-low memory, divided for the "watch" cost in bytes.
+The current default value for  max_user_watches  is the 1/16 (6.25%) of the
+available low memory, divided for the "watch" cost in bytes.
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a829af074eb5..de9ef8f6d0b2 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2352,9 +2352,9 @@ static int __init eventpoll_init(void)
 
si_meminfo(&si);
/*
-* Allows top 4% of lomem to be allocated for epoll watches (per user).
+* Allows top 6.25% of lomem to be allocated for epoll watches (per 
user).
 */
-   max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
+   max_user_watches = (((si.totalram - si.totalhigh) / 16) << PAGE_SHIFT) /
EP_ITEM_COST;
BUG_ON(max_user_watches < 0);
 
-- 
2.25.1



[PATCH] Update Documentation/admin-guide/sysctl/fs.rst

2021-01-20 Thread Eric Curtin
max_user_watches for epoll should say 1/25, rather than 1/32

Signed-off-by: Eric Curtin 
---
 Documentation/admin-guide/sysctl/fs.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/fs.rst 
b/Documentation/admin-guide/sysctl/fs.rst
index f48277a0a850..2a501c9ddc55 100644
--- a/Documentation/admin-guide/sysctl/fs.rst
+++ b/Documentation/admin-guide/sysctl/fs.rst
@@ -380,5 +380,5 @@ This configuration option sets the maximum number of 
"watches" that are
 allowed for each user.
 Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
 on a 64bit one.
-The current default value for  max_user_watches  is the 1/32 of the available
-low memory, divided for the "watch" cost in bytes.
+The current default value for  max_user_watches  is the 1/25 (4%) of the
+available low memory, divided for the "watch" cost in bytes.
-- 
2.25.1



Re: [PATCH net] tcp: Fix potential use-after-free due to double kfree().

2021-01-20 Thread Eric Dumazet
On Wed, Jan 20, 2021 at 2:17 AM Jakub Kicinski  wrote:
>
> On Mon, 18 Jan 2021 14:59:20 +0900 Kuniyuki Iwashima wrote:
> > Receiving ACK with a valid SYN cookie, cookie_v4_check() allocates struct
> > request_sock and then can allocate inet_rsk(req)->ireq_opt. After that,
> > tcp_v4_syn_recv_sock() allocates struct sock and copies ireq_opt to
> > inet_sk(sk)->inet_opt. Normally, tcp_v4_syn_recv_sock() inserts the full
> > socket into ehash and sets NULL to ireq_opt. Otherwise,
> > tcp_v4_syn_recv_sock() has to reset inet_opt by NULL and free the full
> > socket.
> >
> > The commit 01770a1661657 ("tcp: fix race condition when creating child
> > sockets from syncookies") added a new path, in which more than one cores
> > create full sockets for the same SYN cookie. Currently, the core which
> > loses the race frees the full socket without resetting inet_opt, resulting
> > in that both sock_put() and reqsk_put() call kfree() for the same memory:
> >
> >   sock_put
> > sk_free
> >   __sk_free
> > sk_destruct
> >   __sk_destruct
> > sk->sk_destruct/inet_sock_destruct
> >   kfree(rcu_dereference_protected(inet->inet_opt, 1));
> >
> >   reqsk_put
> > reqsk_free
> >   __reqsk_free
> > req->rsk_ops->destructor/tcp_v4_reqsk_destructor
> >   kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
> >
> > Calling kmalloc() between the double kfree() can lead to use-after-free, so
> > this patch fixes it by setting NULL to inet_opt before sock_put().
> >
> > As a side note, this kind of issue does not happen for IPv6. This is
> > because tcp_v6_syn_recv_sock() clones both ipv6_opt and pktopts which
> > correspond to ireq_opt in IPv4.
> >
> > Fixes: 01770a166165 ("tcp: fix race condition when creating child sockets 
> > from syncookies")
> > CC: Ricardo Dias 
> > Signed-off-by: Kuniyuki Iwashima 
> > Reviewed-by: Benjamin Herrenschmidt 
>
> Ricardo, Eric, any reason this was written this way?

Well, I guess that was a plain bug.

IPv4 options are not used often I think.

Reviewed-by: Eric Dumazet 


Re: [RFC v3 2/2] vfio/platform: msi: add Broadcom platform devices

2021-01-20 Thread Auger Eric
Hi Alex,

On 1/19/21 11:45 PM, Alex Williamson wrote:
> On Fri, 15 Jan 2021 10:24:33 +0100
> Auger Eric  wrote:
> 
>> Hi Vikas,
>> On 1/15/21 7:35 AM, Vikas Gupta wrote:
>>> Hi Eric,
>>>
>>> On Tue, Jan 12, 2021 at 2:52 PM Auger Eric  wrote:  
>>>>
>>>> Hi Vikas,
>>>>
>>>> On 12/14/20 6:45 PM, Vikas Gupta wrote:  
>>>>> Add msi support for Broadcom platform devices
>>>>>
>>>>> Signed-off-by: Vikas Gupta 
>>>>> ---
>>>>>  drivers/vfio/platform/Kconfig |  1 +
>>>>>  drivers/vfio/platform/Makefile|  1 +
>>>>>  drivers/vfio/platform/msi/Kconfig |  9 
>>>>>  drivers/vfio/platform/msi/Makefile|  2 +
>>>>>  .../vfio/platform/msi/vfio_platform_bcmplt.c  | 49 +++
>>>>>  5 files changed, 62 insertions(+)
>>>>>  create mode 100644 drivers/vfio/platform/msi/Kconfig
>>>>>  create mode 100644 drivers/vfio/platform/msi/Makefile
>>>>>  create mode 100644 drivers/vfio/platform/msi/vfio_platform_bcmplt.c  
>>>> what does plt mean?  
>>> This(plt) is a generic name for Broadcom platform devices, which we`ll
>>>  plan to add in this file. Currently we have only one in this file.
>>> Do you think this name does not sound good here?  
>>
>> we have VFIO_PLATFORM_BCMFLEXRM_RESET config which also applied to vfio
>> flex-rm platform device.
>>
>> I think it would be more homegenous to have VFIO_PLATFORM_BCMFLEXRM_MSI
>> in case we keep a separate msi module.
>>
>> also in reset dir we have vfio_platform_bcmflexrm.c
>>
>>
>>>>>
>>>>> diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig
>>>>> index dc1a3c44f2c6..7b8696febe61 100644
>>>>> --- a/drivers/vfio/platform/Kconfig
>>>>> +++ b/drivers/vfio/platform/Kconfig
>>>>> @@ -21,3 +21,4 @@ config VFIO_AMBA
>>>>> If you don't know what to do here, say N.
>>>>>
>>>>>  source "drivers/vfio/platform/reset/Kconfig"
>>>>> +source "drivers/vfio/platform/msi/Kconfig"
>>>>> diff --git a/drivers/vfio/platform/Makefile 
>>>>> b/drivers/vfio/platform/Makefile
>>>>> index 3f3a24e7c4ef..9ccdcdbf0e7e 100644
>>>>> --- a/drivers/vfio/platform/Makefile
>>>>> +++ b/drivers/vfio/platform/Makefile
>>>>> @@ -5,6 +5,7 @@ vfio-platform-y := vfio_platform.o
>>>>>  obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o
>>>>>  obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform-base.o
>>>>>  obj-$(CONFIG_VFIO_PLATFORM) += reset/
>>>>> +obj-$(CONFIG_VFIO_PLATFORM) += msi/
>>>>>
>>>>>  vfio-amba-y := vfio_amba.o
>>>>>
>>>>> diff --git a/drivers/vfio/platform/msi/Kconfig 
>>>>> b/drivers/vfio/platform/msi/Kconfig
>>>>> new file mode 100644
>>>>> index ..54d6b70e1e32
>>>>> --- /dev/null
>>>>> +++ b/drivers/vfio/platform/msi/Kconfig
>>>>> @@ -0,0 +1,9 @@
>>>>> +# SPDX-License-Identifier: GPL-2.0-only
>>>>> +config VFIO_PLATFORM_BCMPLT_MSI
>>>>> + tristate "MSI support for Broadcom platform devices"
>>>>> + depends on VFIO_PLATFORM && (ARCH_BCM_IPROC || COMPILE_TEST)
>>>>> + default ARCH_BCM_IPROC
>>>>> + help
>>>>> +   Enables the VFIO platform driver to handle msi for Broadcom 
>>>>> devices
>>>>> +
>>>>> +   If you don't know what to do here, say N.
>>>>> diff --git a/drivers/vfio/platform/msi/Makefile 
>>>>> b/drivers/vfio/platform/msi/Makefile
>>>>> new file mode 100644
>>>>> index ..27422d45cecb
>>>>> --- /dev/null
>>>>> +++ b/drivers/vfio/platform/msi/Makefile
>>>>> @@ -0,0 +1,2 @@
>>>>> +# SPDX-License-Identifier: GPL-2.0
>>>>> +obj-$(CONFIG_VFIO_PLATFORM_BCMPLT_MSI) += vfio_platform_bcmplt.o
>>>>> diff --git a/drivers/vfio/platform/msi/vfio_platform_bcmplt.c 
>>>>> b/drivers/vfio/platform/msi/vfio_platform_bcmplt.c
>>>>> new file mode 100644
>>>>> index ..a074b5e92d77
>>>>> --- /dev/null
>>>>> +++ b/d

Re: [dm-devel] [PATCH AUTOSEL 5.4 03/26] dm integrity: select CRYPTO_SKCIPHER

2021-01-19 Thread Eric Biggers
On Tue, Jan 19, 2021 at 08:26:40PM -0500, Sasha Levin wrote:
> From: Anthony Iliopoulos 
> 
> [ Upstream commit f7b347acb5f6c29d9229bb64893d8b6a2c7949fb ]
> 
> The integrity target relies on skcipher for encryption/decryption, but
> certain kernel configurations may not enable CRYPTO_SKCIPHER, leading to
> compilation errors due to unresolved symbols. Explicitly select
> CRYPTO_SKCIPHER for DM_INTEGRITY, since it is unconditionally dependent
> on it.
> 
> Signed-off-by: Anthony Iliopoulos 
> Signed-off-by: Mike Snitzer 
> Signed-off-by: Sasha Levin 
> ---
>  drivers/md/Kconfig | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
> index aa98953f4462e..7dd6e98257c72 100644
> --- a/drivers/md/Kconfig
> +++ b/drivers/md/Kconfig
> @@ -565,6 +565,7 @@ config DM_INTEGRITY
>   select BLK_DEV_INTEGRITY
>   select DM_BUFIO
>   select CRYPTO
> + select CRYPTO_SKCIPHER
>   select ASYNC_XOR
>   ---help---
> This device-mapper target emulates a block device that has

CRYPTO_SKCIPHER doesn't exist in 5.4 and earlier because it was renamed from
CRYPTO_BLKCIPHER in 5.5.  If this patch is really important enough to backport,
CRYPTO_SKCIPHER will need to be changed to CRYPTO_BLKCIPHER.

- Eric


Re: [RFC PATCH v3 1/8] Use refcount_t for ucounts reference counting

2021-01-19 Thread Eric W. Biederman
ebied...@xmission.com (Eric W. Biederman) writes:

> Alexey Gladkov  writes:
>
>> On Mon, Jan 18, 2021 at 12:34:29PM -0800, Linus Torvalds wrote:
>>> On Mon, Jan 18, 2021 at 11:46 AM Alexey Gladkov
>>>  wrote:
>>> >
>>> > Sorry about that. I thought that this code is not needed when switching
>>> > from int to refcount_t. I was wrong.
>>> 
>>> Well, you _may_ be right. I personally didn't check how the return
>>> value is used.
>>> 
>>> I only reacted to "it certainly _may_ be used, and there is absolutely
>>> no comment anywhere about why it wouldn't matter".
>>
>> I have not found examples where checked the overflow after calling
>> refcount_inc/refcount_add.
>>
>> For example in kernel/fork.c:2298 :
>>
>>current->signal->nr_threads++;   
>>atomic_inc(¤t->signal->live);  
>>refcount_inc(¤t->signal->sigcnt);  
>>
>> $ semind search signal_struct.sigcnt
>> def include/linux/sched/signal.h:83  refcount_t  
>> sigcnt;
>> m-- kernel/fork.c:723 put_signal_struct  if 
>> (refcount_dec_and_test(&sig->sigcnt))
>> m-- kernel/fork.c:1571 copy_signal   refcount_set(&sig->sigcnt, 1);
>> m-- kernel/fork.c:2298 copy_process  
>> refcount_inc(¤t->signal->sigcnt);
>>
>> It seems to me that the only way is to use __refcount_inc and then compare
>> the old value with REFCOUNT_MAX
>>
>> Since I have not seen examples of such checks, I thought that this is
>> acceptable. Sorry once again. I have not tried to hide these changes.
>
> The current ucount code does check for overflow and fails the increment
> in every case.
>
> So arguably it will be a regression and inferior error handling behavior
> if the code switches to the ``better'' refcount_t data structure.
>
> I originally didn't use refcount_t because silently saturating and not
> bothering to handle the error makes me uncomfortable.
>
> Not having to acquire the ucounts_lock every time seems nice.  Perhaps
> the path forward would be to start with stupid/correct code that always
> takes the ucounts_lock for every increment of ucounts->count, that is
> later replaced with something more optimal.
>
> Not impacting performance in the non-namespace cases and having good
> performance in the other cases is a fundamental requirement of merging
> code like this.

So starting with something easy to comprehend and simple, may make it
easier to figure out how to optimize the code.

Eric



Re: [RFC PATCH v3 1/8] Use refcount_t for ucounts reference counting

2021-01-19 Thread Eric W. Biederman
Alexey Gladkov  writes:

> On Mon, Jan 18, 2021 at 12:34:29PM -0800, Linus Torvalds wrote:
>> On Mon, Jan 18, 2021 at 11:46 AM Alexey Gladkov
>>  wrote:
>> >
>> > Sorry about that. I thought that this code is not needed when switching
>> > from int to refcount_t. I was wrong.
>> 
>> Well, you _may_ be right. I personally didn't check how the return
>> value is used.
>> 
>> I only reacted to "it certainly _may_ be used, and there is absolutely
>> no comment anywhere about why it wouldn't matter".
>
> I have not found examples where checked the overflow after calling
> refcount_inc/refcount_add.
>
> For example in kernel/fork.c:2298 :
>
>current->signal->nr_threads++;   
>atomic_inc(¤t->signal->live);  
>refcount_inc(¤t->signal->sigcnt);  
>
> $ semind search signal_struct.sigcnt
> def include/linux/sched/signal.h:83   refcount_t  sigcnt;
> m-- kernel/fork.c:723 put_signal_struct   if 
> (refcount_dec_and_test(&sig->sigcnt))
> m-- kernel/fork.c:1571 copy_signalrefcount_set(&sig->sigcnt, 1);
> m-- kernel/fork.c:2298 copy_process   
> refcount_inc(¤t->signal->sigcnt);
>
> It seems to me that the only way is to use __refcount_inc and then compare
> the old value with REFCOUNT_MAX
>
> Since I have not seen examples of such checks, I thought that this is
> acceptable. Sorry once again. I have not tried to hide these changes.

The current ucount code does check for overflow and fails the increment
in every case.

So arguably it will be a regression and inferior error handling behavior
if the code switches to the ``better'' refcount_t data structure.

I originally didn't use refcount_t because silently saturating and not
bothering to handle the error makes me uncomfortable.

Not having to acquire the ucounts_lock every time seems nice.  Perhaps
the path forward would be to start with stupid/correct code that always
takes the ucounts_lock for every increment of ucounts->count, that is
later replaced with something more optimal.

Not impacting performance in the non-namespace cases and having good
performance in the other cases is a fundamental requirement of merging
code like this.

Eric



Re: [PATCH 2/2] security.capability: fix conversions on getxattr

2021-01-19 Thread Eric W. Biederman
Miklos Szeredi  writes:

> If a capability is stored on disk in v2 format cap_inode_getsecurity() will
> currently return in v2 format unconditionally.
>
> This is wrong: v2 cap should be equivalent to a v3 cap with zero rootid,
> and so the same conversions performed on it.
>
> If the rootid cannot be mapped v3 is returned unconverted.  Fix this so
> that both v2 and v3 return -EOVERFLOW if the rootid (or the owner of the fs
> user namespace in case of v2) cannot be mapped in the current user
> namespace.

This looks like a good cleanup.

I do wonder how well this works with stacking.  In particular
ovl_xattr_set appears to call vfs_getxattr without overriding the creds.
What the purpose of that is I haven't quite figured out.  It looks like
it is just a probe to see if an xattr is present so maybe it is ok.

Acked-by: "Eric W. Biederman" 

>
> Signed-off-by: Miklos Szeredi 
> ---
>  security/commoncap.c | 67 
>  1 file changed, 43 insertions(+), 24 deletions(-)
>
> diff --git a/security/commoncap.c b/security/commoncap.c
> index baccd871..c9d99f8f4c82 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -371,10 +371,11 @@ int cap_inode_getsecurity(struct inode *inode, const 
> char *name, void **buffer,
>  {
>   int size, ret;
>   kuid_t kroot;
> + __le32 nsmagic, magic;
>   uid_t root, mappedroot;
>   char *tmpbuf = NULL;
>   struct vfs_cap_data *cap;
> - struct vfs_ns_cap_data *nscap;
> + struct vfs_ns_cap_data *nscap = NULL;
>   struct dentry *dentry;
>   struct user_namespace *fs_ns;
>  
> @@ -396,46 +397,61 @@ int cap_inode_getsecurity(struct inode *inode, const 
> char *name, void **buffer,
>   fs_ns = inode->i_sb->s_user_ns;
>   cap = (struct vfs_cap_data *) tmpbuf;
>   if (is_v2header((size_t) ret, cap)) {
> - /* If this is sizeof(vfs_cap_data) then we're ok with the
> -  * on-disk value, so return that.  */
> - if (alloc)
> - *buffer = tmpbuf;
> - else
> - kfree(tmpbuf);
> - return ret;
> - } else if (!is_v3header((size_t) ret, cap)) {
> - kfree(tmpbuf);
> - return -EINVAL;
> + root = 0;
> + } else if (is_v3header((size_t) ret, cap)) {
> + nscap = (struct vfs_ns_cap_data *) tmpbuf;
> + root = le32_to_cpu(nscap->rootid);
> + } else {
> + size = -EINVAL;
> + goto out_free;
>   }
>  
> - nscap = (struct vfs_ns_cap_data *) tmpbuf;
> - root = le32_to_cpu(nscap->rootid);
>   kroot = make_kuid(fs_ns, root);
>  
>   /* If the root kuid maps to a valid uid in current ns, then return
>* this as a nscap. */
>   mappedroot = from_kuid(current_user_ns(), kroot);
>   if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
> + size = sizeof(struct vfs_ns_cap_data);
>   if (alloc) {
> - *buffer = tmpbuf;
> + if (!nscap) {
> + /* v2 -> v3 conversion */
> + nscap = kzalloc(size, GFP_ATOMIC);
> + if (!nscap) {
> + size = -ENOMEM;
> + goto out_free;
> + }
> + nsmagic = VFS_CAP_REVISION_3;
> + magic = le32_to_cpu(cap->magic_etc);
> + if (magic & VFS_CAP_FLAGS_EFFECTIVE)
> + nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
> + memcpy(&nscap->data, &cap->data, sizeof(__le32) 
> * 2 * VFS_CAP_U32);
> + nscap->magic_etc = cpu_to_le32(nsmagic);
> + } else {
> + /* use allocated v3 buffer */
> + tmpbuf = NULL;
> + }
>   nscap->rootid = cpu_to_le32(mappedroot);
> - } else
> - kfree(tmpbuf);
> - return size;
> + *buffer = nscap;
> + }
> + goto out_free;
>   }
>  
>   if (!rootid_owns_currentns(kroot)) {
> - kfree(tmpbuf);
> - return -EOPNOTSUPP;
> + size = -EOVERFLOW;
> + goto out_free;
>   }
>  
>   /* This comes from a parent namespace.  Return as a v2 capability */
>   size = sizeof(struct vfs_cap_da

Re: [PATCH 0/2] capability conversion fixes

2021-01-19 Thread Eric W. Biederman
Miklos Szeredi  writes:

> It turns out overlayfs is actually okay wrt. mutliple conversions, because
> it uses the right context for lower operations.  I.e. before calling
> vfs_{set,get}xattr() on underlying fs, it overrides creds with that of the
> mounter, so the current user ns will now match that of
> overlay_sb->s_user_ns, meaning that the caps will be converted to just the
> right format for the next layer
>
> OTOH ecryptfs, which is the only other one affected by commit 7c03e2cda4a5
> ("vfs: move cap_convert_nscap() call into vfs_setxattr()") needs to be
> fixed up, since it doesn't do the cap override thing that overlayfs does.
>
> I don't have an ecryptfs setup, so untested, but it's a fairly trivial
> change.
>
> My other observation was that cap_inode_getsecurity() messes up conversion
> of caps in more than one case.  This is independent of the overlayfs user
> ns enablement but affects it as well.
>
> Maybe we can revisit the infrastructure improvements we discussed, but I
> think these fixes are more appropriate for the current cycle.

I mostly agree.  Fixing the bugs in a back-portable way is important.

However we need to sort out the infrastructure, and implementation.

As far as I can tell it is only the fact that overlayfs does not support
the new mount api aka fs_context that allows this fix to work and be
correct.

I believe the new mount api would allow specifying a different userns
thatn curent_user_ns for the overlay filesystem and that would break
this.

So while I agree with the making a minimal fix for now.  We need a good
fix because this code is much too subtle, and it can break very easily
with no one noticing.

Eric





> Thanks,
> Miklos
>
> Miklos Szeredi (2):
>   ecryptfs: fix uid translation for setxattr on security.capability
>   security.capability: fix conversions on getxattr
>
>  fs/ecryptfs/inode.c  | 10 +--
>  security/commoncap.c | 67 
>  2 files changed, 50 insertions(+), 27 deletions(-)


Re: [PATCH 1/2] ecryptfs: fix uid translation for setxattr on security.capability

2021-01-19 Thread Eric W. Biederman
Miklos Szeredi  writes:

> Prior to commit 7c03e2cda4a5 ("vfs: move cap_convert_nscap() call into
> vfs_setxattr()") the translation of nscap->rootid did not take stacked
> filesystems (overlayfs and ecryptfs) into account.
>
> That patch fixed the overlay case, but made the ecryptfs case worse.
>
> Restore old the behavior for ecryptfs that existed before the overlayfs
> fix.  This does not fix ecryptfs's handling of complex user namespace
> setups, but it does make sure existing setups don't regress.

Today vfs_setxattr handles handles a delegated_inode and breaking
leases.  Code that is enabled with CONFIG_FILE_LOCKING.  So unless
I am missing something this introduces a different regression into
ecryptfs.

>
> Reported-by: Eric W. Biederman 
> Cc: Tyler Hicks 
> Fixes: 7c03e2cda4a5 ("vfs: move cap_convert_nscap() call into vfs_setxattr()")
> Signed-off-by: Miklos Szeredi 
> ---
>  fs/ecryptfs/inode.c | 10 +++---
>  1 file changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
> index e23752d9a79f..58d0f7187997 100644
> --- a/fs/ecryptfs/inode.c
> +++ b/fs/ecryptfs/inode.c
> @@ -1016,15 +1016,19 @@ ecryptfs_setxattr(struct dentry *dentry, struct inode 
> *inode,
>  {
>   int rc;
>   struct dentry *lower_dentry;
> + struct inode *lower_inode;
>  
>   lower_dentry = ecryptfs_dentry_to_lower(dentry);
> - if (!(d_inode(lower_dentry)->i_opflags & IOP_XATTR)) {
> + lower_inode = d_inode(lower_dentry);
> + if (!(lower_inode->i_opflags & IOP_XATTR)) {
>   rc = -EOPNOTSUPP;
>   goto out;
>   }
> - rc = vfs_setxattr(lower_dentry, name, value, size, flags);
> + inode_lock(lower_inode);
> + rc = __vfs_setxattr_locked(lower_dentry, name, value, size, flags, 
> NULL);
> + inode_unlock(lower_inode);
>   if (!rc && inode)
> - fsstack_copy_attr_all(inode, d_inode(lower_dentry));
> + fsstack_copy_attr_all(inode, lower_inode);
>  out:
>   return rc;
>  }

Eric


[PATCH] rename lpfc_sli4_dump_page_a0 to lpfc_sli4_dump_sfp_pagea0

2021-01-19 Thread Eric Curtin
Comment did not match function signature.

Signed-off-by: Eric Curtin 
---
 drivers/scsi/lpfc/lpfc_mbox.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/lpfc/lpfc_mbox.c b/drivers/scsi/lpfc/lpfc_mbox.c
index 3414ffcb26fe..c03a7f12dd65 100644
--- a/drivers/scsi/lpfc/lpfc_mbox.c
+++ b/drivers/scsi/lpfc/lpfc_mbox.c
@@ -2409,7 +2409,7 @@ lpfc_mbx_cmpl_rdp_page_a0(struct lpfc_hba *phba, 
LPFC_MBOXQ_t *mbox)
 
 
 /*
- * lpfc_sli4_dump_sfp_pagea0 - Dump sli4 read SFP Diagnostic.
+ * lpfc_sli4_dump_page_a0 - Dump sli4 read SFP Diagnostic.
  * @phba: pointer to the hba structure containing.
  * @mbox: pointer to lpfc mbox command to initialize.
  *
-- 
2.25.1



Re: [PATCH RFC v1 00/15] iommu/virtio: Nested stage support with Arm

2021-01-19 Thread Auger Eric
Hi Vivek,

On 1/15/21 1:13 PM, Vivek Gautam wrote:
> This patch-series aims at enabling Nested stage translation in guests
> using virtio-iommu as the paravirtualized iommu. The backend is supported
> with Arm SMMU-v3 that provides nested stage-1 and stage-2 translation.
> 
> This series derives its purpose from various efforts happening to add
> support for Shared Virtual Addressing (SVA) in host and guest. On Arm,
> most of the support for SVA has already landed. The support for nested
> stage translation and fault reporting to guest has been proposed [1].
> The related changes required in VFIO [2] framework have also been put
> forward.
> 
> This series proposes changes in virtio-iommu to program PASID tables
> and related stage-1 page tables. A simple iommu-pasid-table library
> is added for this purpose that interacts with vendor drivers to
> allocate and populate PASID tables.
> In Arm SMMUv3 we propose to pull the Context Descriptor (CD) management
> code out of the arm-smmu-v3 driver and add that as a glue vendor layer
> to support allocating CD tables, and populating them with right values.
> These CD tables are essentially the PASID tables and contain stage-1
> page table configurations too.
> A request to setup these CD tables come from virtio-iommu driver using
> the iommu-pasid-table library when running on Arm. The virtio-iommu
> then pass these PASID tables to the host using the right virtio backend
> and support in VMM.
> 
> For testing we have added necessary support in kvmtool. The changes in
> kvmtool are based on virtio-iommu development branch by Jean-Philippe
> Brucker [3].
> 
> The tested kernel branch contains following in the order bottom to top
> on the git hash -
> a) v5.11-rc3
> b) arm-smmu-v3 [1] and vfio [2] changes from Eric to add nested page
>table support for Arm.
> c) Smmu test engine patches from Jean-Philippe's branch [4]
> d) This series
> e) Domain nesting info patches [5][6][7].
> f) Changes to add arm-smmu-v3 specific nesting info (to be sent to
>the list).
> 
> This kernel is tested on Neoverse reference software stack with
> Fixed virtual platform. Public version of the software stack and
> FVP is available here[8][9].
> 
> A big thanks to Jean-Philippe for his contributions towards this work
> and for his valuable guidance.
> 
> [1] 
> https://lore.kernel.org/linux-iommu/20201118112151.25412-1-eric.au...@redhat.com/T/
> [2] 
> https://lore.kernel.org/kvmarm/20201116110030.32335-12-eric.au...@redhat.com/T/
> [3] https://jpbrucker.net/git/kvmtool/log/?h=virtio-iommu/devel
> [4] https://jpbrucker.net/git/linux/log/?h=sva/smmute
> [5] 
> https://lore.kernel.org/kvm/1599734733-6431-2-git-send-email-yi.l@intel.com/
> [6] 
> https://lore.kernel.org/kvm/1599734733-6431-3-git-send-email-yi.l@intel.com/
> [7] 
> https://lore.kernel.org/kvm/1599734733-6431-4-git-send-email-yi.l@intel.com/
> [8] 
> https://developer.arm.com/tools-and-software/open-source-software/arm-platforms-software/arm-ecosystem-fvps
> [9] 
> https://git.linaro.org/landing-teams/working/arm/arm-reference-platforms.git/about/docs/rdn1edge/user-guide.rst

Could you share a public branch where we could find all the kernel pieces.

Thank you in advance

Best Regards

Eric
> 
> Jean-Philippe Brucker (6):
>   iommu/virtio: Add headers for table format probing
>   iommu/virtio: Add table format probing
>   iommu/virtio: Add headers for binding pasid table in iommu
>   iommu/virtio: Add support for INVALIDATE request
>   iommu/virtio: Attach Arm PASID tables when available
>   iommu/virtio: Add support for Arm LPAE page table format
> 
> Vivek Gautam (9):
>   iommu/arm-smmu-v3: Create a Context Descriptor library
>   iommu: Add a simple PASID table library
>   iommu/arm-smmu-v3: Update drivers to work with iommu-pasid-table
>   iommu/arm-smmu-v3: Update CD base address info for user-space
>   iommu/arm-smmu-v3: Set sync op from consumer driver of cd-lib
>   iommu: Add asid_bits to arm smmu-v3 stage1 table info
>   iommu/virtio: Update table format probing header
>   iommu/virtio: Prepare to add attach pasid table infrastructure
>   iommu/virtio: Update fault type and reason info for viommu fault
> 
>  drivers/iommu/arm/arm-smmu-v3/Makefile|   2 +-
>  .../arm/arm-smmu-v3/arm-smmu-v3-cd-lib.c  | 283 +++
>  .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   |  16 +-
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   | 268 +--
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |   4 +-
>  drivers/iommu/iommu-pasid-table.h | 140 
>  drivers/iommu/virtio-iommu.c  | 692 +-
>  include/uapi/linux/iommu.h|   2 +-
>  include/uapi/linux/virtio_iommu.h | 158 +++-
>  9 files changed, 1303 insertions(+), 262 deletions(-)
>  create mode 100644 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-cd-lib.c
>  create mode 100644 drivers/iommu/iommu-pasid-table.h
> 



Re: [RFC V1 3/7] crypto: ghash - Optimized GHASH computations

2021-01-15 Thread Eric Biggers
On Fri, Jan 15, 2021 at 04:20:44PM -0800, Dave Hansen wrote:
> On 1/15/21 4:14 PM, Dey, Megha wrote:
> > Also, I do not know of any cores that implement PCLMULQDQ and not AES-NI.
> 
> That's true, bit it's also possible that a hypervisor could enumerate
> support for PCLMULQDQ and not AES-NI.  In general, we've tried to
> implement x86 CPU features independently, even if they never show up in
> a real CPU independently.

We only add optimized implementations of crypto algorithms if they are actually
useful, though.  If they would never be used in practice, that's not useful.

- Eric


Re: [RFC V1 3/7] crypto: ghash - Optimized GHASH computations

2021-01-15 Thread Eric Biggers
On Fri, Jan 15, 2021 at 04:14:40PM -0800, Dey, Megha wrote:
> > Hello Megha,
> > 
> > What is the purpose of this separate GHASH module? GHASH is only used
> > in combination with AES-CTR to produce GCM, and this series already
> > contains a GCM driver.
> > 
> > Do cores exist that implement PCLMULQDQ but not AES-NI?
> > 
> > If not, I think we should be able to drop this patch (and remove the
> > existing PCLMULQDQ GHASH driver as well)
> 
> AFAIK, dm-verity (authenticated but not encrypted file system) is one use
> case for authentication only.
> 
> Although I am not sure if GHASH is specifically used for this or SHA?
> 
> Also, I do not know of any cores that implement PCLMULQDQ and not AES-NI.
> 

dm-verity only uses unkeyed hash algorithms.  So no, it doesn't use GHASH.

- Eric


Re: [PATCH v4] certs: Add EFI_CERT_X509_GUID support for dbx entries

2021-01-15 Thread Eric Snowberg


> On Jan 15, 2021, at 10:21 AM, James Bottomley 
>  wrote:
> 
> On Tue, 2020-09-15 at 20:49 -0400, Eric Snowberg wrote:
>> The Secure Boot Forbidden Signature Database, dbx, contains a list of
>> now revoked signatures and keys previously approved to boot with UEFI
>> Secure Boot enabled.  The dbx is capable of containing any number of
>> EFI_CERT_X509_SHA256_GUID, EFI_CERT_SHA256_GUID, and
>> EFI_CERT_X509_GUID entries.
>> 
>> Currently when EFI_CERT_X509_GUID are contained in the dbx, the
>> entries are skipped.
>> 
>> Add support for EFI_CERT_X509_GUID dbx entries. When a
>> EFI_CERT_X509_GUID is found, it is added as an asymmetrical key to
>> the .blacklist keyring. Anytime the .platform keyring is used, the
>> keys in the .blacklist keyring are referenced, if a matching key is
>> found, the key will be rejected.
>> 
>> Signed-off-by: Eric Snowberg 
> 
> If you're using shim, as most of our users are, you have no access to
> dbx to blacklist certificates.  Plus our security envelope includes the
> Mok variables, so you should also be paying attestion to MokListX (or
> it's RT equivalent: MokListXRT).
> 
> If you add this to the patch, we get something that is mechanistically
> complete and which also allows users to add certs to their Mok
> blacklist.

That make sense. I’ll work on a patch to add this ability.



Re: [PATCH v4] certs: Add EFI_CERT_X509_GUID support for dbx entries

2021-01-15 Thread Eric Snowberg


> On Jan 15, 2021, at 2:15 AM, Jarkko Sakkinen  wrote:
> 
> On Wed, Jan 13, 2021 at 05:11:10PM -0700, Eric Snowberg wrote:
>> 
>>> On Jan 13, 2021, at 1:41 PM, Jarkko Sakkinen 
>>>  wrote:
>>> 
>>> On Tue, Jan 12, 2021 at 02:57:39PM +, David Howells wrote:
>>>> Eric Snowberg  wrote:
>>>> 
>>>>>> On Dec 10, 2020, at 2:49 AM, David Howells  wrote:
>>>>>> 
>>>>>> Eric Snowberg  wrote:
>>>>>> 
>>>>>>> Add support for EFI_CERT_X509_GUID dbx entries. When a 
>>>>>>> EFI_CERT_X509_GUID
>>>>>>> is found, it is added as an asymmetrical key to the .blacklist keyring.
>>>>>>> Anytime the .platform keyring is used, the keys in the .blacklist 
>>>>>>> keyring
>>>>>>> are referenced, if a matching key is found, the key will be rejected.
>>>>>> 
>>>>>> Ummm...  Why this way and not as a blacklist key which takes up less 
>>>>>> space?
>>>>>> I'm guessing that you're using the key chain matching logic.  We really 
>>>>>> only
>>>>>> need to blacklist the key IDs.
>>>>> 
>>>>> I implemented it this way so that certs in the dbx would only impact 
>>>>> the .platform keyring. I was under the impression we didn’t want to have 
>>>>> Secure Boot UEFI db/dbx certs dictate keyring functionality within the 
>>>>> kernel
>>>>> itself. Meaning if we have a matching dbx cert in any other keyring 
>>>>> (builtin,
>>>>> secondary, ima, etc.), it would be allowed. If that is not how you’d like 
>>>>> to 
>>>>> see it done, let me know and I’ll make the change.
>>>> 
>>>> I wonder if that is that the right thing to do.  I guess this is a policy
>>>> decision and may depend on the particular user.
>>> 
>>> Why would you want to allow dbx entry in any keyring?
>> 
>> Today, DB and MOK certs go into the platform keyring.  These certs are only
>> referenced during kexec.  They can’t be used for other things like validating
>> kernel module signatures.  If we follow the same pattern, the DBX and MOKX 
>> entries
>> in the blacklist keyring should only impact kexec. 
>> 
>> Currently, Mickaël Salaün has another outstanding series to allow root to 
>> update 
>> the blacklist keyring.  I assume the use case for this is around 
>> certificates used 
>> within the kernel, for example revoking kernel module signatures.  The 
>> question I have
>> is, should another keyring be introduced?  One that carries DBX and MOKX, 
>> which just
>> correspond to certs/hashes in the platform keyring; this keyring would only 
>> be
>> referenced for kexec, just like the platform keyring is today. Then, the 
>> current
>> blacklist keyring would be used for everything internal to the kernel.
> 
> Right, I'm following actively that series.
> 
> Why couldn't user space drive this process and use that feature to do it?

I could see where the user would want to use both. With Mickaël Salaün’s
series, the blacklist keyring is updated immediately.  However it does
not survive a reboot.  With my patch, the blacklist keyring is updated
during boot, based on what is in the dbx. Neither approach needs a new 
kernel build.



Re: [PATCH net] skbuff: back tiny skbs with kmalloc() in __netdev_alloc_skb() too

2021-01-15 Thread Eric Dumazet
On Fri, Jan 15, 2021 at 12:55 AM Alexander Lobakin  wrote:
>
> Commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for
> tiny skbs") ensured that skbs with data size lower than 1025 bytes
> will be kmalloc'ed to avoid excessive page cache fragmentation and
> memory consumption.
> However, the same issue can still be achieved manually via
> __netdev_alloc_skb(), where the check for size hasn't been changed.
> Mirror the condition from __napi_alloc_skb() to prevent from that.
>
> Fixes: 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny 
> skbs")

No, this tag is wrong, if you fix a bug, bug is much older than linux-5.11

My fix was about GRO head and virtio_net heads, both using pre-sized
small buffers.

You want to fix something else, and this is fine, because some drivers
are unfortunately
doing copy break ( at the cost of additional copy, even for packets
that might be consumed right away)


Re: cBPF socket filters failing - inexplicably?

2021-01-15 Thread Eric Dumazet
On Fri, Jan 15, 2021 at 7:52 AM Alexei Starovoitov
 wrote:
>
> Adding appropriate mailing list to cc...
>
> My wild guess is that as soon as socket got created:
> socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
> the packets were already queued to it.
> So later setsockopt() is too late to filter.
>
> Eric, thoughts?

Exactly, this is what happens.

I do not know how tcpdump and other programs deal with this.

Maybe by setting a small buffer size, or draining the queue.

>
> On Wed, Jan 6, 2021 at 6:55 AM Tom Cook  wrote:
> >
> > Another factoid to add to this:  I captured all traffic on an
> > interface while the test program was running using
> >
> > tcpdump -i wlo1 -w capture.pcap
> >
> > observing that multiple packets got through the filter.  I then built
> > the bpf_dbg program from the kernel source tree and ran the same
> > filter and capture file through it:
> >
> > $ tools/bpf_dbg
> > > load bpf 1,6 0 0 0
> > > load pcap capture.pcap
> > > run
> > bpf passes:0 fails:269288
> >
> > So bpf_dbg thinks the filter is correct; it's only when the filter is
> > attached to an actual socket that it fails occasionally.
> >
> > Regards,
> > Tom
> >
> > On Wed, Jan 6, 2021 at 10:07 AM Tom Cook  wrote:
> > >
> > > Just to note I have also reproduced this on a 5.10.0 kernel.
> > >
> > > On Tue, Jan 5, 2021 at 1:42 PM Tom Cook  wrote:
> > > >
> > > > In the course of tracking down a defect in some existing software,
> > > > I've found the failure demonstrated by the short program below.
> > > > Essentially, a cBPF program that just rejects every frame (ie always
> > > > returns zero) and is attached to a socket using setsockopt(SOL_SOCKET,
> > > > SO_ATTACH_FILTER, ...) still occasionally lets frames through to
> > > > userspace.
> > > >
> > > > The code is based on the first example in
> > > > Documentation/networking/filter.txt, except that I've changed the
> > > > content of the filter program and added a timeout on the socket.
> > > >
> > > > To reproduce the problem:
> > > >
> > > > # gcc test.c -o test
> > > > # sudo ./test
> > > > ... and in another console start a large network operation.
> > > >
> > > > In my case, I copied a ~300MB core file I had lying around to another
> > > > host on the LAN.  The test code should print the string "Failed to
> > > > read from socket" 100 times.  In practice, it produces about 10%
> > > > "Received packet with ethertype..." messages.
> > > >
> > > > I've observed the same result on Ubuntu amd64 glibc system running a
> > > > 5.9.0 kernel and also on Alpine arm64v8 muslc system running a 4.9.1
> > > > kernel.  I've written test code in both C and Python.  I'm fairly sure
> > > > this is not something I'm doing wrong - but very keen to have things
> > > > thrown at me if it is.
> > > >
> > > > Regards,
> > > > Tom Cook
> > > >
> > > >
> > > > #include 
> > > > #include 
> > > > #include 
> > > > #include 
> > > > #include 
> > > > #include 
> > > > #include 
> > > > #include 
> > > >
> > > > struct sock_filter code[] = {
> > > > { 0x06,0,0,0x00 }  /* BPF_RET | BPF_K   0   0   0 */
> > > > };
> > > >
> > > > struct sock_fprog bpf = {
> > > > .len = 1,
> > > > .filter = code,
> > > > };
> > > >
> > > > void test() {
> > > > uint8_t buf[2048];
> > > >
> > > > int sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
> > > > if (sock < 0) {
> > > > printf("Failed to open socket\n");
> > > > return;
> > > > }
> > > > int ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, 
> > > > sizeof(bpf));
> > > > if (ret < 0) {
> > > > printf("Failed to set socket filter\n");
> > > > return;
> > > > }
> > > > struct timeval tv = {
> > > > .tv_sec = 1
> > > > };
> > > >
> > > > ret = setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
> > > > if (ret < 0) {
> > > > printf("Failed to set socket timeout\n");
> > > > return;
> > > > }
> > > >
> > > > ssize_t count = recv(sock, buf, 2048, 0);
> > > > if (count <= 0) {
> > > > printf("Failed to read from socket\n");
> > > > return;
> > > > }
> > > >
> > > > close(sock);
> > > >
> > > > uint16_t *ethertype = (short*)(buf + 12);
> > > > uint8_t *proto = (unsigned char *)(buf + 23);
> > > > uint16_t *dport = (uint16_t *)(buf + 14 + 20);
> > > >
> > > > printf("Received packet with ethertype 0x%04hu, protocol 0x%02hhu
> > > > and dport 0x%04hu\n", *ethertype, *proto, *dport);
> > > > }
> > > >
> > > > int main() {
> > > > for (size_t ii = 0; ii < 100; ++ii) {
> > > > test();
> > > > }
> > > > }


Re: [RFC v3 1/2] vfio/platform: add support for msi

2021-01-15 Thread Auger Eric
Hi Vikas,

On 1/15/21 7:26 AM, Vikas Gupta wrote:
> Hi Eric,
> 
> On Tue, Jan 12, 2021 at 2:30 PM Auger Eric  wrote:
>>
>> Hi Vikas,
>>
>> On 1/5/21 6:53 AM, Vikas Gupta wrote:
>>> On Tue, Dec 22, 2020 at 10:57 PM Auger Eric  wrote:
>>>>
>>>> Hi Vikas,
>>>>
>>>> On 12/14/20 6:45 PM, Vikas Gupta wrote:
>>>>> MSI support for platform devices.The MSI block
>>>>> is added as an extended IRQ which exports caps
>>>>> VFIO_IRQ_INFO_CAP_TYPE and VFIO_IRQ_INFO_CAP_MSI_DESCS.
>>>>>
>>>>> Signed-off-by: Vikas Gupta 
>>>>> ---
>>>>>  drivers/vfio/platform/vfio_platform_common.c  | 179 +++-
>>>>>  drivers/vfio/platform/vfio_platform_irq.c | 260 +-
>>>>>  drivers/vfio/platform/vfio_platform_private.h |  32 +++
>>>>>  include/uapi/linux/vfio.h |  44 +++
>>>>>  4 files changed, 496 insertions(+), 19 deletions(-)
>>>>>
>>>>> diff --git a/drivers/vfio/platform/vfio_platform_common.c 
>>>>> b/drivers/vfio/platform/vfio_platform_common.c
>>>>> index fb4b385191f2..c936852f35d7 100644
>>>>> --- a/drivers/vfio/platform/vfio_platform_common.c
>>>>> +++ b/drivers/vfio/platform/vfio_platform_common.c
>>>>> @@ -16,6 +16,7 @@
>>>>>  #include 
>>>>>  #include 
>>>>>  #include 
>>>>> +#include 
>>>>>
>>>>>  #include "vfio_platform_private.h"
>>>>>
>>>>> @@ -26,6 +27,8 @@
>>>>>  #define VFIO_PLATFORM_IS_ACPI(vdev) ((vdev)->acpihid != NULL)
>>>>>
>>>>>  static LIST_HEAD(reset_list);
>>>>> +/* devices having MSI support */
>>>> nit: for devices using MSIs?
>>>>> +static LIST_HEAD(msi_list);
>>>>>  static DEFINE_MUTEX(driver_lock);
>>>>>
>>>>>  static vfio_platform_reset_fn_t vfio_platform_lookup_reset(const char 
>>>>> *compat,
>>>>> @@ -47,6 +50,25 @@ static vfio_platform_reset_fn_t 
>>>>> vfio_platform_lookup_reset(const char *compat,
>>>>>   return reset_fn;
>>>>>  }
>>>>>
>>>>> +static bool vfio_platform_lookup_msi(struct vfio_platform_device *vdev)
>>>>> +{
>>>>> + bool has_msi = false;
>>>>> + struct vfio_platform_msi_node *iter;
>>>>> +
>>>>> + mutex_lock(&driver_lock);
>>>>> + list_for_each_entry(iter, &msi_list, link) {
>>>>> + if (!strcmp(iter->compat, vdev->compat) &&
>>>>> + try_module_get(iter->owner)) {
>>>>> + vdev->msi_module = iter->owner;
>>>>> + vdev->of_get_msi = iter->of_get_msi;
>>>>> + has_msi = true;
>>>>> + break;
>>>>> + }
>>>>> + }
>>>>> + mutex_unlock(&driver_lock);
>>>>> + return has_msi;
>>>>> +}
>>>>> +
>>>>>  static int vfio_platform_acpi_probe(struct vfio_platform_device *vdev,
>>>>>   struct device *dev)
>>>>>  {
>>>>> @@ -126,6 +148,19 @@ static int vfio_platform_get_reset(struct 
>>>>> vfio_platform_device *vdev)
>>>>>   return vdev->of_reset ? 0 : -ENOENT;
>>>>>  }
>>>>>
>>>>> +static int vfio_platform_get_msi(struct vfio_platform_device *vdev)
>>>>> +{
>>>>> + bool has_msi;
>>>>> +
>>>>> + has_msi = vfio_platform_lookup_msi(vdev);
>>>>> + if (!has_msi) {
>>>>> + request_module("vfio-msi:%s", vdev->compat);
>>>>> + has_msi = vfio_platform_lookup_msi(vdev);
>>>>> + }
>>>>> +
>>>>> + return has_msi ? 0 : -ENOENT;
>>>>> +}
>>>>> +
>>>>>  static void vfio_platform_put_reset(struct vfio_platform_device *vdev)
>>>>>  {
>>>>>   if (VFIO_PLATFORM_IS_ACPI(vdev))
>>>>> @@ -135,6 +170,12 @@ static void vfio_platform_put_reset(struct 
>>>>> vfio_platform_device *vdev)
>>>>>   module_

Re: [RFC v3 2/2] vfio/platform: msi: add Broadcom platform devices

2021-01-15 Thread Auger Eric
Hi Vikas,
On 1/15/21 7:35 AM, Vikas Gupta wrote:
> Hi Eric,
> 
> On Tue, Jan 12, 2021 at 2:52 PM Auger Eric  wrote:
>>
>> Hi Vikas,
>>
>> On 12/14/20 6:45 PM, Vikas Gupta wrote:
>>> Add msi support for Broadcom platform devices
>>>
>>> Signed-off-by: Vikas Gupta 
>>> ---
>>>  drivers/vfio/platform/Kconfig |  1 +
>>>  drivers/vfio/platform/Makefile|  1 +
>>>  drivers/vfio/platform/msi/Kconfig |  9 
>>>  drivers/vfio/platform/msi/Makefile|  2 +
>>>  .../vfio/platform/msi/vfio_platform_bcmplt.c  | 49 +++
>>>  5 files changed, 62 insertions(+)
>>>  create mode 100644 drivers/vfio/platform/msi/Kconfig
>>>  create mode 100644 drivers/vfio/platform/msi/Makefile
>>>  create mode 100644 drivers/vfio/platform/msi/vfio_platform_bcmplt.c
>> what does plt mean?
> This(plt) is a generic name for Broadcom platform devices, which we`ll
>  plan to add in this file. Currently we have only one in this file.
> Do you think this name does not sound good here?

we have VFIO_PLATFORM_BCMFLEXRM_RESET config which also applied to vfio
flex-rm platform device.

I think it would be more homegenous to have VFIO_PLATFORM_BCMFLEXRM_MSI
in case we keep a separate msi module.

also in reset dir we have vfio_platform_bcmflexrm.c


>>>
>>> diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig
>>> index dc1a3c44f2c6..7b8696febe61 100644
>>> --- a/drivers/vfio/platform/Kconfig
>>> +++ b/drivers/vfio/platform/Kconfig
>>> @@ -21,3 +21,4 @@ config VFIO_AMBA
>>> If you don't know what to do here, say N.
>>>
>>>  source "drivers/vfio/platform/reset/Kconfig"
>>> +source "drivers/vfio/platform/msi/Kconfig"
>>> diff --git a/drivers/vfio/platform/Makefile b/drivers/vfio/platform/Makefile
>>> index 3f3a24e7c4ef..9ccdcdbf0e7e 100644
>>> --- a/drivers/vfio/platform/Makefile
>>> +++ b/drivers/vfio/platform/Makefile
>>> @@ -5,6 +5,7 @@ vfio-platform-y := vfio_platform.o
>>>  obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o
>>>  obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform-base.o
>>>  obj-$(CONFIG_VFIO_PLATFORM) += reset/
>>> +obj-$(CONFIG_VFIO_PLATFORM) += msi/
>>>
>>>  vfio-amba-y := vfio_amba.o
>>>
>>> diff --git a/drivers/vfio/platform/msi/Kconfig 
>>> b/drivers/vfio/platform/msi/Kconfig
>>> new file mode 100644
>>> index ..54d6b70e1e32
>>> --- /dev/null
>>> +++ b/drivers/vfio/platform/msi/Kconfig
>>> @@ -0,0 +1,9 @@
>>> +# SPDX-License-Identifier: GPL-2.0-only
>>> +config VFIO_PLATFORM_BCMPLT_MSI
>>> + tristate "MSI support for Broadcom platform devices"
>>> + depends on VFIO_PLATFORM && (ARCH_BCM_IPROC || COMPILE_TEST)
>>> + default ARCH_BCM_IPROC
>>> + help
>>> +   Enables the VFIO platform driver to handle msi for Broadcom devices
>>> +
>>> +   If you don't know what to do here, say N.
>>> diff --git a/drivers/vfio/platform/msi/Makefile 
>>> b/drivers/vfio/platform/msi/Makefile
>>> new file mode 100644
>>> index ..27422d45cecb
>>> --- /dev/null
>>> +++ b/drivers/vfio/platform/msi/Makefile
>>> @@ -0,0 +1,2 @@
>>> +# SPDX-License-Identifier: GPL-2.0
>>> +obj-$(CONFIG_VFIO_PLATFORM_BCMPLT_MSI) += vfio_platform_bcmplt.o
>>> diff --git a/drivers/vfio/platform/msi/vfio_platform_bcmplt.c 
>>> b/drivers/vfio/platform/msi/vfio_platform_bcmplt.c
>>> new file mode 100644
>>> index ..a074b5e92d77
>>> --- /dev/null
>>> +++ b/drivers/vfio/platform/msi/vfio_platform_bcmplt.c
>>> @@ -0,0 +1,49 @@
>>> +// SPDX-License-Identifier: GPL-2.0
>>> +/*
>>> + * Copyright 2020 Broadcom.
>>> + */
>>> +
>>> +#include 
>>> +#include 
>>> +#include 
>>> +#include 
>>> +#include 
>>> +
>>> +#include "../vfio_platform_private.h"
>>> +
>>> +#define RING_SIZE(64 << 10)
>>> +
>>> +#define RING_MSI_ADDR_LS 0x03c
>>> +#define RING_MSI_ADDR_MS 0x040
>>> +#define RING_MSI_DATA_VALUE  0x064
>> Those 3 defines would not be needed anymore with that implementation option.
>>> +
>>> +static u32 bcm_num_msi(struct vfio_platform_device *vdev)
>>> +{
>>> + struct vfio_platform_region *reg = &vdev->reg

Re: [PATCH v13 07/15] iommu/smmuv3: Allow stage 1 invalidation with unmanaged ASIDs

2021-01-14 Thread Auger Eric
Hi Jean,

On 1/14/21 6:33 PM, Jean-Philippe Brucker wrote:
> Hi Eric,
> 
> On Thu, Jan 14, 2021 at 05:58:27PM +0100, Auger Eric wrote:
>>>>  The uacce-devel branches from
>>>>> https://github.com/Linaro/linux-kernel-uadk do provide this at the moment
>>>>> (they track the latest sva/zip-devel branch
>>>>> https://jpbrucker.net/git/linux/ which is roughly based on mainline.)
>> As I plan to respin shortly, please could you confirm the best branch to
>> rebase on still is that one (uacce-devel from the linux-kernel-uadk git
>> repo). Is it up to date? Commits seem to be quite old there.
> 
> Right I meant the uacce-devel-X branches. The uacce-devel-5.11 branch
> currently has the latest patches

OK thanks!

Eric
> 
> Thanks,
> Jean
> 



Re: [PATCH v13 07/15] iommu/smmuv3: Allow stage 1 invalidation with unmanaged ASIDs

2021-01-14 Thread Auger Eric
Hi Shameer, Jean-Philippe,

On 12/4/20 11:23 AM, Auger Eric wrote:
> Hi Shameer, Jean-Philippe,
> 
> On 12/4/20 11:20 AM, Shameerali Kolothum Thodi wrote:
>> Hi Jean,
>>
>>> -Original Message-
>>> From: Jean-Philippe Brucker [mailto:jean-phili...@linaro.org]
>>> Sent: 04 December 2020 09:54
>>> To: Shameerali Kolothum Thodi 
>>> Cc: Auger Eric ; wangxingang
>>> ; Xieyingtai ;
>>> k...@vger.kernel.org; m...@kernel.org; j...@8bytes.org; w...@kernel.org;
>>> io...@lists.linux-foundation.org; linux-kernel@vger.kernel.org;
>>> vivek.gau...@arm.com; alex.william...@redhat.com;
>>> zhangfei@linaro.org; robin.mur...@arm.com;
>>> kvm...@lists.cs.columbia.edu; eric.auger@gmail.com; Zengtao (B)
>>> ; qubingbing 
>>> Subject: Re: [PATCH v13 07/15] iommu/smmuv3: Allow stage 1 invalidation with
>>> unmanaged ASIDs
>>>
>>> Hi Shameer,
>>>
>>> On Thu, Dec 03, 2020 at 06:42:57PM +, Shameerali Kolothum Thodi wrote:
>>>> Hi Jean/zhangfei,
>>>> Is it possible to have a branch with minimum required SVA/UACCE related
>>> patches
>>>> that are already public and can be a "stable" candidate for future respin 
>>>> of
>>> Eric's series?
>>>> Please share your thoughts.
>>>
>>> By "stable" you mean a fixed branch with the latest SVA/UACCE patches
>>> based on mainline? 
>>
>> Yes. 
>>
>>  The uacce-devel branches from
>>> https://github.com/Linaro/linux-kernel-uadk do provide this at the moment
>>> (they track the latest sva/zip-devel branch
>>> https://jpbrucker.net/git/linux/ which is roughly based on mainline.)
As I plan to respin shortly, please could you confirm the best branch to
rebase on still is that one (uacce-devel from the linux-kernel-uadk git
repo). Is it up to date? Commits seem to be quite old there.

Thanks

Eric
>>
>> Thanks. 
>>
>> Hi Eric,
>>
>> Could you please take a look at the above branches and see whether it make 
>> sense
>> to rebase on top of either of those?
>>
>> From vSVA point of view, it will be less rebase hassle if we can do that.
> 
> Sure. I will rebase on top of this ;-)
> 
> Thanks
> 
> Eric
>>
>> Thanks,
>> Shameer
>>
>>> Thanks,
>>> Jean
>>
> 
> ___
> iommu mailing list
> io...@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu
> 



[PATCH v2 5/9] KVM: arm: move has_run_once after the map_resources

2021-01-14 Thread Eric Auger
has_run_once is set to true at the beginning of
kvm_vcpu_first_run_init(). This generally is not an issue
except when exercising the code with KVM selftests. for instance,
if kvm_vgic_map_resources() fails due to erroneous user settings,
has_run_once is set and this prevents from continuing
executing the test. This patch moves the assignment after the
kvm_vgic_map_resources().

Signed-off-by: Eric Auger 

---

v1 -> v2:
- slight reword of the commit msg (for instance)
---
 arch/arm64/kvm/arm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 04c44853b103..580760e58980 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -573,8 +573,6 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
if (!kvm_arm_vcpu_is_finalized(vcpu))
return -EPERM;
 
-   vcpu->arch.has_run_once = true;
-
if (likely(irqchip_in_kernel(kvm))) {
/*
 * Map the VGIC hardware resources before running a vcpu the
@@ -591,6 +589,8 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
static_branch_inc(&userspace_irqchip_in_use);
}
 
+   vcpu->arch.has_run_once = true;
+
ret = kvm_timer_enable(vcpu);
if (ret)
return ret;
-- 
2.21.3



[PATCH v2 8/9] KVM: arm64: vgic-v3: Expose GICR_TYPER.Last for userspace

2021-01-14 Thread Eric Auger
Commit 23bde34771f1 ("KVM: arm64: vgic-v3: Drop the
reporting of GICR_TYPER.Last for userspace") temporarily fixed
a bug identified when attempting to access the GICR_TYPER
register before the redistributor region setting but dropped
the support of the LAST bit. Emulating the GICR_TYPER.Last bit
still makes sense for architecture compliance.

This patch restores its support (if the redistributor region
was set) while keeping the code safe.

Signed-off-by: Eric Auger 
---
 arch/arm64/kvm/vgic/vgic-mmio-v3.c | 7 ++-
 include/kvm/arm_vgic.h | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c 
b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index 987e366c8000..7ff23c153128 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -277,6 +277,8 @@ static unsigned long vgic_uaccess_read_v3r_typer(struct 
kvm_vcpu *vcpu,
 gpa_t addr, unsigned int len)
 {
unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+   struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+   struct vgic_redist_region *rdreg = vgic_cpu->rdreg;
int target_vcpu_id = vcpu->vcpu_id;
u64 value;
 
@@ -286,7 +288,9 @@ static unsigned long vgic_uaccess_read_v3r_typer(struct 
kvm_vcpu *vcpu,
if (vgic_has_its(vcpu->kvm))
value |= GICR_TYPER_PLPIS;
 
-   /* reporting of the Last bit is not supported for userspace */
+   if (rdreg && (vgic_cpu->rdreg_index == (rdreg->free_index - 1)))
+   value |= GICR_TYPER_LAST;
+
return extract_bytes(value, addr & 7, len);
 }
 
@@ -714,6 +718,7 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
return -EINVAL;
 
vgic_cpu->rdreg = rdreg;
+   vgic_cpu->rdreg_index = rdreg->free_index;
 
rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE;
 
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 3d74f1060bd1..ec621180ef09 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -322,6 +322,7 @@ struct vgic_cpu {
 */
struct vgic_io_device   rd_iodev;
struct vgic_redist_region *rdreg;
+   u32 rdreg_index;
 
/* Contains the attributes and gpa of the LPI pending tables. */
u64 pendbaser;
-- 
2.21.3



[PATCH v2 7/9] KVM: arm64: Simplify argument passing to vgic_uaccess_[read|write]

2021-01-14 Thread Eric Auger
vgic_uaccess() takes a struct vgic_io_device argument, converts it
to a struct kvm_io_device and passes it to the read/write accessor
functions, which convert it back to a struct vgic_io_device.
Avoid the indirection by passing the struct vgic_io_device argument
directly to vgic_uaccess_{read,write}.

Signed-off-by: Eric Auger 

---

v1 -> v2:
- reworded the commit message as suggested by Alexandru
---
 arch/arm64/kvm/vgic/vgic-mmio.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
index b2d73fc0d1ef..48c6067fc5ec 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio.c
@@ -938,10 +938,9 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct 
vgic_io_device *iodev,
return region;
 }
 
-static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct vgic_io_device 
*iodev,
 gpa_t addr, u32 *val)
 {
-   struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
const struct vgic_register_region *region;
struct kvm_vcpu *r_vcpu;
 
@@ -960,10 +959,9 @@ static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct 
kvm_io_device *dev,
return 0;
 }
 
-static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct vgic_io_device 
*iodev,
  gpa_t addr, const u32 *val)
 {
-   struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
const struct vgic_register_region *region;
struct kvm_vcpu *r_vcpu;
 
@@ -986,9 +984,9 @@ int vgic_uaccess(struct kvm_vcpu *vcpu, struct 
vgic_io_device *dev,
 bool is_write, int offset, u32 *val)
 {
if (is_write)
-   return vgic_uaccess_write(vcpu, &dev->dev, offset, val);
+   return vgic_uaccess_write(vcpu, dev, offset, val);
else
-   return vgic_uaccess_read(vcpu, &dev->dev, offset, val);
+   return vgic_uaccess_read(vcpu, dev, offset, val);
 }
 
 static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
-- 
2.21.3



[PATCH v2 9/9] KVM: selftests: aarch64/vgic-v3 init sequence tests

2021-01-14 Thread Eric Auger
The tests exercise the VGIC_V3 device creation including the
associated KVM_DEV_ARM_VGIC_GRP_ADDR group attributes:

- KVM_VGIC_V3_ADDR_TYPE_DIST/REDIST (vcpu_first and vgic_first)
- KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION (redist_regions).

Another test dedicates to KVM_DEV_ARM_VGIC_GRP_REDIST_REGS group
and especially the GICR_TYPER read. The goal was to test the case
recently fixed by commit 23bde34771f1
("KVM: arm64: vgic-v3: Drop the reporting of GICR_TYPER.Last for userspace").

The API under test can be found at
Documentation/virt/kvm/devices/arm-vgic-v3.rst

Signed-off-by: Eric Auger 
---
 tools/testing/selftests/kvm/Makefile  |   1 +
 .../testing/selftests/kvm/aarch64/vgic_init.c | 453 ++
 .../testing/selftests/kvm/include/kvm_util.h  |   5 +
 tools/testing/selftests/kvm/lib/kvm_util.c|  51 ++
 4 files changed, 510 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/aarch64/vgic_init.c

diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index fe41c6a0fa67..c3c2cb6fe8a1 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -68,6 +68,7 @@ TEST_GEN_PROGS_x86_64 += steal_time
 
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve
+TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
 TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c 
b/tools/testing/selftests/kvm/aarch64/vgic_init.c
new file mode 100644
index ..e8caa64c0395
--- /dev/null
+++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c
@@ -0,0 +1,453 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * vgic init sequence tests
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define NR_VCPUS   4
+
+#define REDIST_REGION_ATTR_ADDR(count, base, flags, index) (((uint64_t)(count) 
<< 52) | \
+   ((uint64_t)((base) >> 16) << 16) | ((uint64_t)(flags) << 12) | index)
+#define REG_OFFSET(vcpu, offset) (((uint64_t)vcpu << 32) | offset)
+
+#define GICR_TYPER 0x8
+
+static int access_redist_reg(int gicv3_fd, int vcpu, int offset,
+uint32_t *val, bool write)
+{
+   uint64_t attr = REG_OFFSET(vcpu, offset);
+
+   return kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS,
+attr, val, write);
+}
+
+static void guest_code(int cpu)
+{
+   GUEST_SYNC(0);
+   GUEST_SYNC(1);
+   GUEST_SYNC(2);
+   GUEST_DONE();
+}
+
+static int run_vcpu(struct kvm_vm *vm, uint32_t vcpuid)
+{
+   static int run;
+   struct ucall uc;
+   int ret;
+
+   vcpu_args_set(vm, vcpuid, 1, vcpuid);
+   ret = _vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL);
+   get_ucall(vm, vcpuid, &uc);
+   run++;
+
+   if (ret)
+   return -errno;
+   return 0;
+}
+
+int dist_rdist_tests(struct kvm_vm *vm)
+{
+   int ret, gicv3_fd, max_ipa_bits;
+   uint64_t addr;
+
+   max_ipa_bits = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
+
+   ret = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, true);
+   if (ret) {
+   print_skip("GICv3 not supported");
+   exit(KSFT_SKIP);
+   }
+
+   ret = kvm_create_device(vm, 0, true);
+   TEST_ASSERT(ret == -ENODEV, "unsupported device");
+
+   /* Create the device */
+
+   gicv3_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
+   TEST_ASSERT(gicv3_fd > 0, "GICv3 device created");
+
+   /* Check attributes */
+
+   ret = kvm_device_check_attr(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+   KVM_VGIC_V3_ADDR_TYPE_DIST);
+   TEST_ASSERT(!ret, "KVM_DEV_ARM_VGIC_GRP_ADDR/KVM_VGIC_V3_ADDR_TYPE_DIST 
supported");
+
+   ret = kvm_device_check_attr(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+   KVM_VGIC_V3_ADDR_TYPE_REDIST);
+   TEST_ASSERT(!ret, 
"KVM_DEV_ARM_VGIC_GRP_ADDR/KVM_VGIC_V3_ADDR_TYPE_REDIST supported");
+
+   ret = kvm_device_check_attr(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 0);
+   TEST_ASSERT(ret == -ENXIO, "attribute not supported");
+
+   /* misaligned DIST and REDIST addresses */
+
+   addr = 0x1000;
+   ret = kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+   KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true);
+   TEST_ASSERT(ret == -EINVAL, "GICv3 dist base not 64kB aligned");
+
+   ret = kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+   KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
+   TEST

[PATCH v2 6/9] docs: kvm: devices/arm-vgic-v3: enhance KVM_DEV_ARM_VGIC_CTRL_INIT doc

2021-01-14 Thread Eric Auger
kvm_arch_vcpu_precreate() returns -EBUSY if the vgic is
already initialized. So let's document that KVM_DEV_ARM_VGIC_CTRL_INIT
must be called after all vcpu creations.

Signed-off-by: Eric Auger 

---

v1 -> v2:
- Must be called after all vcpu creations ->
  Must be called after all VCPUs have been created as per
  Alexandru's suggestion
---
 Documentation/virt/kvm/devices/arm-vgic-v3.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/devices/arm-vgic-v3.rst 
b/Documentation/virt/kvm/devices/arm-vgic-v3.rst
index 5dd3bff51978..51e5e5762571 100644
--- a/Documentation/virt/kvm/devices/arm-vgic-v3.rst
+++ b/Documentation/virt/kvm/devices/arm-vgic-v3.rst
@@ -228,7 +228,7 @@ Groups:
 
 KVM_DEV_ARM_VGIC_CTRL_INIT
   request the initialization of the VGIC, no additional parameter in
-  kvm_device_attr.addr.
+  kvm_device_attr.addr. Must be called after all VCPUs have been created.
 KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES
   save all LPI pending bits into guest RAM pending tables.
 
-- 
2.21.3



[PATCH v2 2/9] KVM: arm64: Fix KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION read

2021-01-14 Thread Eric Auger
The doc says:
"The characteristics of a specific redistributor region can
 be read by presetting the index field in the attr data.
 Only valid for KVM_DEV_TYPE_ARM_VGIC_V3"

Unfortunately the existing code fails to read the input attr data.

Fixes: 04c110932225 ("KVM: arm/arm64: Implement 
KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION")
Cc: sta...@vger.kernel.org#v4.17+
Signed-off-by: Eric Auger 
Reviewed-by: Alexandru Elisei 

---

v1 -> v2:
- in the commit message, remove the statement that the index always is 0
- add Alexandru's R-b
---
 arch/arm64/kvm/vgic/vgic-kvm-device.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c 
b/arch/arm64/kvm/vgic/vgic-kvm-device.c
index 44419679f91a..2f66cf247282 100644
--- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
+++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
@@ -226,6 +226,9 @@ static int vgic_get_common_attr(struct kvm_device *dev,
u64 addr;
unsigned long type = (unsigned long)attr->attr;
 
+   if (copy_from_user(&addr, uaddr, sizeof(addr)))
+   return -EFAULT;
+
r = kvm_vgic_addr(dev->kvm, type, &addr, false);
if (r)
return (r == -ENODEV) ? -ENXIO : r;
-- 
2.21.3



[PATCH v2 4/9] KVM: arm/arm64: vgic: Reset base address on kvm_vgic_dist_destroy()

2021-01-14 Thread Eric Auger
On vgic_dist_destroy(), the addresses are not reset. However for
kvm selftest purpose this would allow to continue the test execution
even after a failure when running KVM_RUN. So let's reset the
base addresses.

Signed-off-by: Eric Auger 

---

v1 -> v2:
- use dist-> in the else and add braces
---
 arch/arm64/kvm/vgic/vgic-init.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 052917deb149..cf6faa0aeddb 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -335,13 +335,16 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
kfree(dist->spis);
dist->spis = NULL;
dist->nr_spis = 0;
+   dist->vgic_dist_base = VGIC_ADDR_UNDEF;
 
-   if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+   if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) {
list_del(&rdreg->list);
kfree(rdreg);
}
INIT_LIST_HEAD(&dist->rd_regions);
+   } else {
+   dist->vgic_cpu_base = VGIC_ADDR_UNDEF;
}
 
if (vgic_has_its(kvm))
@@ -362,6 +365,7 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
vgic_flush_pending_lpis(vcpu);
 
INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+   vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
 }
 
 /* To be called with kvm->lock held */
-- 
2.21.3



[PATCH v2 3/9] KVM: arm64: vgic-v3: Fix error handling in vgic_v3_set_redist_base()

2021-01-14 Thread Eric Auger
vgic_v3_insert_redist_region() may succeed while
vgic_register_all_redist_iodevs fails. For example this happens
while adding a redistributor region overlapping a dist region. The
failure only is detected on vgic_register_all_redist_iodevs when
vgic_v3_check_base() gets called in vgic_register_redist_iodev().

In such a case, remove the newly added redistributor region and free
it.

Signed-off-by: Eric Auger 

---

v1 -> v2:
- fix the commit message and split declaration/assignment of rdreg
---
 arch/arm64/kvm/vgic/vgic-mmio-v3.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c 
b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index 013b737b658f..987e366c8000 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -860,8 +860,14 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, 
u64 addr, u32 count)
 * afterwards will register the iodevs when needed.
 */
ret = vgic_register_all_redist_iodevs(kvm);
-   if (ret)
+   if (ret) {
+   struct vgic_redist_region *rdreg;
+
+   rdreg = vgic_v3_rdist_region_from_index(kvm, index);
+   list_del(&rdreg->list);
+   kfree(rdreg);
return ret;
+   }
 
return 0;
 }
-- 
2.21.3



[PATCH v2 1/9] KVM: arm64: vgic-v3: Fix some error codes when setting RDIST base

2021-01-14 Thread Eric Auger
KVM_DEV_ARM_VGIC_GRP_ADDR group doc says we should return
-EEXIST in case the base address of the redist is already set.
We currently return -EINVAL.

However we need to return -EINVAL in case a legacy REDIST address
is attempted to be set while REDIST_REGIONS were set. This case
is discriminated by looking at the count field.

Signed-off-by: Eric Auger 

---

v1 -> v2:
- simplify the check sequence
---
 arch/arm64/kvm/vgic/vgic-mmio-v3.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c 
b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index 15a6c98ee92f..013b737b658f 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -791,10 +791,6 @@ static int vgic_v3_insert_redist_region(struct kvm *kvm, 
uint32_t index,
size_t size = count * KVM_VGIC_V3_REDIST_SIZE;
int ret;
 
-   /* single rdist region already set ?*/
-   if (!count && !list_empty(rd_regions))
-   return -EINVAL;
-
/* cross the end of memory ? */
if (base + size < base)
return -EINVAL;
@@ -805,11 +801,14 @@ static int vgic_v3_insert_redist_region(struct kvm *kvm, 
uint32_t index,
} else {
rdreg = list_last_entry(rd_regions,
struct vgic_redist_region, list);
-   if (index != rdreg->index + 1)
-   return -EINVAL;
 
-   /* Cannot add an explicitly sized regions after legacy region */
-   if (!rdreg->count)
+   if ((!count) != (!rdreg->count))
+   return -EINVAL; /* Mix REDIST and REDIST_REGION */
+
+   if (!count)
+   return -EEXIST;
+
+   if (index != rdreg->index + 1)
return -EINVAL;
}
 
-- 
2.21.3



[PATCH v2 0/9] KVM/ARM: Some vgic fixes and init sequence KVM selftests

2021-01-14 Thread Eric Auger
While writting vgic v3 init sequence KVM selftests I noticed some
relatively minor issues. This was also the opportunity to try to
fix the issue laterly reported by Zenghui, related to the RDIST_TYPER
last bit emulation. The final patch is a first batch of VGIC init
sequence selftests. Of course they can be augmented with a lot more
register access tests, but let's try to move forward incrementally ...

Best Regards

Eric

This series can be found at:
https://github.com/eauger/linux/tree/vgic_kvmselftests_v2

History:
- Took into account all comments from Marc and Alexandru's except
  the has_run_once still after the map_resources (this would oblige
  me to revisit in depth the selftests)

Eric Auger (9):
  KVM: arm64: vgic-v3: Fix some error codes when setting RDIST base
  KVM: arm64: Fix KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION read
  KVM: arm64: vgic-v3: Fix error handling in vgic_v3_set_redist_base()
  KVM: arm/arm64: vgic: Reset base address on kvm_vgic_dist_destroy()
  KVM: arm: move has_run_once after the map_resources
  docs: kvm: devices/arm-vgic-v3: enhance KVM_DEV_ARM_VGIC_CTRL_INIT doc
  KVM: arm64: Simplify argument passing to vgic_uaccess_[read|write]
  KVM: arm64: vgic-v3: Expose GICR_TYPER.Last for userspace
  KVM: selftests: aarch64/vgic-v3 init sequence tests

 .../virt/kvm/devices/arm-vgic-v3.rst  |   2 +-
 arch/arm64/kvm/arm.c  |   4 +-
 arch/arm64/kvm/vgic/vgic-init.c   |   6 +-
 arch/arm64/kvm/vgic/vgic-kvm-device.c |   3 +
 arch/arm64/kvm/vgic/vgic-mmio-v3.c|  30 +-
 arch/arm64/kvm/vgic/vgic-mmio.c   |  10 +-
 include/kvm/arm_vgic.h|   1 +
 tools/testing/selftests/kvm/Makefile  |   1 +
 .../testing/selftests/kvm/aarch64/vgic_init.c | 453 ++
 .../testing/selftests/kvm/include/kvm_util.h  |   5 +
 tools/testing/selftests/kvm/lib/kvm_util.c|  51 ++
 11 files changed, 546 insertions(+), 20 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/aarch64/vgic_init.c

-- 
2.21.3



Re: [PATCH 8/9] KVM: arm64: vgic-v3: Expose GICR_TYPER.Last for userspace

2021-01-14 Thread Auger Eric
Hi Alexandru,

On 1/12/21 6:02 PM, Alexandru Elisei wrote:
> Hi Eric,
> 
> On 12/12/20 6:50 PM, Eric Auger wrote:
>> Commit 23bde34771f1 ("KVM: arm64: vgic-v3: Drop the
>> reporting of GICR_TYPER.Last for userspace") temporarily fixed
>> a bug identified when attempting to access the GICR_TYPER
>> register before the redistributor region setting but dropped
>> the support of the LAST bit. This patch restores its
>> support (if the redistributor region was set) while keeping the
>> code safe.
> 
> I suppose the reason for emulating GICR_TYPER.Last is for architecture 
> compliance,
> right? I think that should be in the commit message.
OK added this in the commit msg.
> 
>>
>> Signed-off-by: Eric Auger 
>> ---
>>  arch/arm64/kvm/vgic/vgic-mmio-v3.c | 7 ++-
>>  include/kvm/arm_vgic.h | 1 +
>>  2 files changed, 7 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c 
>> b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> index 581f0f49..2f9ef6058f6e 100644
>> --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> @@ -277,6 +277,8 @@ static unsigned long vgic_uaccess_read_v3r_typer(struct 
>> kvm_vcpu *vcpu,
>>   gpa_t addr, unsigned int len)
>>  {
>>  unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
>> +struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
>> +struct vgic_redist_region *rdreg = vgic_cpu->rdreg;
>>  int target_vcpu_id = vcpu->vcpu_id;
>>  u64 value;
>>  
>> @@ -286,7 +288,9 @@ static unsigned long vgic_uaccess_read_v3r_typer(struct 
>> kvm_vcpu *vcpu,
>>  if (vgic_has_its(vcpu->kvm))
>>  value |= GICR_TYPER_PLPIS;
>>  
>> -/* reporting of the Last bit is not supported for userspace */
>> +if (rdreg && (vgic_cpu->rdreg_index == (rdreg->free_index - 1)))
>> +value |= GICR_TYPER_LAST;
>> +
>>  return extract_bytes(value, addr & 7, len);
>>  }
>>  
>> @@ -714,6 +718,7 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
>>  return -EINVAL;
>>  
>>  vgic_cpu->rdreg = rdreg;
>> +vgic_cpu->rdreg_index = rdreg->free_index;
> 
> What happens if the next redistributor region we register has the base address
> adjacent to this one?
> 
> I'm really not familiar with the code, but is it not possible to create two
> Redistributor regions (via
> KVM_DEV_ARM_VGIC_GRP_ADDR(KVM_VGIC_V3_ADDR_TYPE_REDIST)) where the second
> Redistributor region start address is immediately after the last 
> Redistributor in
> the preceding region?
KVM_VGIC_V3_ADDR_TYPE_REDIST only allows to create a single rdist
region. Only KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION allows to register
several of them.

with KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, it is possible to register
adjacent rdist regions. vgic_v3_rdist_free_slot() previously returned
the 1st rdist region where enough space remains for inserting the new
reg. We put the rdist at the free index there.

But maybe I misunderstood your question?

Thanks

Eric
> 
> Thanks,
> Alex
>>  
>>  rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE;
>>  
>> diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
>> index a8d8fdcd3723..596c069263a7 100644
>> --- a/include/kvm/arm_vgic.h
>> +++ b/include/kvm/arm_vgic.h
>> @@ -322,6 +322,7 @@ struct vgic_cpu {
>>   */
>>  struct vgic_io_device   rd_iodev;
>>  struct vgic_redist_region *rdreg;
>> +u32 rdreg_index;
>>  
>>  /* Contains the attributes and gpa of the LPI pending tables. */
>>  u64 pendbaser;
> 



Re: [PATCH 5/9] KVM: arm: move has_run_once after the map_resources

2021-01-14 Thread Auger Eric
Hi Alexandru,

On 1/12/21 3:55 PM, Alexandru Elisei wrote:
> Hi Eric,
> 
> On 12/12/20 6:50 PM, Eric Auger wrote:
>> has_run_once is set to true at the beginning of
>> kvm_vcpu_first_run_init(). This generally is not an issue
>> except when exercising the code with KVM selftests. Indeed,
>> if kvm_vgic_map_resources() fails due to erroneous user settings,
>> has_run_once is set and this prevents from continuing
>> executing the test. This patch moves the assignment after the
>> kvm_vgic_map_resources().
>>
>> Signed-off-by: Eric Auger 
>> ---
>>  arch/arm64/kvm/arm.c | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
>> index c0ffb019ca8b..331fae6bff31 100644
>> --- a/arch/arm64/kvm/arm.c
>> +++ b/arch/arm64/kvm/arm.c
>> @@ -540,8 +540,6 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
>>  if (!kvm_arm_vcpu_is_finalized(vcpu))
>>  return -EPERM;
>>  
>> -vcpu->arch.has_run_once = true;
>> -
>>  if (likely(irqchip_in_kernel(kvm))) {
>>  /*
>>   * Map the VGIC hardware resources before running a vcpu the
>> @@ -560,6 +558,8 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
>>  static_branch_inc(&userspace_irqchip_in_use);
>>  }
>>  
>> +vcpu->arch.has_run_once = true;
> 
> I have a few concerns regarding this:
> 
> 1. Moving has_run_once = true here seems very arbitrary to me - 
> kvm_timer_enable()
> and kvm_arm_pmu_v3_enable(), below it, can both fail because of erroneous user
> values. If there's a reason why the assignment cannot be moved at the end of 
> the
> function, I think it should be clearly stated in a comment for the people who
> might be tempted to write similar tests for the timer or pmu.

Setting has_run_once = true at the entry of the function looks to me
even more arbitrary. I agree with you that eventually has_run_once may
be moved at the very end but maybe this can be done later once timer,
pmu tests haven ben written
> 
> 2. There are many ways that kvm_vgic_map_resources() can fail, other than
> incorrect user settings. I started digging into how
> kvm_vgic_map_resources()->vgic_v2_map_resources() can fail for a VGIC V2 and 
> this
> is what I managed to find before I gave up:
> 
> * vgic_init() can fail in:
>     - kvm_vgic_dist_init()
>     - vgic_v3_init()
>     - kvm_vgic_setup_default_irq_routing()
> * vgic_register_dist_iodev() can fail in:
>     - vgic_v3_init_dist_iodev()
>     - kvm_io_bus_register_dev()(*)
> * kvm_phys_addr_ioremap() can fail in:
>     - kvm_mmu_topup_memory_cache()
>     - kvm_pgtable_stage2_map()

I changed the commit msg so that "incorrect user settings" sounds as an
example.
> 
> So if any of the functions below fail, are we 100% sure it is safe to allow 
> the
> user to execute kvm_vgic_map_resources() again?

I think additional tests will confirm this. However at the moment,
moving the assignment, which does not look wrong to me, allows to
greatly simplify the tests so I would tend to say that it is worth.
> 
> (*) It looks to me like kvm_io_bus_register_dev() doesn't take into account a
> caller that tries to register the same device address range and it will create
> another identical range. Is this intentional? Is it a bug that should be 
> fixed? Or
> am I misunderstanding the function?

doesn't kvm_io_bus_cmp() do the check?

Thanks

Eric
> 
> Thanks,
> Alex
>> +
>>  ret = kvm_timer_enable(vcpu);
>>  if (ret)
>>  return ret;
> 



Re: [PATCH 1/9] KVM: arm64: vgic-v3: Fix some error codes when setting RDIST base

2021-01-14 Thread Auger Eric
Hi Alexandru,

On 1/6/21 5:32 PM, Alexandru Elisei wrote:
> Hi Eric,
> 
> On 12/12/20 6:50 PM, Eric Auger wrote:
>> KVM_DEV_ARM_VGIC_GRP_ADDR group doc says we should return
>> -EEXIST in case the base address of the redist is already set.
>> We currently return -EINVAL.
>>
>> However we need to return -EINVAL in case a legacy REDIST address
>> is attempted to be set while REDIST_REGIONS were set. This case
>> is discriminated by looking at the count field.
>>
>> Signed-off-by: Eric Auger 
>> ---
>>  arch/arm64/kvm/vgic/vgic-mmio-v3.c | 9 +++--
>>  1 file changed, 7 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c 
>> b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> index 15a6c98ee92f..8e8a862def76 100644
>> --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> @@ -792,8 +792,13 @@ static int vgic_v3_insert_redist_region(struct kvm 
>> *kvm, uint32_t index,
>>  int ret;
>>  
>>  /* single rdist region already set ?*/
>> -if (!count && !list_empty(rd_regions))
>> -return -EINVAL;
>> +if (!count && !list_empty(rd_regions)) {
>> +rdreg = list_last_entry(rd_regions,
>> +   struct vgic_redist_region, list);
>> +if (rdreg->count)
>> +return -EINVAL; /* Mixing REDIST and REDIST_REGION API 
>> */
>> +return -EEXIST;
>> +}
> 
> A few instructions below:
> 
>     if (list_empty(rd_regions)) {
>         [..]
>     } else {
>         rdreg = list_last_entry(rd_regions,
>                     struct vgic_redist_region, list);
>         [..]
> 
>         /* Cannot add an explicitly sized regions after legacy region */
>         if (!rdreg->count)
>             return -EINVAL;
>     }
> 
> Isn't this testing for the same thing, but using the opposite condition? Or 
> am I
> misunderstanding the code (quite likely)?
the 1st test sequence handles the case where the legacy
KVM_VGIC_V3_ADDR_TYPE_REDIST is used (!count) while the second handles
the case where the REDIST_REGION is used. Nevertheless I think this can
be simplified into:

if (list_empty(rd_regions)) {
if (index != 0)
return -EINVAL;
} else {
rdreg = list_last_entry(rd_regions,
struct vgic_redist_region, list);

if ((!count) != (!rdreg->count))
return -EINVAL; /* Mix REDIST and REDIST_REGION */

if (!count)
return -EEXIST;

if (index != rdreg->index + 1)
return -EINVAL;
}






> 
> Looks to me like 
> KVM_DEV_ARM_VGIC_GRP_ADDR(KVM_VGIC_V3_ADDR_TYPE_REDIST{,_REGION})
> used to return -EEXIST (from vgic_check_ioaddr()) before commit ccc27bf5be7b7
> ("KVM: arm/arm64: Helper to register a new redistributor region") which added 
> the
> vgic_v3_insert_redist_region() function, so bringing back the -EEXIST return 
> code
> looks the right thing to me.

OK thank you for the detailed study.

Eric
> 
> Thanks,
> Alex
>>  
>>  /* cross the end of memory ? */
>>  if (base + size < base)
> 



Re: [PATCH v4] certs: Add EFI_CERT_X509_GUID support for dbx entries

2021-01-13 Thread Eric Snowberg


> On Jan 13, 2021, at 1:41 PM, Jarkko Sakkinen 
>  wrote:
> 
> On Tue, Jan 12, 2021 at 02:57:39PM +, David Howells wrote:
>> Eric Snowberg  wrote:
>> 
>>>> On Dec 10, 2020, at 2:49 AM, David Howells  wrote:
>>>> 
>>>> Eric Snowberg  wrote:
>>>> 
>>>>> Add support for EFI_CERT_X509_GUID dbx entries. When a EFI_CERT_X509_GUID
>>>>> is found, it is added as an asymmetrical key to the .blacklist keyring.
>>>>> Anytime the .platform keyring is used, the keys in the .blacklist keyring
>>>>> are referenced, if a matching key is found, the key will be rejected.
>>>> 
>>>> Ummm...  Why this way and not as a blacklist key which takes up less space?
>>>> I'm guessing that you're using the key chain matching logic.  We really 
>>>> only
>>>> need to blacklist the key IDs.
>>> 
>>> I implemented it this way so that certs in the dbx would only impact 
>>> the .platform keyring. I was under the impression we didn’t want to have 
>>> Secure Boot UEFI db/dbx certs dictate keyring functionality within the 
>>> kernel
>>> itself. Meaning if we have a matching dbx cert in any other keyring 
>>> (builtin,
>>> secondary, ima, etc.), it would be allowed. If that is not how you’d like 
>>> to 
>>> see it done, let me know and I’ll make the change.
>> 
>> I wonder if that is that the right thing to do.  I guess this is a policy
>> decision and may depend on the particular user.
> 
> Why would you want to allow dbx entry in any keyring?

Today, DB and MOK certs go into the platform keyring.  These certs are only
referenced during kexec.  They can’t be used for other things like validating
kernel module signatures.  If we follow the same pattern, the DBX and MOKX 
entries
in the blacklist keyring should only impact kexec. 

Currently, Mickaël Salaün has another outstanding series to allow root to 
update 
the blacklist keyring.  I assume the use case for this is around certificates 
used 
within the kernel, for example revoking kernel module signatures.  The question 
I have
is, should another keyring be introduced?  One that carries DBX and MOKX, which 
just
correspond to certs/hashes in the platform keyring; this keyring would only be
referenced for kexec, just like the platform keyring is today. Then, the current
blacklist keyring would be used for everything internal to the kernel.



Re: [PATCH] tcp: fix TCP_USER_TIMEOUT with zero window

2021-01-13 Thread Eric Dumazet
On Wed, Jan 13, 2021 at 9:12 PM Enke Chen  wrote:
>
> From: Enke Chen 
>
> The TCP session does not terminate with TCP_USER_TIMEOUT when data
> remain untransmitted due to zero window.
>
> The number of unanswered zero-window probes (tcp_probes_out) is
> reset to zero with incoming acks irrespective of the window size,
> as described in tcp_probe_timer():
>
> RFC 1122 4.2.2.17 requires the sender to stay open indefinitely
> as long as the receiver continues to respond probes. We support
> this by default and reset icsk_probes_out with incoming ACKs.
>
> This counter, however, is the wrong one to be used in calculating the
> duration that the window remains closed and data remain untransmitted.
> Thanks to Jonathan Maxwell  for diagnosing the
> actual issue.
>
> In this patch a separate counter is introduced to track the number of
> zero-window probes that are not answered with any non-zero window ack.
> This new counter is used in determining when to abort the session with
> TCP_USER_TIMEOUT.
>

I think one possible issue would be that local congestion (full qdisc)
would abort early,
because tcp_model_timeout() assumes linear backoff.

Neal or Yuchung can further comment on that, it is late for me in France.

packetdrill test would be :

   0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
   +0 bind(3, ..., ...) = 0
   +0 listen(3, 1) = 0


   +0 < S 0:0(0) win 0 
   +0 > S. 0:0(0) ack 1 

  +.1 < . 1:1(0) ack 1 win 65530
   +0 accept(3, ..., ...) = 4

   +0 setsockopt(4, SOL_TCP, TCP_USER_TIMEOUT, [3000], 4) = 0
   +0 write(4, ..., 24) = 24
   +0 > P. 1:25(24) ack 1
   +.1 < . 1:1(0) ack 25 win 65530
   +0 %{ assert tcpi_probes == 0, tcpi_probes; \
 assert tcpi_backoff == 0, tcpi_backoff }%

// install a qdisc dropping all packets
   +0 `tc qdisc delete dev tun0 root 2>/dev/null ; tc qdisc add dev
tun0 root pfifo limit 0`
   +0 write(4, ..., 24) = 24
   // When qdisc is congested we retry every 500ms therefore in theory
   // we'd retry 6 times before hitting 3s timeout. However, since we
   // estimate the elapsed time based on exp backoff of actual RTO (300ms),
   // we'd bail earlier with only 3 probes.
   +2.1 write(4, ..., 24) = -1
   +0 %{ assert tcpi_probes == 3, tcpi_probes; \
 assert tcpi_backoff == 0, tcpi_backoff }%
   +0 close(4) = 0

> Cc: sta...@vger.kernel.org
> Fixes: 9721e709fa68 ("tcp: simplify window probe aborting on USER_TIMEOUT")
> Reported-by: William McCall 
> Signed-off-by: Enke Chen 
> ---
>  include/linux/tcp.h   | 5 +
>  net/ipv4/tcp.c| 1 +
>  net/ipv4/tcp_input.c  | 3 ++-
>  net/ipv4/tcp_output.c | 2 ++
>  net/ipv4/tcp_timer.c  | 5 +++--
>  5 files changed, 13 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index 2f87377e9af7..c9415b30fa67 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -352,6 +352,11 @@ struct tcp_sock {
>
> int linger2;
>
> +   /* While icsk_probes_out is for unanswered 0 window probes, this
> +* counter is for 0-window probes that are not answered with any
> +* non-zero window (nzw) acks.
> +*/
> +   u8  probes_nzw;
>
>  /* Sock_ops bpf program related variables */
>  #ifdef CONFIG_BPF
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index ed42d2193c5c..af6a41a5a5ac 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -2940,6 +2940,7 @@ int tcp_disconnect(struct sock *sk, int flags)
> icsk->icsk_rto = TCP_TIMEOUT_INIT;
> icsk->icsk_rto_min = TCP_RTO_MIN;
> icsk->icsk_delack_max = TCP_DELACK_MAX;
> +   tp->probes_nzw = 0;
> tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
> tp->snd_cwnd = TCP_INIT_CWND;
> tp->snd_cwnd_cnt = 0;
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index c7e16b0ed791..4812a969c18a 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -3377,13 +3377,14 @@ static void tcp_ack_probe(struct sock *sk)
>  {
> struct inet_connection_sock *icsk = inet_csk(sk);
> struct sk_buff *head = tcp_send_head(sk);
> -   const struct tcp_sock *tp = tcp_sk(sk);
> +   struct tcp_sock *tp = tcp_sk(sk);
>
> /* Was it a usable window open? */
> if (!head)
> return;
> if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
> icsk->icsk_backoff = 0;
> +   tp->probes_nzw = 0;
> inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
> /* Socket must be waked up by subsequent tcp_data_snd_check().
>  * This function is not for random using!
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index f322e798a351..1b64cdabc299 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -4084,10 +4084,12 @@ void tcp_send_probe0(struct sock *sk)
> /* Cancel probe timer, if it is not required. */
> 

Re: [PATCH 3/9] KVM: arm64: vgic-v3: Fix error handling in vgic_v3_set_redist_base()

2021-01-13 Thread Auger Eric
Hi Marc,

On 12/28/20 4:35 PM, Marc Zyngier wrote:
> Hi Eric,
> 
> On Sat, 12 Dec 2020 18:50:04 +0000,
> Eric Auger  wrote:
>>
>> vgic_register_all_redist_iodevs may succeed while
>> vgic_register_all_redist_iodevs fails. For example this can happen
> 
> The same function cannot both fail and succeed ;-) Can you shed some
> light on what you had in mind?

Damn, I meant vgic_v3_insert_redist_region() can be successful and then
vgic_register_all_redist_iodevs() fails due to detection of overlap.
> 
>> while adding a redistributor region overlapping a dist region. The
>> failure only is detected on vgic_register_all_redist_iodevs when
>> vgic_v3_check_base() gets called.
>>
>> In such a case, remove the newly added redistributor region and free
>> it.
>>
>> Signed-off-by: Eric Auger 
>> ---
>>  arch/arm64/kvm/vgic/vgic-mmio-v3.c | 8 +++-
>>  1 file changed, 7 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c 
>> b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> index 8e8a862def76..581f0f49 100644
>> --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
>> @@ -866,8 +866,14 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, 
>> u64 addr, u32 count)
>>   * afterwards will register the iodevs when needed.
>>   */
>>  ret = vgic_register_all_redist_iodevs(kvm);
>> -if (ret)
>> +if (ret) {
>> +struct vgic_redist_region *rdreg =
>> +    vgic_v3_rdist_region_from_index(kvm, index);
>> +
> 
> nit: consider splitting declaration and assignment so that we avoid
> the line split if you insist on the 80 character limit.
Sure

Thanks

Eric
> 
>> +list_del(&rdreg->list);
>> +kfree(rdreg);
>>  return ret;
>> +}
>>  
>>  return 0;
>>  }
>> -- 
>> 2.21.3
>>
>>
> 
> Thanks,
> 
>   M.
> 



Re: [PATCH 6/9] docs: kvm: devices/arm-vgic-v3: enhance KVM_DEV_ARM_VGIC_CTRL_INIT doc

2021-01-13 Thread Auger Eric
Hi Alexandru,

On 1/12/21 4:39 PM, Alexandru Elisei wrote:
> Hi Eric,
> 
> On 12/12/20 6:50 PM, Eric Auger wrote:
>> kvm_arch_vcpu_precreate() returns -EBUSY if the vgic is
>> already initialized. So let's document that KVM_DEV_ARM_VGIC_CTRL_INIT
>> must be called after all vcpu creations.
> 
> Checked and this is indeed the case,
> kvm_vm_ioctl_create_vcpu()->kvm_arch_vcpu_precreate() returns -EBUSY is
> vgic_initialized() is true.
thanks!
> 
>>
>> Signed-off-by: Eric Auger 
>> ---
>>  Documentation/virt/kvm/devices/arm-vgic-v3.rst | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/Documentation/virt/kvm/devices/arm-vgic-v3.rst 
>> b/Documentation/virt/kvm/devices/arm-vgic-v3.rst
>> index 5dd3bff51978..322de6aebdec 100644
>> --- a/Documentation/virt/kvm/devices/arm-vgic-v3.rst
>> +++ b/Documentation/virt/kvm/devices/arm-vgic-v3.rst
>> @@ -228,7 +228,7 @@ Groups:
>>  
>>  KVM_DEV_ARM_VGIC_CTRL_INIT
>>request the initialization of the VGIC, no additional parameter in
>> -  kvm_device_attr.addr.
>> +  kvm_device_attr.addr. Must be called after all vcpu creations.
> 
> Nitpick here: the document writes VCPU with all caps. This also sounds a bit
> weird, I think something like "Must be called after all VCPUs have been 
> created"
> is clearer.
I took your suggestion.

Thanks

Eric
> 
> Thanks,
> Alex
>>  KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES
>>save all LPI pending bits into guest RAM pending tables.
>>  
> 



Re: [PATCH 4/9] KVM: arm/arm64: vgic: Reset base address on kvm_vgic_dist_destroy()

2021-01-13 Thread Auger Eric
Hi Marc,

On 12/28/20 4:41 PM, Marc Zyngier wrote:
> On Sat, 12 Dec 2020 18:50:05 +,
> Eric Auger  wrote:
>>
>> On vgic_dist_destroy(), the addresses are not reset. However for
>> kvm selftest purpose this would allow to continue the test execution
>> even after a failure when running KVM_RUN. So let's reset the
>> base addresses.
>>
>> Signed-off-by: Eric Auger 
>> ---
>>  arch/arm64/kvm/vgic/vgic-init.c | 7 +--
>>  1 file changed, 5 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-init.c 
>> b/arch/arm64/kvm/vgic/vgic-init.c
>> index 32e32d67a127..6147bed56b1b 100644
>> --- a/arch/arm64/kvm/vgic/vgic-init.c
>> +++ b/arch/arm64/kvm/vgic/vgic-init.c
>> @@ -335,14 +335,16 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
>>  kfree(dist->spis);
>>  dist->spis = NULL;
>>  dist->nr_spis = 0;
>> +dist->vgic_dist_base = VGIC_ADDR_UNDEF;
>>  
>> -if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
>> +if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
>>  list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) {
>>  list_del(&rdreg->list);
>>  kfree(rdreg);
>>  }
>>  INIT_LIST_HEAD(&dist->rd_regions);
>> -}
>> +} else
>> +    kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
> 
> Since you have converted the hunk above to use dist->, you could do
> the same thing here. And the coding style dictates that you need {} on
> the else side as well.
sure

Thanks

Eric
> 
>>
>>  if (vgic_has_its(kvm))
>>  vgic_lpi_translation_cache_destroy(kvm);
>> @@ -362,6 +364,7 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
>>  vgic_flush_pending_lpis(vcpu);
>>  
>>  INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
>> +vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
>>  }
>>  
>>  /* To be called with kvm->lock held */
>> -- 
>> 2.21.3
>>
>>
> 
> Thanks,
> 
>   M.
> 



Re: [PATCH 2/9] KVM: arm64: Fix KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION read

2021-01-13 Thread Auger Eric
Hi Alexandru,

On 1/6/21 6:12 PM, Alexandru Elisei wrote:
> Hi Eric,
> 
> The patch looks correct to me. kvm_vgic_addr() masks out all the bits except 
> index
> from addr, so we don't need to do it in vgic_get_common_attr():
> 
> Reviewed-by: Alexandru Elisei 
> 
> One nitpick below.
> 
> On 12/12/20 6:50 PM, Eric Auger wrote:
>> The doc says:
>> "The characteristics of a specific redistributor region can
>>  be read by presetting the index field in the attr data.
>>  Only valid for KVM_DEV_TYPE_ARM_VGIC_V3"
>>
>> Unfortunately the existing code fails to read the input attr data
>> and thus the index always is 0.
> 
> addr is allocated on the stack, I don't think it will always be 0.
I removed this statement in the commit message. Thanks!

Eric
> 
> Thanks,
> Alex
>>
>> Fixes: 04c110932225 ("KVM: arm/arm64: Implement 
>> KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION")
>> Cc: sta...@vger.kernel.org#v4.17+
>> Signed-off-by: Eric Auger 
>> ---
>>  arch/arm64/kvm/vgic/vgic-kvm-device.c | 3 +++
>>  1 file changed, 3 insertions(+)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c 
>> b/arch/arm64/kvm/vgic/vgic-kvm-device.c
>> index 44419679f91a..2f66cf247282 100644
>> --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
>> +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
>> @@ -226,6 +226,9 @@ static int vgic_get_common_attr(struct kvm_device *dev,
>>  u64 addr;
>>  unsigned long type = (unsigned long)attr->attr;
>>  
>> +if (copy_from_user(&addr, uaddr, sizeof(addr)))
>> +return -EFAULT;
>> +
>>  r = kvm_vgic_addr(dev->kvm, type, &addr, false);
>>  if (r)
>>  return (r == -ENODEV) ? -ENXIO : r;
> 



Re: [PATCH 7/9] KVM: arm64: Simplify argument passing to vgic_uaccess_[read|write]

2021-01-13 Thread Auger Eric
Hi Alexandru,
On 1/12/21 5:16 PM, Alexandru Elisei wrote:
> Hi Eric,
> 
> On 1/12/21 4:04 PM, Alexandru Elisei wrote:
>> Hi Eric,
>>
>> On 12/12/20 6:50 PM, Eric Auger wrote:
>>> Instead of converting the vgic_io_device handle to a kvm_io_device
>>> handled and then do the oppositive, pass a vgic_io_device pointer all
>>> along the call chain.
>> To me, it looks like the commit message describes what the patch does 
>> instead of
>> why it does it.
>>
>> What are "vgic_io_device handle" and "kvm_io_device handled"?
Yes unfortunate typo, sorry.
> 
> Sorry, I think I got it now. You were referring to the argument types struct
> vgic_io_device and struct kvm_io_device. The patch looks like a very good 
> cleanup.
> 
> How changing to commit message to sound something like this (feel free to
> ignore/change it if you think of something else):
> 
> vgic_uaccess() takes a struct vgic_io_device argument, converts it to a struct
> kvm_io_device and passes it to the read/write accessor functions, which 
> convert it
> back to a struct vgic_io_device. Avoid the indirection by passing the struct
> vgic_io_device argument directly to vgic_uaccess_{read,write).
I reworded the commit message as you suggested.

Thanks

Eric
> 
> Thanks,
> Alex
>>
>> Thanks,
>> Alex
>>> Signed-off-by: Eric Auger 
>>> ---
>>>  arch/arm64/kvm/vgic/vgic-mmio.c | 10 --
>>>  1 file changed, 4 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c 
>>> b/arch/arm64/kvm/vgic/vgic-mmio.c
>>> index b2d73fc0d1ef..48c6067fc5ec 100644
>>> --- a/arch/arm64/kvm/vgic/vgic-mmio.c
>>> +++ b/arch/arm64/kvm/vgic/vgic-mmio.c
>>> @@ -938,10 +938,9 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct 
>>> vgic_io_device *iodev,
>>> return region;
>>>  }
>>>  
>>> -static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device 
>>> *dev,
>>> +static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct vgic_io_device 
>>> *iodev,
>>>  gpa_t addr, u32 *val)
>>>  {
>>> -   struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
>>> const struct vgic_register_region *region;
>>> struct kvm_vcpu *r_vcpu;
>>>  
>>> @@ -960,10 +959,9 @@ static int vgic_uaccess_read(struct kvm_vcpu *vcpu, 
>>> struct kvm_io_device *dev,
>>> return 0;
>>>  }
>>>  
>>> -static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device 
>>> *dev,
>>> +static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct vgic_io_device 
>>> *iodev,
>>>   gpa_t addr, const u32 *val)
>>>  {
>>> -   struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
>>> const struct vgic_register_region *region;
>>> struct kvm_vcpu *r_vcpu;
>>>  
>>> @@ -986,9 +984,9 @@ int vgic_uaccess(struct kvm_vcpu *vcpu, struct 
>>> vgic_io_device *dev,
>>>  bool is_write, int offset, u32 *val)
>>>  {
>>> if (is_write)
>>> -   return vgic_uaccess_write(vcpu, &dev->dev, offset, val);
>>> +   return vgic_uaccess_write(vcpu, dev, offset, val);
>>> else
>>> -   return vgic_uaccess_read(vcpu, &dev->dev, offset, val);
>>> +   return vgic_uaccess_read(vcpu, dev, offset, val);
>>>  }
>>>  
>>>  static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device 
>>> *dev,
>> ___
>> kvmarm mailing list
>> kvm...@lists.cs.columbia.edu
>> https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
> 



Re: [PATCH net-next 0/5] skbuff: introduce skbuff_heads bulking and reusing

2021-01-13 Thread Eric Dumazet
On Wed, Jan 13, 2021 at 6:03 PM Jakub Kicinski  wrote:
>
> On Wed, 13 Jan 2021 05:46:05 +0100 Eric Dumazet wrote:
> > On Wed, Jan 13, 2021 at 2:02 AM Jakub Kicinski  wrote:
> > >
> > > On Tue, 12 Jan 2021 13:23:16 +0100 Eric Dumazet wrote:
> > > > On Tue, Jan 12, 2021 at 12:08 PM Alexander Lobakin  
> > > > wrote:
> > > > >
> > > > > From: Edward Cree 
> > > > > Date: Tue, 12 Jan 2021 09:54:04 +
> > > > >
> > > > > > Without wishing to weigh in on whether this caching is a good 
> > > > > > idea...
> > > > >
> > > > > Well, we already have a cache to bulk flush "consumed" skbs, although
> > > > > kmem_cache_free() is generally lighter than kmem_cache_alloc(), and
> > > > > a page frag cache to allocate skb->head that is also bulking the
> > > > > operations, since it contains a (compound) page with the size of
> > > > > min(SZ_32K, PAGE_SIZE).
> > > > > If they wouldn't give any visible boosts, I think they wouldn't hit
> > > > > mainline.
> > > > >
> > > > > > Wouldn't it be simpler, rather than having two separate "alloc" and 
> > > > > > "flush"
> > > > > >  caches, to have a single larger cache, such that whenever it 
> > > > > > becomes full
> > > > > >  we bulk flush the top half, and when it's empty we bulk alloc the 
> > > > > > bottom
> > > > > >  half?  That should mean fewer branches, fewer instructions etc. 
> > > > > > than
> > > > > >  having to decide which cache to act upon every time.
> > > > >
> > > > > I though about a unified cache, but couldn't decide whether to flush
> > > > > or to allocate heads and how much to process. Your suggestion answers
> > > > > these questions and generally seems great. I'll try that one, thanks!
> > > >
> > > > The thing is : kmalloc() is supposed to have batches already, and nice
> > > > per-cpu caches.
> > > >
> > > > This looks like an mm issue, are we sure we want to get over it ?
> > > >
> > > > I would like a full analysis of why SLAB/SLUB does not work well for
> > > > your test workload.
> > >
> > > +1, it does feel like we're getting into mm territory
> >
> > I read the existing code, and with Edward Cree idea of reusing the
> > existing cache (storage of pointers),
> > ths now all makes sense, since there will be not much added code (and
> > new storage of 64 pointers)
> >
> > The remaining issue is to make sure KASAN will still work, we need
> > this to detect old and new bugs.
>
> IDK much about MM, but we already have a kmem_cache for skbs and now
> we're building a cache on top of a cache.  Shouldn't MM take care of
> providing a per-CPU BH-only lockless cache?

I think part of the improvement comes from bulk operations, which are
provided by mm layer.

I also note Alexander made no provision for NUMA awareness.
Probably reusing skb located on a remote node will not be ideal.


Re: [RFC PATCH v2 1/8] Use atomic type for ucounts reference counting

2021-01-13 Thread Eric W. Biederman
Alexey Gladkov  writes:

We might want to use refcount_t instead of atomic_t.  Not a big deal
either way.

> Signed-off-by: Alexey Gladkov 
> ---
>  include/linux/user_namespace.h |  2 +-
>  kernel/ucount.c| 10 +-
>  2 files changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> index 64cf8ebdc4ec..84fefa9247c4 100644
> --- a/include/linux/user_namespace.h
> +++ b/include/linux/user_namespace.h
> @@ -92,7 +92,7 @@ struct ucounts {
>   struct hlist_node node;
>   struct user_namespace *ns;
>   kuid_t uid;
> - int count;
> + atomic_t count;
>   atomic_t ucount[UCOUNT_COUNTS];
>  };
>  
> diff --git a/kernel/ucount.c b/kernel/ucount.c
> index 11b1596e2542..0f2c7c11df19 100644
> --- a/kernel/ucount.c
> +++ b/kernel/ucount.c
> @@ -141,7 +141,8 @@ static struct ucounts *get_ucounts(struct user_namespace 
> *ns, kuid_t uid)
>  
>   new->ns = ns;
>   new->uid = uid;
> - new->count = 0;
> +
> + atomic_set(&new->count, 0);
>  
>   spin_lock_irq(&ucounts_lock);
>   ucounts = find_ucounts(ns, uid, hashent);
> @@ -152,10 +153,10 @@ static struct ucounts *get_ucounts(struct 
> user_namespace *ns, kuid_t uid)
>   ucounts = new;
>   }
>   }
> - if (ucounts->count == INT_MAX)
> + if (atomic_read(&ucounts->count) == INT_MAX)
>   ucounts = NULL;
>   else
> - ucounts->count += 1;
> + atomic_inc(&ucounts->count);
>   spin_unlock_irq(&ucounts_lock);
>   return ucounts;
>  }
> @@ -165,8 +166,7 @@ static void put_ucounts(struct ucounts *ucounts)
>   unsigned long flags;
>  
>   spin_lock_irqsave(&ucounts_lock, flags);
> - ucounts->count -= 1;
> - if (!ucounts->count)
> + if (atomic_dec_and_test(&ucounts->count))
>   hlist_del_init(&ucounts->node);
>   else
>   ucounts = NULL;


This can become:
static void put_ucounts(struct ucounts *ucounts)
{
unsigned long flags;

if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) 
{
hlist_del_init(&ucounts->node);
spin_unlock_irqrestore(&ucounts_lock);
kfree(ucounts);
}
}



Re: [RFC PATCH v2 2/8] Add a reference to ucounts for each user

2021-01-13 Thread Eric W. Biederman


The subject is wrong.  This should be:
[RFC PATCH v2 2/8] Add a reference to ucounts for each cred.

Further the explanation could use a little work.  Something along the
lines of:

For RLIMIT_NPROC and some other rlimits the user_struct that holds the
global limit is kept alive for the lifetime of a process by keeping it
in struct cred.  Add a ucounts reference to struct cred, so that
RLIMIT_NPROC can switch from using a per user limit to using a per user
per user namespace limit.

Nits about the code below.

Alexey Gladkov  writes:

> Before this, only the owner of the user namespace had an entry in ucounts.
> This entry addressed the user in the given user namespace.
>
> Now we create such an entry in ucounts for all users in the user namespace.
> Each user has only one entry for each user namespace.
>
> This commit is in preparation for migrating rlimits to ucounts.
>
> Signed-off-by: Alexey Gladkov 
> ---
>  include/linux/cred.h   |  1 +
>  include/linux/user_namespace.h |  2 ++
>  kernel/cred.c  | 17 +++--
>  kernel/ucount.c| 12 +++-
>  kernel/user_namespace.c|  1 +
>  5 files changed, 30 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/cred.h b/include/linux/cred.h
> index 18639c069263..307744fcc387 100644
> --- a/include/linux/cred.h
> +++ b/include/linux/cred.h
> @@ -144,6 +144,7 @@ struct cred {
>  #endif
>   struct user_struct *user;   /* real user ID subscription */
>   struct user_namespace *user_ns; /* user_ns the caps and keyrings are 
> relative to. */
> + struct ucounts *ucounts;
>   struct group_info *group_info;  /* supplementary groups for euid/fsgid 
> */
>   /* RCU deletion */
>   union {
> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> index 84fefa9247c4..483568a56f7f 100644
> --- a/include/linux/user_namespace.h
> +++ b/include/linux/user_namespace.h
> @@ -102,6 +102,8 @@ bool setup_userns_sysctls(struct user_namespace *ns);
>  void retire_userns_sysctls(struct user_namespace *ns);
>  struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum 
> ucount_type type);
>  void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
> +void put_ucounts(struct ucounts *ucounts);
> +void set_cred_ucounts(const struct cred *cred, struct user_namespace *ns, 
> kuid_t uid);
>  
>  #ifdef CONFIG_USER_NS
>  
> diff --git a/kernel/cred.c b/kernel/cred.c
> index 421b1149c651..d19e2e97092c 100644
> --- a/kernel/cred.c
> +++ b/kernel/cred.c
> @@ -119,6 +119,7 @@ static void put_cred_rcu(struct rcu_head *rcu)
>   if (cred->group_info)
>   put_group_info(cred->group_info);
>   free_uid(cred->user);
> + put_ucounts(cred->ucounts);
>   put_user_ns(cred->user_ns);
>   kmem_cache_free(cred_jar, cred);
>  }
> @@ -144,6 +145,9 @@ void __put_cred(struct cred *cred)
>   BUG_ON(cred == current->cred);
>   BUG_ON(cred == current->real_cred);
>  
> + BUG_ON(cred->ucounts == NULL);
> + BUG_ON(cred->ucounts->ns != cred->user_ns);
> +
>   if (cred->non_rcu)
>   put_cred_rcu(&cred->rcu);
>   else
> @@ -271,6 +275,9 @@ struct cred *prepare_creds(void)
>   get_uid(new->user);
>   get_user_ns(new->user_ns);
>  
> + new->ucounts = NULL;
> + set_cred_ucounts(new, new->user_ns, new->euid);
> +
This hunk should be:
atomic_inc(&new->count);

That means you get to skip the lookup by uid and user_ns which while it
should be cheap is completely unnecessary in this case.

>  #ifdef CONFIG_KEYS
>   key_get(new->session_keyring);
>   key_get(new->process_keyring);
> @@ -363,6 +370,7 @@ int copy_creds(struct task_struct *p, unsigned long 
> clone_flags)
>   ret = create_user_ns(new);
>   if (ret < 0)
>   goto error_put;
> + set_cred_ucounts(new, new->user_ns, new->euid);
>   }
>  
>  #ifdef CONFIG_KEYS
> @@ -485,8 +493,11 @@ int commit_creds(struct cred *new)
>* in set_user().
>*/
>   alter_cred_subscribers(new, 2);
> - if (new->user != old->user)
> - atomic_inc(&new->user->processes);
> + if (new->user != old->user || new->user_ns != old->user_ns) {
> + if (new->user != old->user)
> + atomic_inc(&new->user->processes);
> + set_cred_ucounts(new, new->user_ns, new->euid);
> + }
>   rcu_assign_pointer(task->real_cred, new);
>   rcu_assign_pointer(task->cred, new);
>   if (new->user != old->user)
> @@ -661,6 +672,7 @@ void __init cred_init(void)
>   /* allocate a slab in which we can store credentials */
>   cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
>   SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
> + set_cred_ucounts(&init_cred, &init_user_ns, GLOBAL_ROOT_UID);
Unfortuantely this is needed here because this is the first cred
and there is no ucount

Re: [PATCH v13 00/15] SMMUv3 Nested Stage Setup (IOMMU part)

2021-01-13 Thread Auger Eric
Hi Shameer,

On 1/8/21 6:05 PM, Shameerali Kolothum Thodi wrote:
> Hi Eric,
> 
>> -Original Message-----
>> From: Eric Auger [mailto:eric.au...@redhat.com]
>> Sent: 18 November 2020 11:22
>> To: eric.auger@gmail.com; eric.au...@redhat.com;
>> io...@lists.linux-foundation.org; linux-kernel@vger.kernel.org;
>> k...@vger.kernel.org; kvm...@lists.cs.columbia.edu; w...@kernel.org;
>> j...@8bytes.org; m...@kernel.org; robin.mur...@arm.com;
>> alex.william...@redhat.com
>> Cc: jean-phili...@linaro.org; zhangfei@linaro.org;
>> zhangfei@gmail.com; vivek.gau...@arm.com; Shameerali Kolothum
>> Thodi ;
>> jacob.jun@linux.intel.com; yi.l@intel.com; t...@semihalf.com;
>> nicoleots...@gmail.com; yuzenghui 
>> Subject: [PATCH v13 00/15] SMMUv3 Nested Stage Setup (IOMMU part)
>>
>> This series brings the IOMMU part of HW nested paging support
>> in the SMMUv3. The VFIO part is submitted separately.
>>
>> The IOMMU API is extended to support 2 new API functionalities:
>> 1) pass the guest stage 1 configuration
>> 2) pass stage 1 MSI bindings
>>
>> Then those capabilities gets implemented in the SMMUv3 driver.
>>
>> The virtualizer passes information through the VFIO user API
>> which cascades them to the iommu subsystem. This allows the guest
>> to own stage 1 tables and context descriptors (so-called PASID
>> table) while the host owns stage 2 tables and main configuration
>> structures (STE).
> 
> I am seeing an issue with Guest testpmd run with this series.
> I have two different setups and testpmd works fine with the
> first one but not with the second.
> 
> 1). Guest doesn't have kernel driver built-in for pass-through dev.
> 
> root@ubuntu:/# lspci -v
> ...
> 00:02.0 Ethernet controller: Huawei Technologies Co., Ltd. Device a22e (rev 
> 21)
> Subsystem: Huawei Technologies Co., Ltd. Device 
> Flags: fast devsel
> Memory at 800010 (64-bit, prefetchable) [disabled] [size=64K]
> Memory at 80 (64-bit, prefetchable) [disabled] [size=1M]
> Capabilities: [40] Express Root Complex Integrated Endpoint, MSI 00
> Capabilities: [a0] MSI-X: Enable- Count=67 Masked-
> Capabilities: [b0] Power Management version 3
> Capabilities: [100] Access Control Services
> Capabilities: [300] Transaction Processing Hints
> 
> root@ubuntu:/# echo vfio-pci > 
> /sys/bus/pci/devices/:00:02.0/driver_override
> root@ubuntu:/# echo :00:02.0 > /sys/bus/pci/drivers_probe
> 
> root@ubuntu:/mnt/dpdk/build/app# ./testpmd -w :00:02.0 --file-prefix 
> socket0  -l 0-1 -n 2 -- -i
> EAL: Detected 8 lcore(s)
> EAL: Detected 1 NUMA nodes
> EAL: Multi-process socket /var/run/dpdk/socket0/mp_socket
> EAL: Selected IOVA mode 'VA'
> EAL: No available hugepages reported in hugepages-32768kB
> EAL: No available hugepages reported in hugepages-64kB
> EAL: No available hugepages reported in hugepages-1048576kB
> EAL: Probing VFIO support...
> EAL: VFIO support initialized
> EAL:   Invalid NUMA socket, default to 0
> EAL:   using IOMMU type 1 (Type 1)
> EAL: Probe PCI driver: net_hns3_vf (19e5:a22e) device: :00:02.0 (socket 0)
> EAL: No legacy callbacks, legacy socket not created
> Interactive-mode selected
> testpmd: create a new mbuf pool : n=155456, size=2176, 
> socket=0
> testpmd: preferred mempool ops selected: ring_mp_mc
> 
> Warning! port-topology=paired and odd forward ports number, the last port 
> will pair with itself.
> 
> Configuring Port 0 (socket 0)
> Port 0: 8E:A6:8C:43:43:45
> Checking link statuses...
> Done
> testpmd>
> 
> 2). Guest have kernel driver built-in for pass-through dev.
> 
> root@ubuntu:/# lspci -v
> ...
> 00:02.0 Ethernet controller: Huawei Technologies Co., Ltd. Device a22e (rev 
> 21)
> Subsystem: Huawei Technologies Co., Ltd. Device 
> Flags: bus master, fast devsel, latency 0
> Memory at 800010 (64-bit, prefetchable) [size=64K]
> Memory at 80 (64-bit, prefetchable) [size=1M]
> Capabilities: [40] Express Root Complex Integrated Endpoint, MSI 00
> Capabilities: [a0] MSI-X: Enable+ Count=67 Masked-
> Capabilities: [b0] Power Management version 3
> Capabilities: [100] Access Control Services
> Capabilities: [300] Transaction Processing Hints
> Kernel driver in use: hns3
> 
> root@ubuntu:/# echo vfio-pci > 
> /sys/bus/pci/devices/:00:02.0/driver_override
> root@ubuntu:/# echo :00:02.0 > /sys/bus/pci/drivers/hns3/unbind
> root@ubuntu:/# echo :00:02.0 > /sys/bus/pci/drivers_probe
> 
> root@ubuntu:/mnt/dpdk/build/app# ./testpmd -w :00:02.0 --file-prefix 
> socket0 -l 0-1 -n 2 -- -i
> EAL: Detected 8 lcore(s)
>

Re: [PATCH v2 net-next 2/3] skbuff: (re)use NAPI skb cache on allocation path

2021-01-13 Thread Eric Dumazet
On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin  wrote:
>
> Instead of calling kmem_cache_alloc() every time when building a NAPI
> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously
> this cache was only used for bulk-freeing skbuff_heads consumed via
> napi_consume_skb() or __kfree_skb_defer().
>
> Typical path is:
>  - skb is queued for freeing from driver or stack, its skbuff_head
>goes into the cache instead of immediate freeing;
>  - driver or stack requests NAPI skb allocation, an skbuff_head is
>taken from the cache instead of allocation.
>
> Corner cases:
>  - if it's empty on skb allocation, bulk-allocate the first half;
>  - if it's full on skb consuming, bulk-wipe the second half.
>
> Also try to balance its size after completing network softirqs
> (__kfree_skb_flush()).

I do not see the point of doing this rebalance (especially if we do not change
its name describing its purpose more accurately).

For moderate load, we will have a reduced bulk size (typically one or two).
Number of skbs in the cache is in [0, 64[ , there is really no risk of
letting skbs there for a long period of time.
(32 * sizeof(sk_buff) = 8192)
I would personally get rid of this function completely.


Also it seems you missed my KASAN support request ?
I guess this is a matter of using kasan_unpoison_range(), we can ask for help.




>
> prefetchw() on CONFIG_SLUB is dropped since it makes no sense anymore.
>
> Suggested-by: Edward Cree 
> Signed-off-by: Alexander Lobakin 
> ---
>  net/core/skbuff.c | 54 ++-
>  1 file changed, 35 insertions(+), 19 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index dc3300dc2ac4..f42a3a04b918 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -364,6 +364,7 @@ struct sk_buff *build_skb_around(struct sk_buff *skb,
>  EXPORT_SYMBOL(build_skb_around);
>
>  #define NAPI_SKB_CACHE_SIZE64
> +#define NAPI_SKB_CACHE_HALF(NAPI_SKB_CACHE_SIZE / 2)
>
>  struct napi_alloc_cache {
> struct page_frag_cache page;
> @@ -487,7 +488,15 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
>
>  static struct sk_buff *napi_skb_cache_get(struct napi_alloc_cache *nc)
>  {
> -   return kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
> +   if (unlikely(!nc->skb_count))
> +   nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
> + GFP_ATOMIC,
> + NAPI_SKB_CACHE_HALF,
> + nc->skb_cache);
> +   if (unlikely(!nc->skb_count))
> +   return NULL;
> +
> +   return nc->skb_cache[--nc->skb_count];
>  }
>
>  /**
> @@ -867,40 +876,47 @@ void __consume_stateless_skb(struct sk_buff *skb)
>  void __kfree_skb_flush(void)
>  {
> struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
> +   size_t count;
> +   void **ptr;
> +
> +   if (unlikely(nc->skb_count == NAPI_SKB_CACHE_HALF))
> +   return;
> +
> +   if (nc->skb_count > NAPI_SKB_CACHE_HALF) {
> +   count = nc->skb_count - NAPI_SKB_CACHE_HALF;
> +   ptr = nc->skb_cache + NAPI_SKB_CACHE_HALF;
>
> -   /* flush skb_cache if containing objects */
> -   if (nc->skb_count) {
> -   kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
> -nc->skb_cache);
> -   nc->skb_count = 0;
> +   kmem_cache_free_bulk(skbuff_head_cache, count, ptr);
> +   nc->skb_count = NAPI_SKB_CACHE_HALF;
> +   } else {
> +   count = NAPI_SKB_CACHE_HALF - nc->skb_count;
> +   ptr = nc->skb_cache + nc->skb_count;
> +
> +   nc->skb_count += kmem_cache_alloc_bulk(skbuff_head_cache,
> +  GFP_ATOMIC, count,
> +  ptr);
> }
>  }
>


Re: [PATCH net-next 0/5] skbuff: introduce skbuff_heads bulking and reusing

2021-01-12 Thread Eric Dumazet
On Wed, Jan 13, 2021 at 2:02 AM Jakub Kicinski  wrote:
>
> On Tue, 12 Jan 2021 13:23:16 +0100 Eric Dumazet wrote:
> > On Tue, Jan 12, 2021 at 12:08 PM Alexander Lobakin  wrote:
> > >
> > > From: Edward Cree 
> > > Date: Tue, 12 Jan 2021 09:54:04 +
> > >
> > > > Without wishing to weigh in on whether this caching is a good idea...
> > >
> > > Well, we already have a cache to bulk flush "consumed" skbs, although
> > > kmem_cache_free() is generally lighter than kmem_cache_alloc(), and
> > > a page frag cache to allocate skb->head that is also bulking the
> > > operations, since it contains a (compound) page with the size of
> > > min(SZ_32K, PAGE_SIZE).
> > > If they wouldn't give any visible boosts, I think they wouldn't hit
> > > mainline.
> > >
> > > > Wouldn't it be simpler, rather than having two separate "alloc" and 
> > > > "flush"
> > > >  caches, to have a single larger cache, such that whenever it becomes 
> > > > full
> > > >  we bulk flush the top half, and when it's empty we bulk alloc the 
> > > > bottom
> > > >  half?  That should mean fewer branches, fewer instructions etc. than
> > > >  having to decide which cache to act upon every time.
> > >
> > > I though about a unified cache, but couldn't decide whether to flush
> > > or to allocate heads and how much to process. Your suggestion answers
> > > these questions and generally seems great. I'll try that one, thanks!
> >
> > The thing is : kmalloc() is supposed to have batches already, and nice
> > per-cpu caches.
> >
> > This looks like an mm issue, are we sure we want to get over it ?
> >
> > I would like a full analysis of why SLAB/SLUB does not work well for
> > your test workload.
>
> +1, it does feel like we're getting into mm territory

I read the existing code, and with Edward Cree idea of reusing the
existing cache (storage of pointers),
ths now all makes sense, since there will be not much added code (and
new storage of 64 pointers)

The remaining issue is to make sure KASAN will still work, we need
this to detect old and new bugs.

Thanks !


Re: [PATCH] tcp: keepalive fixes

2021-01-12 Thread Eric Dumazet
On Tue, Jan 12, 2021 at 11:48 PM Yuchung Cheng  wrote:
>
> On Tue, Jan 12, 2021 at 2:31 PM Enke Chen  wrote:
> >
> > From: Enke Chen 
> >
> > In this patch two issues with TCP keepalives are fixed:
> >
> > 1) TCP keepalive does not timeout when there are data waiting to be
> >delivered and then the connection got broken. The TCP keepalive
> >timeout is not evaluated in that condition.
> hi enke
> Do you have an example to demonstrate this issue -- in theory when
> there is data inflight, an RTO timer should be pending (which
> considers user-timeout setting). based on the user-timeout description
> (man tcp), the user timeout should abort the socket per the specified
> time after data commences. some data would help to understand the
> issue.
>

+1

A packetdrill test would be ideal.

Also, given that there is this ongoing issue with TCP_USER_TIMEOUT,
lets not mix things
or risk added work for backports to stable versions.


[PATCH RESEND] random: fix the RNDRESEEDCRNG ioctl

2021-01-12 Thread Eric Biggers
From: Eric Biggers 

The RNDRESEEDCRNG ioctl reseeds the primary_crng from itself, which
doesn't make sense.  Reseed it from the input_pool instead.

Fixes: d848e5f8e1eb ("random: add new ioctl RNDRESEEDCRNG")
Cc: sta...@vger.kernel.org
Cc: linux-cry...@vger.kernel.org
Cc: Andy Lutomirski 
Cc: Jann Horn 
Cc: Theodore Ts'o 
Reviewed-by: Jann Horn 
Signed-off-by: Eric Biggers 
---

Andrew, please consider taking this patch since the maintainer has been
ignoring it for 4 months
(https://lkml.kernel.org/lkml/20200916041908.66649-1-ebigg...@kernel.org/T/#u).


 drivers/char/random.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 5f3b8ac9d97b0..a894c0559a8cf 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1972,7 +1972,7 @@ static long random_ioctl(struct file *f, unsigned int 
cmd, unsigned long arg)
return -EPERM;
if (crng_init < 2)
return -ENODATA;
-   crng_reseed(&primary_crng, NULL);
+   crng_reseed(&primary_crng, &input_pool);
crng_global_init_time = jiffies - 1;
return 0;
default:
-- 
2.30.0



[PATCH RESEND] random: remove dead code left over from blocking pool

2021-01-12 Thread Eric Biggers
From: Eric Biggers 

Remove some dead code that was left over following commit 90ea1c6436d2
("random: remove the blocking pool").

Cc: linux-cry...@vger.kernel.org
Cc: Andy Lutomirski 
Cc: Jann Horn 
Cc: Theodore Ts'o 
Reviewed-by: Andy Lutomirski 
Signed-off-by: Eric Biggers 
---

Andrew, please consider taking this patch since the maintainer has been
ignoring it for 4 months
(https://lkml.kernel.org/lkml/20200916043652.96640-1-ebigg...@kernel.org/T/#u).


 drivers/char/random.c | 17 ++-
 include/trace/events/random.h | 83 ---
 2 files changed, 3 insertions(+), 97 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index a894c0559a8cf..bbc5098b1a81f 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -500,7 +500,6 @@ struct entropy_store {
unsigned short add_ptr;
unsigned short input_rotate;
int entropy_count;
-   unsigned int initialized:1;
unsigned int last_data_init:1;
__u8 last_data[EXTRACT_SIZE];
 };
@@ -660,7 +659,7 @@ static void process_random_ready_list(void)
  */
 static void credit_entropy_bits(struct entropy_store *r, int nbits)
 {
-   int entropy_count, orig, has_initialized = 0;
+   int entropy_count, orig;
const int pool_size = r->poolinfo->poolfracbits;
int nfrac = nbits << ENTROPY_SHIFT;
 
@@ -717,23 +716,14 @@ static void credit_entropy_bits(struct entropy_store *r, 
int nbits)
if (cmpxchg(&r->entropy_count, orig, entropy_count) != orig)
goto retry;
 
-   if (has_initialized) {
-   r->initialized = 1;
-   kill_fasync(&fasync, SIGIO, POLL_IN);
-   }
-
trace_credit_entropy_bits(r->name, nbits,
  entropy_count >> ENTROPY_SHIFT, _RET_IP_);
 
if (r == &input_pool) {
int entropy_bits = entropy_count >> ENTROPY_SHIFT;
 
-   if (crng_init < 2) {
-   if (entropy_bits < 128)
-   return;
+   if (crng_init < 2 && entropy_bits >= 128)
crng_reseed(&primary_crng, r);
-   entropy_bits = ENTROPY_BITS(r);
-   }
}
 }
 
@@ -1385,8 +1375,7 @@ static size_t account(struct entropy_store *r, size_t 
nbytes, int min,
 }
 
 /*
- * This function does the actual extraction for extract_entropy and
- * extract_entropy_user.
+ * This function does the actual extraction for extract_entropy.
  *
  * Note: we assume that .poolwords is a multiple of 16 words.
  */
diff --git a/include/trace/events/random.h b/include/trace/events/random.h
index 9570a10cb949b..3d7b432ca5f31 100644
--- a/include/trace/events/random.h
+++ b/include/trace/events/random.h
@@ -85,28 +85,6 @@ TRACE_EVENT(credit_entropy_bits,
  __entry->entropy_count, (void *)__entry->IP)
 );
 
-TRACE_EVENT(push_to_pool,
-   TP_PROTO(const char *pool_name, int pool_bits, int input_bits),
-
-   TP_ARGS(pool_name, pool_bits, input_bits),
-
-   TP_STRUCT__entry(
-   __field( const char *,  pool_name   )
-   __field(  int,  pool_bits   )
-   __field(  int,  input_bits  )
-   ),
-
-   TP_fast_assign(
-   __entry->pool_name  = pool_name;
-   __entry->pool_bits  = pool_bits;
-   __entry->input_bits = input_bits;
-   ),
-
-   TP_printk("%s: pool_bits %d input_pool_bits %d",
- __entry->pool_name, __entry->pool_bits,
- __entry->input_bits)
-);
-
 TRACE_EVENT(debit_entropy,
TP_PROTO(const char *pool_name, int debit_bits),
 
@@ -161,35 +139,6 @@ TRACE_EVENT(add_disk_randomness,
  MINOR(__entry->dev), __entry->input_bits)
 );
 
-TRACE_EVENT(xfer_secondary_pool,
-   TP_PROTO(const char *pool_name, int xfer_bits, int request_bits,
-int pool_entropy, int input_entropy),
-
-   TP_ARGS(pool_name, xfer_bits, request_bits, pool_entropy,
-   input_entropy),
-
-   TP_STRUCT__entry(
-   __field( const char *,  pool_name   )
-   __field(  int,  xfer_bits   )
-   __field(  int,  request_bits)
-   __field(  int,  pool_entropy)
-   __field(  int,  input_entropy   )
-   ),
-
-   TP_fast_assign(
-   __entry->pool_name  = pool_name;
-   __entry->xfer_bits  = xfer_bits;
-   __entry->request_bits   = request_bits;
-   __entry->pool_entropy   = pool_entropy;
-   __entry->input_entropy  = input_entropy;
-   ),
-
-   TP_printk("pool %s xfer_bits %d request

[PATCH RESEND] random: initialize ChaCha20 constants with correct endianness

2021-01-12 Thread Eric Biggers
From: Eric Biggers 

On big endian CPUs, the ChaCha20-based CRNG is using the wrong
endianness for the ChaCha20 constants.

This doesn't matter cryptographically, but technically it means it's not
ChaCha20 anymore.  Fix it to always use the standard constants.

Cc: linux-cry...@vger.kernel.org
Cc: Andy Lutomirski 
Cc: Jann Horn 
Cc: Theodore Ts'o 
Acked-by: Herbert Xu 
Signed-off-by: Eric Biggers 
---

Andrew, please consider taking this patch since the maintainer has been
ignoring it for 4 months
(https://lkml.kernel.org/lkml/20200916045013.142179-1-ebigg...@kernel.org/T/#u).


 drivers/char/random.c   | 4 ++--
 include/crypto/chacha.h | 9 +++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index bbc5098b1a81f..4037a1e0fb748 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -809,7 +809,7 @@ static bool __init crng_init_try_arch_early(struct 
crng_state *crng)
 
 static void __maybe_unused crng_initialize_secondary(struct crng_state *crng)
 {
-   memcpy(&crng->state[0], "expand 32-byte k", 16);
+   chacha_init_consts(crng->state);
_get_random_bytes(&crng->state[4], sizeof(__u32) * 12);
crng_init_try_arch(crng);
crng->init_time = jiffies - CRNG_RESEED_INTERVAL - 1;
@@ -817,7 +817,7 @@ static void __maybe_unused crng_initialize_secondary(struct 
crng_state *crng)
 
 static void __init crng_initialize_primary(struct crng_state *crng)
 {
-   memcpy(&crng->state[0], "expand 32-byte k", 16);
+   chacha_init_consts(crng->state);
_extract_entropy(&input_pool, &crng->state[4], sizeof(__u32) * 12, 0);
if (crng_init_try_arch_early(crng) && trust_cpu) {
invalidate_batched_entropy();
diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h
index 3a1c72fdb7cf5..dabaee6987186 100644
--- a/include/crypto/chacha.h
+++ b/include/crypto/chacha.h
@@ -47,13 +47,18 @@ static inline void hchacha_block(const u32 *state, u32 
*out, int nrounds)
hchacha_block_generic(state, out, nrounds);
 }
 
-void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv);
-static inline void chacha_init_generic(u32 *state, const u32 *key, const u8 
*iv)
+static inline void chacha_init_consts(u32 *state)
 {
state[0]  = 0x61707865; /* "expa" */
state[1]  = 0x3320646e; /* "nd 3" */
state[2]  = 0x79622d32; /* "2-by" */
state[3]  = 0x6b206574; /* "te k" */
+}
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv);
+static inline void chacha_init_generic(u32 *state, const u32 *key, const u8 
*iv)
+{
+   chacha_init_consts(state);
state[4]  = key[0];
state[5]  = key[1];
state[6]  = key[2];
-- 
2.30.0



Re: [PATCH net-next 0/5] skbuff: introduce skbuff_heads bulking and reusing

2021-01-12 Thread Eric Dumazet
On Tue, Jan 12, 2021 at 7:26 PM Alexander Lobakin  wrote:
>
> From: Eric Dumazet 
> Date: Tue, 12 Jan 2021 13:32:56 +0100
>
> > On Tue, Jan 12, 2021 at 11:56 AM Alexander Lobakin  wrote:
> >>
> >
> >>
> >> Ah, I should've mentioned that I use UDP GRO Fraglists, so these
> >> numbers are for GRO.
> >>
> >
> > Right, this suggests UDP GRO fraglist is a pathological case of GRO,
> > not saving memory.
> >
> > Real GRO (TCP in most cases) will consume one skb, and have page
> > fragments for each segment.
> >
> > Having skbs linked together is not cache friendly.
>
> OK, so I rebased test setup a bit to clarify the things out.
>
> I disabled fraglists and GRO/GSO fraglists support advertisement
> in driver to exclude any "pathological" cases and switched it
> from napi_get_frags() + napi_gro_frags() to napi_alloc_skb() +
> napi_gro_receive() to disable local skb reusing (napi_reuse_skb()).
> I also enabled GSO UDP L4 ("classic" one: one skbuff_head + frags)
> for forwarding, not only local traffic, and disabled NF flow offload
> to increase CPU loading and drop performance below link speed so I
> could see the changes.
>
> So, the traffic flows looked like:
>  - TCP GRO (one head + frags) -> NAT -> hardware TSO;
>  - UDP GRO (one head + frags) -> NAT -> driver-side GSO.
>
> Baseline 5.11-rc3:
>  - 865 Mbps TCP, 866 Mbps UDP.
>
> This patch (both separate caches and Edward's unified cache):
>  - 899 Mbps TCP, 893 Mbps UDP.
>
> So that's cleary *not* only "pathological" UDP GRO Fraglists
> "problem" as TCP also got ~35 Mbps from this, as well as
> non-fraglisted UDP.
>
> Regarding latencies: I remember there were talks about latencies when
> Edward introduced batched GRO (using linked lists to pass skbs from
> GRO layer to core stack instead of passing one by one), so I think
> it's a perennial question when it comes to batching/caching.
>
> Thanks for the feedback, will post v2 soon.
> The question about if this caching is reasonable isn't closed anyway,
> but I don't see significant "cons" for now.
>

Also it would be nice to have KASAN support.

We do not want to unconditionally to recycle stuff, since this might
hide use-after-free.


Re: [PATCH v4] certs: Add EFI_CERT_X509_GUID support for dbx entries

2021-01-12 Thread Eric Snowberg


> On Jan 12, 2021, at 10:10 AM, David Howells  wrote:
> 
> How about the attached?

This looks good to me.

> I've changed the function names to something that I
> think reads better, but otherwise it's the same.

I agree, the function name changes you made sound better.

We are starting to see platforms with KEK signed DBX updates containing
certs like this following boothole, so it would be great if we could get
something like this in.  I also had a follow on series that allowed these
certs to be compiled into the kernel.

https://lkml.org/lkml/2020/9/30/1301

I’d appreciate any feedback on that series as well.

Thanks

> David
> ---
> commit 8913866babb96fcfe452aac6042ca8862d4c0b53
> Author: Eric Snowberg 
> Date:   Tue Sep 15 20:49:27 2020 -0400
> 
>certs: Add EFI_CERT_X509_GUID support for dbx entries
> 
>The Secure Boot Forbidden Signature Database, dbx, contains a list of now
>revoked signatures and keys previously approved to boot with UEFI Secure
>Boot enabled.  The dbx is capable of containing any number of
>EFI_CERT_X509_SHA256_GUID, EFI_CERT_SHA256_GUID, and EFI_CERT_X509_GUID
>entries.
> 
>Currently when EFI_CERT_X509_GUID are contained in the dbx, the entries are
>skipped.
> 
>Add support for EFI_CERT_X509_GUID dbx entries. When a EFI_CERT_X509_GUID
>is found, it is added as an asymmetrical key to the .blacklist keyring.
>Anytime the .platform keyring is used, the keys in the .blacklist keyring
>are referenced, if a matching key is found, the key will be rejected.
> 
>Signed-off-by: Eric Snowberg 
>Reviewed-by: Jarkko Sakkinen 
>Signed-off-by: David Howells 
> 
> diff --git a/certs/blacklist.c b/certs/blacklist.c
> index 6514f9ebc943..a7f021878a4b 100644
> --- a/certs/blacklist.c
> +++ b/certs/blacklist.c
> @@ -100,6 +100,38 @@ int mark_hash_blacklisted(const char *hash)
>   return 0;
> }
> 
> +int add_key_to_revocation_list(const char *data, size_t size)
> +{
> + key_ref_t key;
> +
> + key = key_create_or_update(make_key_ref(blacklist_keyring, true),
> +"asymmetric",
> +NULL,
> +data,
> +size,
> +((KEY_POS_ALL & ~KEY_POS_SETATTR) | 
> KEY_USR_VIEW),
> +KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN);
> +
> + if (IS_ERR(key)) {
> + pr_err("Problem with revocation key (%ld)\n", PTR_ERR(key));
> + return PTR_ERR(key);
> + }
> +
> + return 0;
> +}
> +
> +int is_key_on_revocation_list(struct pkcs7_message *pkcs7)
> +{
> + int ret;
> +
> + ret = validate_trust(pkcs7, blacklist_keyring);
> +
> + if (ret == 0)
> + return -EKEYREJECTED;
> +
> + return -ENOKEY;
> +}
> +
> /**
>  * is_hash_blacklisted - Determine if a hash is blacklisted
>  * @hash: The hash to be checked as a binary blob
> diff --git a/certs/blacklist.h b/certs/blacklist.h
> index 1efd6fa0dc60..420bb7c86e07 100644
> --- a/certs/blacklist.h
> +++ b/certs/blacklist.h
> @@ -1,3 +1,15 @@
> #include 
> +#include 
> +#include 
> 
> extern const char __initconst *const blacklist_hashes[];
> +
> +#ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING
> +#define validate_trust pkcs7_validate_trust
> +#else
> +static inline int validate_trust(struct pkcs7_message *pkcs7,
> +  struct key *trust_keyring)
> +{
> + return -ENOKEY;
> +}
> +#endif
> diff --git a/certs/system_keyring.c b/certs/system_keyring.c
> index 798291177186..cc165b359ea3 100644
> --- a/certs/system_keyring.c
> +++ b/certs/system_keyring.c
> @@ -241,6 +241,12 @@ int verify_pkcs7_message_sig(const void *data, size_t 
> len,
>   pr_devel("PKCS#7 platform keyring is not available\n");
>   goto error;
>   }
> +
> + ret = is_key_on_revocation_list(pkcs7);
> + if (ret != -ENOKEY) {
> + pr_devel("PKCS#7 platform key is on revocation list\n");
> + goto error;
> + }
>   }
>   ret = pkcs7_validate_trust(pkcs7, trusted_keys);
>   if (ret < 0) {
> diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h
> index fb8b07daa9d1..61f98739e8b1 100644
> --- a/include/keys/system_keyring.h
> +++ b/include/keys/system_keyring.h
> @@ -31,11 +31,14 @@ extern int restrict_link_by_builtin_and_secondary_trusted(
> #define restrict_link_by_builtin_and_secondary_trusted 
> restrict_link_by_builtin_trus

Re: [PATCH v2 01/10] vfs: move cap_convert_nscap() call into vfs_setxattr()

2021-01-12 Thread Eric W. Biederman
ebied...@xmission.com (Eric W. Biederman) writes:

> So there is the basic question do we want to read the raw bytes on disk
> or do we want to return something meaningful to the reader.  As the
> existing tools use the xattr interface to set/clear fscaps returning
> data to user space rather than raw bytes seems the perfered interface.
>
> My ideal semantics would be:
>
> - If current_user_ns() == sb->s_user_ns return the raw data.
>
>   I don't know how to implement this first scenario while permitting
>   stacked filesystems.

After a little more thought I do.

In getxattr if the get_cap method is not implemented by the
filesystem if current_user_ns() == sb->s_user_ns simply treat it as
an ordinary xattr read/write.

Otherwise call vfs_get_cap and translate the result as described
below.

The key point of this is it allows for seeing what is actually on
disk (when it is not confusing).

> - Calculate the cpu_vfs_cap_data as get_vfs_caps_from_disk does.
>   That gives the meaning of the xattr.
>
> - If "from_kuid(current_userns(), krootid) == 0" return a v2 cap.
>
> - If "rootid_owns_currentns()" return a v2 cap.
>
> - Else return an error.  Probably a permission error.
>
>   The fscap simply can not make sense to the user if the rootid does not
>   map.  Return a v2 cap would imply that the caps are present on the
>   executable (in the current context) which they are not.


Eric


Re: [PATCH v2 01/10] vfs: move cap_convert_nscap() call into vfs_setxattr()

2021-01-12 Thread Eric W. Biederman
Miklos Szeredi  writes:

> On Tue, Jan 12, 2021 at 1:15 AM Eric W. Biederman  
> wrote:
>>
>> Miklos Szeredi  writes:
>>
>> > On Fri, Jan 01, 2021 at 11:35:16AM -0600, Eric W. Biederman wrote:
>
>> > For one: a v2 fscap is supposed to be equivalent to a v3 fscap with a 
>> > rootid of
>> > zero, right?
>>
>> Yes.  This assumes that everything is translated into the uids of the
>> target filesystem.
>>
>> > If so, why does cap_inode_getsecurity() treat them differently (v2 fscap
>> > succeeding unconditionally while v3 one being either converted to v2, 
>> > rejected
>> > or left as v3 depending on current_user_ns())?
>>
>> As I understand it v2 fscaps have always succeeded unconditionally.  The
>> only case I can see for a v2 fscap might not succeed when read is if the
>> filesystem is outside of the initial user namespace.
>
> Looking again, it's rather confusing.  cap_inode_getsecurity()
> currently handles the following cases:
>
> v1: -> fails with -EINVAL
>
> v2: -> returns unconverted xattr
>
> v3:
>  a) rootid is mapped in the current namespace to non-zero:
>  -> convert rootid
>
>  b) rootid owns the current or ancerstor namespace:
>  -> convert to v2
>
>  c) rootid is not mapped and is not owner:
>  -> return -EOPNOTSUPP -> falls back to unconverted v3
>
> So lets take the example, where a tmpfs is created in a private user
> namespace and one file has a v2 cap and the other an equivalent v3 cap
> with a zero rootid.  This is the result when looking at it from
>
> 1) the namespace of the fs:
> ---
> t = cap_dac_override+eip
> tt = cap_dac_override+eip
>
> 2) the initial namespace:
> ---
> t = cap_dac_override+eip
> tt = cap_dac_override+eip [rootid=1000]
>
> 3) an unrelated namespace:
> ---
> t = cap_dac_override+eip
> tt = cap_dac_override+eip
>
> Note: in this last case getxattr will actually return a v3 cap with
> zero rootid for "tt" which getcap does not display due to being zero.
> I could do a setup with a nested namespaces that better demonstrate
> the confusing nature of this, but I think this also proves the point.

Yes.  There is real confusion on the reading case when the namespaces
do not match.

> At this point userspace simply cannot determine whether the returned
> cap is in any way valid or not.
>
> The following semantics would make a ton more sense, since getting a
> v2 would indicate that rootid is unknown:

> - if cap is v2 convert to v3 with zero rootid
> - after this, check if rootid needs to be translated, if not return v3
> - if yes, try to translate to current ns, if succeeds return translated v3
> - if not mappable, return v2
>
> Hmm?

So there is the basic question do we want to read the raw bytes on disk
or do we want to return something meaningful to the reader.  As the
existing tools use the xattr interface to set/clear fscaps returning
data to user space rather than raw bytes seems the perfered interface.

My ideal semantics would be:

- If current_user_ns() == sb->s_user_ns return the raw data.

  I don't know how to implement this first scenario while permitting
  stacked filesystems.
  
- Calculate the cpu_vfs_cap_data as get_vfs_caps_from_disk does.
  That gives the meaning of the xattr.

- If "from_kuid(current_userns(), krootid) == 0" return a v2 cap.

- If "rootid_owns_currentns()" return a v2 cap.

- Else return an error.  Probably a permission error.

  The fscap simply can not make sense to the user if the rootid does not
  map.  Return a v2 cap would imply that the caps are present on the
  executable (in the current context) which they are not.


>> > Anyway, here's a patch that I think fixes getxattr() layering for
>> > security.capability.  Does basically what you suggested.  Slight change of
>> > semantics vs. v1 caps, not sure if that is still needed, 
>> > getxattr()/setxattr()
>> > hasn't worked for these since the introduction of v3 in 4.14.
>> > Untested.
>>
>> Taking a look.  The goal of change how these operate is to make it so
>> that layered filesystems can just pass through the data if they don't
>> want to change anything (even with the user namespaces of the
>> filesystems in question are different).
>>
>> Feedback on the code below:
>> - cap_get should be in inode_operations like get_acl and set_acl.
>
> So it's not clear to me why xattr ops are per-sb and acl ops are per-inode.

I don't know why either.  What I do see is everythi

Re: [PATCH net-next 0/5] skbuff: introduce skbuff_heads bulking and reusing

2021-01-12 Thread Eric Dumazet
On Tue, Jan 12, 2021 at 11:56 AM Alexander Lobakin  wrote:
>

>
> Ah, I should've mentioned that I use UDP GRO Fraglists, so these
> numbers are for GRO.
>

Right, this suggests UDP GRO fraglist is a pathological case of GRO,
not saving memory.

Real GRO (TCP in most cases) will consume one skb, and have page
fragments for each segment.

Having skbs linked together is not cache friendly.

So I would try first to make this case better, instead of trying to
work around the real issue.


Re: [PATCH net-next 0/5] skbuff: introduce skbuff_heads bulking and reusing

2021-01-12 Thread Eric Dumazet
On Tue, Jan 12, 2021 at 12:08 PM Alexander Lobakin  wrote:
>
> From: Edward Cree 
> Date: Tue, 12 Jan 2021 09:54:04 +
>
> > Without wishing to weigh in on whether this caching is a good idea...
>
> Well, we already have a cache to bulk flush "consumed" skbs, although
> kmem_cache_free() is generally lighter than kmem_cache_alloc(), and
> a page frag cache to allocate skb->head that is also bulking the
> operations, since it contains a (compound) page with the size of
> min(SZ_32K, PAGE_SIZE).
> If they wouldn't give any visible boosts, I think they wouldn't hit
> mainline.
>
> > Wouldn't it be simpler, rather than having two separate "alloc" and "flush"
> >  caches, to have a single larger cache, such that whenever it becomes full
> >  we bulk flush the top half, and when it's empty we bulk alloc the bottom
> >  half?  That should mean fewer branches, fewer instructions etc. than
> >  having to decide which cache to act upon every time.
>
> I though about a unified cache, but couldn't decide whether to flush
> or to allocate heads and how much to process. Your suggestion answers
> these questions and generally seems great. I'll try that one, thanks!
>


The thing is : kmalloc() is supposed to have batches already, and nice
per-cpu caches.

This looks like an mm issue, are we sure we want to get over it ?

I would like a full analysis of why SLAB/SLUB does not work well for
your test workload.

More details, more numbers before we accept yet another
'networking optimization' adding more code to the 'fast' path.

More code means more latencies when all code needs to be brought up in
cpu caches.


Re: [RFC v3 2/2] vfio/platform: msi: add Broadcom platform devices

2021-01-12 Thread Auger Eric
Hi Vikas,

On 12/14/20 6:45 PM, Vikas Gupta wrote:
> Add msi support for Broadcom platform devices
> 
> Signed-off-by: Vikas Gupta 
> ---
>  drivers/vfio/platform/Kconfig |  1 +
>  drivers/vfio/platform/Makefile|  1 +
>  drivers/vfio/platform/msi/Kconfig |  9 
>  drivers/vfio/platform/msi/Makefile|  2 +
>  .../vfio/platform/msi/vfio_platform_bcmplt.c  | 49 +++
>  5 files changed, 62 insertions(+)
>  create mode 100644 drivers/vfio/platform/msi/Kconfig
>  create mode 100644 drivers/vfio/platform/msi/Makefile
>  create mode 100644 drivers/vfio/platform/msi/vfio_platform_bcmplt.c
what does plt mean?
> 
> diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig
> index dc1a3c44f2c6..7b8696febe61 100644
> --- a/drivers/vfio/platform/Kconfig
> +++ b/drivers/vfio/platform/Kconfig
> @@ -21,3 +21,4 @@ config VFIO_AMBA
> If you don't know what to do here, say N.
>  
>  source "drivers/vfio/platform/reset/Kconfig"
> +source "drivers/vfio/platform/msi/Kconfig"
> diff --git a/drivers/vfio/platform/Makefile b/drivers/vfio/platform/Makefile
> index 3f3a24e7c4ef..9ccdcdbf0e7e 100644
> --- a/drivers/vfio/platform/Makefile
> +++ b/drivers/vfio/platform/Makefile
> @@ -5,6 +5,7 @@ vfio-platform-y := vfio_platform.o
>  obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o
>  obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform-base.o
>  obj-$(CONFIG_VFIO_PLATFORM) += reset/
> +obj-$(CONFIG_VFIO_PLATFORM) += msi/
>  
>  vfio-amba-y := vfio_amba.o
>  
> diff --git a/drivers/vfio/platform/msi/Kconfig 
> b/drivers/vfio/platform/msi/Kconfig
> new file mode 100644
> index ..54d6b70e1e32
> --- /dev/null
> +++ b/drivers/vfio/platform/msi/Kconfig
> @@ -0,0 +1,9 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +config VFIO_PLATFORM_BCMPLT_MSI
> + tristate "MSI support for Broadcom platform devices"
> + depends on VFIO_PLATFORM && (ARCH_BCM_IPROC || COMPILE_TEST)
> + default ARCH_BCM_IPROC
> + help
> +   Enables the VFIO platform driver to handle msi for Broadcom devices
> +
> +   If you don't know what to do here, say N.
> diff --git a/drivers/vfio/platform/msi/Makefile 
> b/drivers/vfio/platform/msi/Makefile
> new file mode 100644
> index ..27422d45cecb
> --- /dev/null
> +++ b/drivers/vfio/platform/msi/Makefile
> @@ -0,0 +1,2 @@
> +# SPDX-License-Identifier: GPL-2.0
> +obj-$(CONFIG_VFIO_PLATFORM_BCMPLT_MSI) += vfio_platform_bcmplt.o
> diff --git a/drivers/vfio/platform/msi/vfio_platform_bcmplt.c 
> b/drivers/vfio/platform/msi/vfio_platform_bcmplt.c
> new file mode 100644
> index ..a074b5e92d77
> --- /dev/null
> +++ b/drivers/vfio/platform/msi/vfio_platform_bcmplt.c
> @@ -0,0 +1,49 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright 2020 Broadcom.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "../vfio_platform_private.h"
> +
> +#define RING_SIZE(64 << 10)
> +
> +#define RING_MSI_ADDR_LS 0x03c
> +#define RING_MSI_ADDR_MS 0x040
> +#define RING_MSI_DATA_VALUE  0x064
Those 3 defines would not be needed anymore with that implementation option.
> +
> +static u32 bcm_num_msi(struct vfio_platform_device *vdev)
> +{
> + struct vfio_platform_region *reg = &vdev->regions[0];
> +
> + return (reg->size / RING_SIZE);
> +}
> +
> +static struct vfio_platform_msi_node vfio_platform_bcmflexrm_msi_node = {
> + .owner = THIS_MODULE,
> + .compat = "brcm,iproc-flexrm-mbox",
> + .of_get_msi = bcm_num_msi,
> +};
> +
> +static int __init vfio_platform_bcmflexrm_msi_module_init(void)
> +{
> + __vfio_platform_register_msi(&vfio_platform_bcmflexrm_msi_node);
> +
> + return 0;
> +}
> +
> +static void __exit vfio_platform_bcmflexrm_msi_module_exit(void)
> +{
> + vfio_platform_unregister_msi("brcm,iproc-flexrm-mbox");
> +}
> +
> +module_init(vfio_platform_bcmflexrm_msi_module_init);
> +module_exit(vfio_platform_bcmflexrm_msi_module_exit);
One thing I would like to discuss with Alex.

As the reset module is mandated (except if reset_required is forced to
0), I am wondering if we shouldn't try to turn the reset module into a
"specialization" module and put the msi hooks there. I am afraid we may
end up having modules for each and every vfio platform feature
specialization. At the moment that's fully bearable but I can't predict
what's next.

As the mandated feature is the reset capability maybe we could just keep
the config/module name terminology, tune the kconfig help message to
mention the msi support in case of flex-rm?

What do you think?

Thanks

Eric




> +
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR("Broadcom");
> 



Re: [RFC v3 1/2] vfio/platform: add support for msi

2021-01-12 Thread Auger Eric
Hi Vikas,

On 1/5/21 6:53 AM, Vikas Gupta wrote:
> On Tue, Dec 22, 2020 at 10:57 PM Auger Eric  wrote:
>>
>> Hi Vikas,
>>
>> On 12/14/20 6:45 PM, Vikas Gupta wrote:
>>> MSI support for platform devices.The MSI block
>>> is added as an extended IRQ which exports caps
>>> VFIO_IRQ_INFO_CAP_TYPE and VFIO_IRQ_INFO_CAP_MSI_DESCS.
>>>
>>> Signed-off-by: Vikas Gupta 
>>> ---
>>>  drivers/vfio/platform/vfio_platform_common.c  | 179 +++-
>>>  drivers/vfio/platform/vfio_platform_irq.c | 260 +-
>>>  drivers/vfio/platform/vfio_platform_private.h |  32 +++
>>>  include/uapi/linux/vfio.h |  44 +++
>>>  4 files changed, 496 insertions(+), 19 deletions(-)
>>>
>>> diff --git a/drivers/vfio/platform/vfio_platform_common.c 
>>> b/drivers/vfio/platform/vfio_platform_common.c
>>> index fb4b385191f2..c936852f35d7 100644
>>> --- a/drivers/vfio/platform/vfio_platform_common.c
>>> +++ b/drivers/vfio/platform/vfio_platform_common.c
>>> @@ -16,6 +16,7 @@
>>>  #include 
>>>  #include 
>>>  #include 
>>> +#include 
>>>
>>>  #include "vfio_platform_private.h"
>>>
>>> @@ -26,6 +27,8 @@
>>>  #define VFIO_PLATFORM_IS_ACPI(vdev) ((vdev)->acpihid != NULL)
>>>
>>>  static LIST_HEAD(reset_list);
>>> +/* devices having MSI support */
>> nit: for devices using MSIs?
>>> +static LIST_HEAD(msi_list);
>>>  static DEFINE_MUTEX(driver_lock);
>>>
>>>  static vfio_platform_reset_fn_t vfio_platform_lookup_reset(const char 
>>> *compat,
>>> @@ -47,6 +50,25 @@ static vfio_platform_reset_fn_t 
>>> vfio_platform_lookup_reset(const char *compat,
>>>   return reset_fn;
>>>  }
>>>
>>> +static bool vfio_platform_lookup_msi(struct vfio_platform_device *vdev)
>>> +{
>>> + bool has_msi = false;
>>> + struct vfio_platform_msi_node *iter;
>>> +
>>> + mutex_lock(&driver_lock);
>>> + list_for_each_entry(iter, &msi_list, link) {
>>> + if (!strcmp(iter->compat, vdev->compat) &&
>>> + try_module_get(iter->owner)) {
>>> + vdev->msi_module = iter->owner;
>>> + vdev->of_get_msi = iter->of_get_msi;
>>> + has_msi = true;
>>> + break;
>>> + }
>>> + }
>>> + mutex_unlock(&driver_lock);
>>> + return has_msi;
>>> +}
>>> +
>>>  static int vfio_platform_acpi_probe(struct vfio_platform_device *vdev,
>>>   struct device *dev)
>>>  {
>>> @@ -126,6 +148,19 @@ static int vfio_platform_get_reset(struct 
>>> vfio_platform_device *vdev)
>>>   return vdev->of_reset ? 0 : -ENOENT;
>>>  }
>>>
>>> +static int vfio_platform_get_msi(struct vfio_platform_device *vdev)
>>> +{
>>> + bool has_msi;
>>> +
>>> + has_msi = vfio_platform_lookup_msi(vdev);
>>> + if (!has_msi) {
>>> + request_module("vfio-msi:%s", vdev->compat);
>>> + has_msi = vfio_platform_lookup_msi(vdev);
>>> + }
>>> +
>>> + return has_msi ? 0 : -ENOENT;
>>> +}
>>> +
>>>  static void vfio_platform_put_reset(struct vfio_platform_device *vdev)
>>>  {
>>>   if (VFIO_PLATFORM_IS_ACPI(vdev))
>>> @@ -135,6 +170,12 @@ static void vfio_platform_put_reset(struct 
>>> vfio_platform_device *vdev)
>>>   module_put(vdev->reset_module);
>>>  }
>>>
>>> +static void vfio_platform_put_msi(struct vfio_platform_device *vdev)
>>> +{
>>> + if (vdev->of_get_msi)
>>> + module_put(vdev->msi_module);
>>> +}
>>> +
>>>  static int vfio_platform_regions_init(struct vfio_platform_device *vdev)
>>>  {
>>>   int cnt = 0, i;
>>> @@ -343,9 +384,17 @@ static long vfio_platform_ioctl(void *device_data,
>>>
>>>   } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
>>>   struct vfio_irq_info info;
>>> + struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
>>> + struct vfio_irq_info_cap_msi *msi_info = NULL;
>>> + int ext_irq_index = vdev->num_irqs - vdev->num_ext_irqs;
>&g

Re: [PATCH net-next 0/5] skbuff: introduce skbuff_heads bulking and reusing

2021-01-12 Thread Eric Dumazet
On Mon, Jan 11, 2021 at 7:27 PM Alexander Lobakin  wrote:
>
> Inspired by cpu_map_kthread_run() and _kfree_skb_defer() logics.
>
> Currently, all sorts of skb allocation always do allocate
> skbuff_heads one by one via kmem_cache_alloc().
> On the other hand, we have percpu napi_alloc_cache to store
> skbuff_heads queued up for freeing and flush them by bulks.
>
> We can use this struct to cache and bulk not only freeing, but also
> allocation of new skbuff_heads, as well as to reuse cached-to-free
> heads instead of allocating the new ones.
> As accessing napi_alloc_cache implies NAPI softirq context, do this
> only for __napi_alloc_skb() and its derivatives (napi_alloc_skb()
> and napi_get_frags()). The rough amount of their call sites are 69,
> which is quite a number.
>
> iperf3 showed a nice bump from 910 to 935 Mbits while performing
> UDP VLAN NAT on 1.2 GHz MIPS board. The boost is likely to be
> way bigger on more powerful hosts and NICs with tens of Mpps.

What is the latency cost of these bulk allocations, and for TCP traffic
on which GRO is the norm ?

Adding caches is increasing cache foot print when the cache is populated.

I wonder if your iperf3 numbers are simply wrong because of lack of
GRO in this UDP VLAN NAT case.

We are adding a log of additional code, thus icache pressure, that
iperf3 tests can not really measure.

Most linus devices simply handle one packet at a time (one packet per interrupt)


Re: [PATCH 2/2] scsi: ufs: Remove unnecessary devm_kfree

2021-01-11 Thread Eric Biggers
On Mon, Jan 11, 2021 at 11:32:02PM +0100, Bean Huo wrote:
> From: Bean Huo 
> 
> The memory allocated with devm_kzalloc() is freed automatically
> no need to explicitly call devm_kfree.
> 
> Signed-off-by: Bean Huo 
> ---
>  drivers/scsi/ufs/ufshcd-crypto.c | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
> 
> diff --git a/drivers/scsi/ufs/ufshcd-crypto.c 
> b/drivers/scsi/ufs/ufshcd-crypto.c
> index 07310b12a5dc..ec80ec83cf85 100644
> --- a/drivers/scsi/ufs/ufshcd-crypto.c
> +++ b/drivers/scsi/ufs/ufshcd-crypto.c
> @@ -182,7 +182,7 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba 
> *hba)
>   err = blk_ksm_init(&hba->ksm,
>  hba->crypto_capabilities.config_count + 1);
>   if (err)
> - goto out_free_caps;
> + goto out;
>  
>   hba->ksm.ksm_ll_ops = ufshcd_ksm_ops;
>   /* UFS only supports 8 bytes for any DUN */
> @@ -208,8 +208,6 @@ int ufshcd_hba_init_crypto_capabilities(struct ufs_hba 
> *hba)
>  
>   return 0;
>  
> -out_free_caps:
> - devm_kfree(hba->dev, hba->crypto_cap_array);
>  out:
>   /* Indicate that init failed by clearing UFSHCD_CAP_CRYPTO */
>   hba->caps &= ~UFSHCD_CAP_CRYPTO;

Looks fine, feel free to add:

Reviewed-by: Eric Biggers 

I think this was here to free the memory in the case where the crypto support
gets disabled but the UFS host initialization still continues, so that the space
wouldn't be wasted.  But that's not what happens, as this is only reached on
ENOMEM which is a fatal error.

- Eric


Re: [PATCH v2 01/10] vfs: move cap_convert_nscap() call into vfs_setxattr()

2021-01-11 Thread Eric W. Biederman
Miklos Szeredi  writes:

> On Fri, Jan 01, 2021 at 11:35:16AM -0600, Eric W. Biederman wrote:
>> Miklos Szeredi  writes:
>> 
>> > cap_convert_nscap() does permission checking as well as conversion of the
>> > xattr value conditionally based on fs's user-ns.
>> >
>> > This is needed by overlayfs and probably other layered fs (ecryptfs) and is
>> > what vfs_foo() is supposed to do anyway.
>> 
>> Well crap.
>> 
>> I just noticed this and it turns out this change is wrong.
>> 
>> The problem is that it reads the rootid from the v3 fscap, using
>> current_user_ns() and then writes it using the sb->s_user_ns.
>> 
>> So any time the stacked filesystems sb->s_user_ns do not match or
>> current_user_ns does not match sb->s_user_ns this could be a problem.
>> 
>> In a stacked filesystem a second pass through vfs_setxattr will result
>> in the rootid being translated a second time (with potentially the wrong
>> namespaces).  I think because of the security checks a we won't write
>> something we shouldn't be able to write to the filesystem.  Still we
>> will be writing the wrong v3 fscap which can go quite badly.
>> 
>> This doesn't look terribly difficult to fix.
>> 
>> Probably convert this into a fs independent form using uids in
>> init_user_ns at input and have cap_convert_nscap convert the v3 fscap
>> into the filesystem dependent form.  With some way for stackable
>> filesystems to just skip converting it from the filesystem independent
>> format.
>> 
>> Uids in xattrs that are expected to go directly to disk, but aren't
>> always suitable for going directly to disk are tricky.
>
> I've been looking at this for a couple of days and can't say I clearly
> understand everything yet.
>
> For one: a v2 fscap is supposed to be equivalent to a v3 fscap with a rootid 
> of
> zero, right?

Yes.  This assumes that everything is translated into the uids of the
target filesystem.

> If so, why does cap_inode_getsecurity() treat them differently (v2 fscap
> succeeding unconditionally while v3 one being either converted to v2, rejected
> or left as v3 depending on current_user_ns())?

As I understand it v2 fscaps have always succeeded unconditionally.  The
only case I can see for a v2 fscap might not succeed when read is if the
filesystem is outside of the initial user namespace.


> Anyway, here's a patch that I think fixes getxattr() layering for
> security.capability.  Does basically what you suggested.  Slight change of
> semantics vs. v1 caps, not sure if that is still needed, getxattr()/setxattr()
> hasn't worked for these since the introduction of v3 in 4.14.
> Untested.

Taking a look.  The goal of change how these operate is to make it so
that layered filesystems can just pass through the data if they don't
want to change anything (even with the user namespaces of the
filesystems in question are different).

Feedback on the code below:
- cap_get should be in inode_operations like get_acl and set_acl.

- cap_get should return a cpu_vfs_cap_data.

  Which means that only make_kuid is needed when reading the cap from
  disk.

  Which means that except for the rootid_owns_currentns check (which
  needs to happen elsewhere) default_cap_get should be today's
  get_vfs_cap_from_disk.

- With the introduction of cap_get I believe commoncap should stop
  implementing the security_inode_getsecurity hook, and rather have
  getxattr observe is the file capability xatter and call the new
  vfs_cap_get then translate to a v2 or v3 cap as appropriate when
  returning the cap to userspace.

I think that would put the code on a solid comprehensible foundation.

Eric


> I still need to wrap my head around the permission requirements for the
> setxattr() case...
>
> Thanks,
> Miklos
>
> ---
>  fs/overlayfs/super.c   |   15 +++
>  include/linux/capability.h |2 
>  include/linux/fs.h |1 
>  security/commoncap.c   |  210 
> -
>  4 files changed, 132 insertions(+), 96 deletions(-)
>
> --- a/fs/overlayfs/super.c
> +++ b/fs/overlayfs/super.c
> @@ -395,6 +395,20 @@ static int ovl_remount(struct super_bloc
>   return ret;
>  }
>  
> +static int ovl_cap_get(struct dentry *dentry,
> +struct vfs_ns_cap_data *nscap)
> +{
> + int res;
> + const struct cred *old_cred;
> + struct dentry *realdentry = ovl_dentry_real(dentry);
> +
> + old_cred = ovl_override_creds(dentry->d_sb);
> + res = vfs_cap_get(realdentry, nscap);
> + revert_creds(old_cred);
> +
> + return res;
> +}
> +
>  static const struct 

Re: [RFC PATCH v2 0/8] Count rlimits in each user namespace

2021-01-11 Thread Eric W. Biederman
Linus Torvalds  writes:

> On Sun, Jan 10, 2021 at 9:34 AM Alexey Gladkov  
> wrote:
>>
>> To address the problem, we bind rlimit counters to each user namespace. The
>> result is a tree of rlimit counters with the biggest value at the root (aka
>> init_user_ns). The rlimit counter increment/decrement occurs in the current 
>> and
>> all parent user namespaces.
>
> I'm not seeing why this is necessary.
>
> Maybe it's the right approach, but none of the patches (or this cover
> letter email) really explain it to me.
>
> I understand why you might want the _limits_ themselves would form a
> tree like this - with the "master limit" limiting the limits in the
> user namespaces under it.
>
> But I don't understand why the _counts_ should do that. The 'struct
> user_struct' should be shared across even user namespaces for the same
> user.
>
> IOW, the very example of the problem you quote seems to argue against this:
>
>> For example, there are two containers (A and B) created by one user. The
>> container A sets RLIMIT_NPROC=1 and starts one process. Everything is fine, 
>> but
>> when container B tries to do the same it will fail because the number of
>> processes is counted globally for each user and user has one process already.
>
> Note how the problem was _not_ that the _count_ was global. That part
> was fine and all good.

The problem is fundamentally that the per process RLIMIT_NPROC was
compared against the user_struct->processes.

I have only heard the problem described but I believe it is either the
RLIMIT_NPROC test in fork or at the beginning of do_execveat_common that
is failing.

>From fs/exec.c line 1866:
>   /*
>* We move the actual failure in case of RLIMIT_NPROC excess from
>* set*uid() to execve() because too many poorly written programs
>* don't check setuid() return code.  Here we additionally recheck
>* whether NPROC limit is still exceeded.
>*/
>   if ((current->flags & PF_NPROC_EXCEEDED) &&
>   atomic_read(¤t_user()->processes) > rlimit(RLIMIT_NPROC)) {
>   retval = -EAGAIN;
>   goto out_ret;
>   }

>From fs/fork.c line 1966:
>   retval = -EAGAIN;
>   if (atomic_read(&p->real_cred->user->processes) >=
>   task_rlimit(p, RLIMIT_NPROC)) {
>   if (p->real_cred->user != INIT_USER &&
>   !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
>   goto bad_fork_free;
>   }
>   current->flags &= ~PF_NPROC_EXCEEDED;

In both the cases the RLIMIT_NPROC value comes from
task->signal->rlim[RLIMIT_NPROC] and the count of processes
comes from task->cred->user->processes.

> No, the problem was that the _limit_ in container A also ended up
> affecting container B.

The description I have is that both containers run the same service
that set it's RLIMIT_NPROC to 1 in both containers.

> So to me, that says that it would make sense to continue to use the
> resource counts in 'struct user_struct' (because if user A has a hard
> limit of X, then creating a new namespace shouldn't expand that
> limit), but then have the ability to make per-container changes to the
> resource limits (as long as they are within the bounds of the parent
> user namespace resource limit).

I agree that needs to work as well.

> Maybe there is some reason for this ucounts approach, but if so, I
> feel it was not explained at all.

Let me see if I can starte the example a litle more clearly.

Suppose there is a service never_fork that sets RLIMIT_NPROC runs as
never_fork_user and sets RLIMIT_NPROC to 1 in it's systemd service file.

Further suppose there is a user bob who has two containers he wants to
run: container1 and container2.  Both containers start the never_fork
service.

Bob first starts container1 and inside it the never_fork service starts.
Bob starts container2 and the never_fork service fails to start.

Does that make it clear that it is the count of the processes that would
exceed 1 if both instances of the never_fork service starts that would
be the problem?

Eric


Re: Malicious fs images was Re: ext4 regression in v5.9-rc2 from e7bfb5c9bb3d on ro fs with overlapped bitmaps

2021-01-11 Thread Eric Biggers
On Mon, Jan 11, 2021 at 10:51:20AM -0800, Darrick J. Wong wrote:
> On Sun, Jan 10, 2021 at 07:41:02PM +0100, Pavel Machek wrote:
> > Hi!
> > 
> > On Fri 2020-10-09 10:37:32, Theodore Y. Ts'o wrote:
> > > On Thu, Oct 08, 2020 at 03:22:59PM -0700, Josh Triplett wrote:
> > > > 
> > > > I wasn't trying to make a *new* general principle or policy. I was under
> > > > the impression that this *was* the policy, because it never occurred to
> > > > me that it could be otherwise. It seemed like a natural aspect of the
> > > > kernel/userspace boundary, to the point that the idea of it *not* being
> > > > part of the kernel's stability guarantees didn't cross my mind. 
> > > 
> > > >From our perspective (and Darrick and I discussed this on this week's
> > > ext4 video conference, so it represents the ext4 and xfs maintainer's
> > > position) is that the file system format is different.  First, the
> > > on-disk format is not an ABI, and it is several orders more complex
> > > than a system call interface.  Second, we make no guarantees about
> > > what the file system created by malicious tools will do.  For example,
> > > XFS developers reject bug reports from file system fuzzers, because
> 
> My recollection of this is quite different -- sybot was sending multiple
> zeroday exploits per week to the public xfs list, and nobody at Google
> was helping us to /fix/ those bugs.Each report took hours of developer
> time to extract the malicious image (because Dmitry couldn't figure out
> how to add that ability to syzbot) and syscall sequence from the
> reproducer program, plus whatever time it took to craft a patch, test
> it, and push it through review.
> 
> Dave and I complained to Dmitry about how the community had zero input
> into the rate at which syzbot was allowed to look for xfs bugs.  Nobody
> at Google would commit to helping fix any of the XFS bugs, and Dmitry
> would not give us better choices than (a) Google AI continuing to create
> zerodays and leaving the community to clean up the mess, or (b) shutting
> off syzbot entirely.  At the time I said I would accept letting syzbot
> run against xfs until it finds something, and turning it off until we
> resolve the issue.  That wasn't acceptable, because (I guess) nobody at
> Google wants to put /any/ staff time into XFS at all.
> 
> TLDR: XFS /does/ accept bug reports about fuzzed and broken images.
> What we don't want is make-work Google AIs spraying zeroday code in
> public places and the community developers having to clean up the mess.

syzkaller is an open source project that implements a coverage-guided fuzzer for
multiple operating system kernels; it's not "Google AI".

Anyone can run syzkaller (either by itself, or as part of a syzbot instance) and
find the same bugs.

- Eric


Re: Re: [PATCH] evm: Fix memleak in init_desc

2021-01-09 Thread Eric Biggers
On Sun, Jan 10, 2021 at 01:27:09PM +0800, dinghao@zju.edu.cn wrote:
> > On Sat, Jan 09, 2021 at 07:33:05PM +0800, Dinghao Liu wrote:
> > > When kmalloc() fails, tmp_tfm allocated by
> > > crypto_alloc_shash() has not been freed, which
> > > leads to memleak.
> > > 
> > > Fixes: d46eb3699502b ("evm: crypto hash replaced by shash")
> > > Signed-off-by: Dinghao Liu 
> > > ---
> > >  security/integrity/evm/evm_crypto.c | 9 +++--
> > >  1 file changed, 7 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/security/integrity/evm/evm_crypto.c 
> > > b/security/integrity/evm/evm_crypto.c
> > > index 168c3b78ac47..39fb31a638ac 100644
> > > --- a/security/integrity/evm/evm_crypto.c
> > > +++ b/security/integrity/evm/evm_crypto.c
> > > @@ -73,7 +73,7 @@ static struct shash_desc *init_desc(char type, uint8_t 
> > > hash_algo)
> > >  {
> > >   long rc;
> > >   const char *algo;
> > > - struct crypto_shash **tfm, *tmp_tfm;
> > > + struct crypto_shash **tfm, *tmp_tfm = NULL;
> > >   struct shash_desc *desc;
> > >  
> > >   if (type == EVM_XATTR_HMAC) {
> > > @@ -118,13 +118,18 @@ static struct shash_desc *init_desc(char type, 
> > > uint8_t hash_algo)
> > >  alloc:
> > >   desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(*tfm),
> > >   GFP_KERNEL);
> > > - if (!desc)
> > > + if (!desc) {
> > > + if (tmp_tfm)
> > > + crypto_free_shash(tmp_tfm);
> > >   return ERR_PTR(-ENOMEM);
> > > + }
> > >  
> > >   desc->tfm = *tfm;
> > >  
> > >   rc = crypto_shash_init(desc);
> > >   if (rc) {
> > > + if (tmp_tfm)
> > > + crypto_free_shash(tmp_tfm);
> > >   kfree(desc);
> > >   return ERR_PTR(rc);
> > >   }
> > 
> > There's no need to check for NULL before calling crypto_free_shash().
> > 
> 
> I find there is a crypto_shash_tfm() in the definition of 
> crypto_free_shash(). Will this lead to null pointer dereference
> when we use it to free a NULL pointer?
> 

No.  It does &tfm->base, not tfm->base.

- Eric


Re: [PATCH] x86/vm86/32: Remove VM86_SCREEN_BITMAP support

2021-01-09 Thread Eric W. Biederman
Andy Lutomirski  writes:

> The implementation was rather buggy.  It unconditionally marked PTEs
> read-only, even for VM_SHARED mappings.  I'm not sure whether this is
> actually a problem, but it certainly seems unwise.  More importantly, it
> released the mmap lock before flushing the TLB, which could allow a racing
> CoW operation to falsely believe that the underlying memory was not
> writable.
>
> I can't find any users at all of this mechanism, so just remove it.

In another age this was used by dosemu.  Have you looked at dosemu to
see if it still uses this support (on 32bit where dosemu can use vm86)?

It may still be a valid removal target I just wanted to point out what
the original user was.

Eric

> Cc: Andrea Arcangeli 
> Cc: Linux-MM 
> Cc: Jason Gunthorpe 
> Cc: x...@kernel.org
> Cc: Linus Torvalds 
> Cc: Matthew Wilcox 
> Cc: Jann Horn 
> Cc: Jan Kara 
> Cc: Yu Zhao 
> Cc: Peter Xu 
> Signed-off-by: Andy Lutomirski 
> ---
>  arch/x86/include/uapi/asm/vm86.h |  2 +-
>  arch/x86/kernel/vm86_32.c| 55 ++--
>  2 files changed, 10 insertions(+), 47 deletions(-)
>
> diff --git a/arch/x86/include/uapi/asm/vm86.h 
> b/arch/x86/include/uapi/asm/vm86.h
> index d2ee4e307ef8..50004fb4590d 100644
> --- a/arch/x86/include/uapi/asm/vm86.h
> +++ b/arch/x86/include/uapi/asm/vm86.h
> @@ -106,7 +106,7 @@ struct vm86_struct {
>  /*
>   * flags masks
>   */
> -#define VM86_SCREEN_BITMAP   0x0001
> +#define VM86_SCREEN_BITMAP   0x0001/* no longer supported */
>  
>  struct vm86plus_info_struct {
>   unsigned long force_return_for_pic:1;
> diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
> index 764573de3996..28b9e8d511e1 100644
> --- a/arch/x86/kernel/vm86_32.c
> +++ b/arch/x86/kernel/vm86_32.c
> @@ -160,49 +160,6 @@ void save_v86_state(struct kernel_vm86_regs *regs, int 
> retval)
>   do_exit(SIGSEGV);
>  }
>  
> -static void mark_screen_rdonly(struct mm_struct *mm)
> -{
> - struct vm_area_struct *vma;
> - spinlock_t *ptl;
> - pgd_t *pgd;
> - p4d_t *p4d;
> - pud_t *pud;
> - pmd_t *pmd;
> - pte_t *pte;
> - int i;
> -
> - mmap_write_lock(mm);
> - pgd = pgd_offset(mm, 0xA);
> - if (pgd_none_or_clear_bad(pgd))
> - goto out;
> - p4d = p4d_offset(pgd, 0xA);
> - if (p4d_none_or_clear_bad(p4d))
> - goto out;
> - pud = pud_offset(p4d, 0xA);
> - if (pud_none_or_clear_bad(pud))
> - goto out;
> - pmd = pmd_offset(pud, 0xA);
> -
> - if (pmd_trans_huge(*pmd)) {
> - vma = find_vma(mm, 0xA);
> - split_huge_pmd(vma, pmd, 0xA);
> - }
> - if (pmd_none_or_clear_bad(pmd))
> - goto out;
> - pte = pte_offset_map_lock(mm, pmd, 0xA, &ptl);
> - for (i = 0; i < 32; i++) {
> - if (pte_present(*pte))
> - set_pte(pte, pte_wrprotect(*pte));
> - pte++;
> - }
> - pte_unmap_unlock(pte, ptl);
> -out:
> - mmap_write_unlock(mm);
> - flush_tlb_mm_range(mm, 0xA, 0xA + 32*PAGE_SIZE, PAGE_SHIFT, 
> false);
> -}
> -
> -
> -
>  static int do_vm86_irq_handling(int subfunction, int irqnumber);
>  static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
>  
> @@ -282,6 +239,15 @@ static long do_sys_vm86(struct vm86plus_struct __user 
> *user_vm86, bool plus)
>   offsetof(struct vm86_struct, int_revectored)))
>   return -EFAULT;
>  
> +
> + /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */
> + if (v.flags & VM86_SCREEN_BITMAP) {
> + char comm[TASK_COMM_LEN];
> +
> + pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no 
> longer supported\n", get_task_comm(comm, current);
> + return -EINVAL;
> + }
> +
>   memset(&vm86regs, 0, sizeof(vm86regs));
>  
>   vm86regs.pt.bx = v.regs.ebx;
> @@ -370,9 +336,6 @@ static long do_sys_vm86(struct vm86plus_struct __user 
> *user_vm86, bool plus)
>   update_task_stack(tsk);
>   preempt_enable();
>  
> - if (vm86->flags & VM86_SCREEN_BITMAP)
> - mark_screen_rdonly(tsk->mm);
> -
>   memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
>   return regs->ax;
>  }


Re: KMSAN: uninit-value in __crypto_memneq (2)

2021-01-09 Thread Eric Biggers
+Jason, since this looks WireGuard-related.

On Sat, Jan 09, 2021 at 05:05:24AM -0800, syzbot wrote:
> Hello,
> 
> syzbot found the following issue on:
> 
> HEAD commit:73d62e81 kmsan: random: prevent boot-time reports in _mix_..
> git tree:   https://github.com/google/kmsan.git master
> console output: https://syzkaller.appspot.com/x/log.txt?x=142ab9c0d0
> kernel config:  https://syzkaller.appspot.com/x/.config?x=2cdf4151c9653e32
> dashboard link: https://syzkaller.appspot.com/bug?extid=e0f501056b282add58a6
> compiler:   clang version 11.0.0 
> (https://github.com/llvm/llvm-project.git 
> ca2dcbd030eadbf0aa9b660efe864ff08af6e18b)
> 
> Unfortunately, I don't have any reproducer for this issue yet.
> 
> IMPORTANT: if you fix the issue, please add the following tag to the commit:
> Reported-by: syzbot+e0f501056b282add5...@syzkaller.appspotmail.com
> 
> =
> BUG: KMSAN: uninit-value in __crypto_memneq_16 crypto/memneq.c:99 [inline]
> BUG: KMSAN: uninit-value in __crypto_memneq+0x42c/0x470 crypto/memneq.c:161
> CPU: 0 PID: 20526 Comm: kworker/0:3 Not tainted 5.10.0-rc4-syzkaller #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
> Google 01/01/2011
> Workqueue: wg-crypt-wg1 wg_packet_decrypt_worker
> Call Trace:
>  __dump_stack lib/dump_stack.c:77 [inline]
>  dump_stack+0x21c/0x280 lib/dump_stack.c:118
>  kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:118
>  __msan_warning+0x5f/0xa0 mm/kmsan/kmsan_instr.c:197
>  __crypto_memneq_16 crypto/memneq.c:99 [inline]
>  __crypto_memneq+0x42c/0x470 crypto/memneq.c:161
>  crypto_memneq include/crypto/algapi.h:277 [inline]
>  chacha20poly1305_crypt_sg_inplace+0x1662/0x1cd0 
> lib/crypto/chacha20poly1305.c:311
>  chacha20poly1305_decrypt_sg_inplace+0x179/0x1d0 
> lib/crypto/chacha20poly1305.c:351
>  decrypt_packet drivers/net/wireguard/receive.c:284 [inline]
>  wg_packet_decrypt_worker+0x9cf/0x17d0 drivers/net/wireguard/receive.c:509
>  process_one_work+0x121c/0x1fc0 kernel/workqueue.c:2272
>  worker_thread+0x10cc/0x2740 kernel/workqueue.c:2418
>  kthread+0x51c/0x560 kernel/kthread.c:292
>  ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:296
> 
> Uninit was stored to memory at:
>  kmsan_save_stack_with_flags mm/kmsan/kmsan.c:121 [inline]
>  kmsan_internal_chain_origin+0xad/0x130 mm/kmsan/kmsan.c:289
>  __msan_chain_origin+0x57/0xa0 mm/kmsan/kmsan_instr.c:147
>  put_unaligned_le64 include/linux/unaligned/access_ok.h:50 [inline]
>  poly1305_core_emit+0x625/0x6a0 lib/crypto/poly1305-donna64.c:182
>  poly1305_final_generic+0xe2/0x280 lib/crypto/poly1305.c:71
>  poly1305_final include/crypto/poly1305.h:94 [inline]
>  chacha20poly1305_crypt_sg_inplace+0x15cf/0x1cd0 
> lib/crypto/chacha20poly1305.c:310
>  chacha20poly1305_decrypt_sg_inplace+0x179/0x1d0 
> lib/crypto/chacha20poly1305.c:351
>  decrypt_packet drivers/net/wireguard/receive.c:284 [inline]
>  wg_packet_decrypt_worker+0x9cf/0x17d0 drivers/net/wireguard/receive.c:509
>  process_one_work+0x121c/0x1fc0 kernel/workqueue.c:2272
>  worker_thread+0x10cc/0x2740 kernel/workqueue.c:2418
>  kthread+0x51c/0x560 kernel/kthread.c:292
>  ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:296
> 
> Uninit was stored to memory at:
>  kmsan_save_stack_with_flags mm/kmsan/kmsan.c:121 [inline]
>  kmsan_internal_chain_origin+0xad/0x130 mm/kmsan/kmsan.c:289
>  __msan_chain_origin+0x57/0xa0 mm/kmsan/kmsan_instr.c:147
>  poly1305_core_blocks+0x8f4/0x940 lib/crypto/poly1305-donna64.c:107
>  poly1305_update_generic+0x1a7/0x5a0 lib/crypto/poly1305.c:49
>  poly1305_update include/crypto/poly1305.h:83 [inline]
>  chacha20poly1305_crypt_sg_inplace+0x1496/0x1cd0 
> lib/crypto/chacha20poly1305.c:302
>  chacha20poly1305_decrypt_sg_inplace+0x179/0x1d0 
> lib/crypto/chacha20poly1305.c:351
>  decrypt_packet drivers/net/wireguard/receive.c:284 [inline]
>  wg_packet_decrypt_worker+0x9cf/0x17d0 drivers/net/wireguard/receive.c:509
>  process_one_work+0x121c/0x1fc0 kernel/workqueue.c:2272
>  worker_thread+0x10cc/0x2740 kernel/workqueue.c:2418
>  kthread+0x51c/0x560 kernel/kthread.c:292
>  ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:296
> 
> Uninit was stored to memory at:
>  kmsan_save_stack_with_flags mm/kmsan/kmsan.c:121 [inline]
>  kmsan_internal_chain_origin+0xad/0x130 mm/kmsan/kmsan.c:289
>  __msan_chain_origin+0x57/0xa0 mm/kmsan/kmsan_instr.c:147
>  poly1305_core_blocks+0x8f4/0x940 lib/crypto/poly1305-donna64.c:107
>  poly1305_update_generic+0x1a7/0x5a0 lib/crypto/poly1305.c:49
>  poly1305_update include/crypto/poly1305.h:83 [inline]
>  chacha20poly1305_crypt_sg_inplace+0xb4d/0x1cd0 
> lib/crypto/chacha20poly1305.c:263
>  chacha20poly1305_decrypt_sg_inplace+0x179/0x1d0 
> lib/crypto/chacha20poly1305.c:351
>  decrypt_packet drivers/net/wireguard/receive.c:284 [inline]
>  wg_packet_decrypt_worker+0x9cf/0x17d0 drivers/net/wireguard/receive.c:509
>  process_one_work+0x121c/0x1fc0 kernel/workqueue.c:2272
>  worker_thread+0x10cc/0x2740 kernel/workqueue.c:2418
>

Re: [PATCH] evm: Fix memleak in init_desc

2021-01-09 Thread Eric Biggers
On Sat, Jan 09, 2021 at 07:33:05PM +0800, Dinghao Liu wrote:
> When kmalloc() fails, tmp_tfm allocated by
> crypto_alloc_shash() has not been freed, which
> leads to memleak.
> 
> Fixes: d46eb3699502b ("evm: crypto hash replaced by shash")
> Signed-off-by: Dinghao Liu 
> ---
>  security/integrity/evm/evm_crypto.c | 9 +++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/security/integrity/evm/evm_crypto.c 
> b/security/integrity/evm/evm_crypto.c
> index 168c3b78ac47..39fb31a638ac 100644
> --- a/security/integrity/evm/evm_crypto.c
> +++ b/security/integrity/evm/evm_crypto.c
> @@ -73,7 +73,7 @@ static struct shash_desc *init_desc(char type, uint8_t 
> hash_algo)
>  {
>   long rc;
>   const char *algo;
> - struct crypto_shash **tfm, *tmp_tfm;
> + struct crypto_shash **tfm, *tmp_tfm = NULL;
>   struct shash_desc *desc;
>  
>   if (type == EVM_XATTR_HMAC) {
> @@ -118,13 +118,18 @@ static struct shash_desc *init_desc(char type, uint8_t 
> hash_algo)
>  alloc:
>   desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(*tfm),
>   GFP_KERNEL);
> - if (!desc)
> + if (!desc) {
> + if (tmp_tfm)
> + crypto_free_shash(tmp_tfm);
>   return ERR_PTR(-ENOMEM);
> + }
>  
>   desc->tfm = *tfm;
>  
>   rc = crypto_shash_init(desc);
>   if (rc) {
> + if (tmp_tfm)
> + crypto_free_shash(tmp_tfm);
>   kfree(desc);
>   return ERR_PTR(rc);
>   }

There's no need to check for NULL before calling crypto_free_shash().

- Eric


Re: Aarch64 EXT4FS inode checksum failures - seems to be weak memory ordering issues

2021-01-07 Thread Eric Biggers
On Thu, Jan 07, 2021 at 10:14:46PM +, Russell King - ARM Linux admin wrote:
> On Thu, Jan 07, 2021 at 10:48:05PM +0100, Arnd Bergmann wrote:
> > On Thu, Jan 7, 2021 at 5:27 PM Theodore Ts'o  wrote:
> > >
> > > On Thu, Jan 07, 2021 at 01:37:47PM +, Russell King - ARM Linux admin 
> > > wrote:
> > > > > The gcc bugzilla mentions backports into gcc-linaro, but I do not see
> > > > > them in my git history.
> > > >
> > > > So, do we raise the minimum gcc version for the kernel as a whole to 5.1
> > > > or just for aarch64?
> > >
> > > Russell, Arnd, thanks so much for tracking down the root cause of the
> > > bug!
> > 
> > There is one more thing that I wondered about when looking through
> > the ext4 code: Should it just call the crc32c_le() function directly
> > instead of going through the crypto layer? It seems that with Ard's
> > rework from 2018, that can just call the underlying architecture specific
> > implementation anyway.
> 
> Yes, I've been wondering about that too. To me, it looks like the
> ext4 code performs a layering violation by going "under the covers"
> - there are accessor functions to set the CRC and retrieve it. ext4
> instead just makes the assumption that the CRC value is stored after
> struct shash_desc. Especially as the crypto/crc32c code references
> the value using:
> 
>   struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
> 
> Not even crypto drivers are allowed to assume that desc+1 is where
> the CRC is stored.

It violates how the shash API is meant to be used in general, but there is a
test that enforces that the shash_desc_ctx for crc32c must be just the single
u32 crc value.  See alg_test_crc32c() in crypto/testmgr.c.  So it's apparently
intended to work.

> 
> However, struct shash_desc is already 128 bytes in size on aarch64,

Ard Biesheuvel recently sent a patch to reduce the alignment of struct
shash_desc to ARCH_SLAB_MINALIGN
(https://lkml.kernel.org/linux-crypto/20210107124128.19791-1-a...@kernel.org/),
since apparently most of the bloat is from alignment for DMA, which isn't
necessary.  I think that reduces the size by a lot on arm64.

> and the proper way of doing it via SHASH_DESC_ON_STACK() is overkill,
> being strangely 2 * sizeof(struct shash_desc) + 360 (which looks like
> another bug to me!)

Are you referring to the '2 * sizeof(struct shash_desc)' rather than just
'sizeof(struct shash_desc)'?  As mentioned in the comment above
HASH_MAX_DESCSIZE, there can be a nested shash_desc due to HMAC.
So I believe the value is correct.

> So, I agree with you wrt crc32c_le(), especially as it would be more
> efficient, and as the use of crc32c is already hard coded in the ext4
> code - not only with crypto_alloc_shash("crc32c", 0, 0) but also with
> the fixed-size structure in ext4_chksum().
> 
> However, it's ultimately up to the ext4 maintainers to decide.

As I mentioned in my other response, crc32c_le() isn't a proper library API
(like some of the newer lib/crypto/ stuff) but rather just a wrapper for the
shash API, and it doesn't handle modules being dynamically loaded/unloaded.
So switching to it may cause a performance regression.

What I'd recommend is making crc32c_le() able to call architecture-speccific
implementations directly, similar to blake2s() and chacha20() in lib/crypto/.
Then there would be no concern about when modules get loaded, etc...

- Eric


Re: Aarch64 EXT4FS inode checksum failures - seems to be weak memory ordering issues

2021-01-07 Thread Eric Biggers
On Thu, Jan 07, 2021 at 10:48:05PM +0100, Arnd Bergmann wrote:
> On Thu, Jan 7, 2021 at 5:27 PM Theodore Ts'o  wrote:
> >
> > On Thu, Jan 07, 2021 at 01:37:47PM +, Russell King - ARM Linux admin 
> > wrote:
> > > > The gcc bugzilla mentions backports into gcc-linaro, but I do not see
> > > > them in my git history.
> > >
> > > So, do we raise the minimum gcc version for the kernel as a whole to 5.1
> > > or just for aarch64?
> >
> > Russell, Arnd, thanks so much for tracking down the root cause of the
> > bug!
> 
> There is one more thing that I wondered about when looking through
> the ext4 code: Should it just call the crc32c_le() function directly
> instead of going through the crypto layer? It seems that with Ard's
> rework from 2018, that can just call the underlying architecture specific
> implementation anyway.
> 

It looks like that would work, although note that crc32c_le() uses the shash API
too, so it isn't any more "direct" than what ext4 does now.

Also, a potential issue is that the implementation of crc32c that crc32c_le()
uses might be chosen too early if the architecture-specific implementation of
crc32c is compiled as a module (e.g. crc32c-intel.ko).  There are two ways this
could be fixed -- either by making it a proper library API like blake2s() that
can call the architecture-specific code directly, or by reconfiguring things
when a new crypto module is loaded (like what lib/crc-t10dif.c does).

Until one of those is done, switching to crc32c_le() might cause performance
regressions.

- Eric


Re: [PATCH 3/5] crypto: add RFC5869 HKDF

2021-01-07 Thread Eric Biggers
On Thu, Jan 07, 2021 at 08:53:15AM +0100, Stephan Mueller wrote:
> > 
> > > RFC5869
> > > allows two optional parameters to be provided to the extract operation:
> > > the salt and additional information. Both are to be provided with the
> > > seed parameter where the salt is the first entry of the seed parameter
> > > and all subsequent entries are handled as additional information. If
> > > the caller intends to invoke the HKDF without salt, it has to provide a
> > > NULL/0 entry as first entry in seed.
> > 
> > Where does "additional information" for extract come from?  RFC 5869 has:
> > 
> > HKDF-Extract(salt, IKM) -> PRK
> > 
> > Inputs:
> >   salt optional salt value (a non-secret random value);
> >    if not provided, it is set to a string of HashLen
> > zeros.
> >   IKM  input keying material
> > 
> > There's no "additional information".
> 
> I used the terminology from SP800-108. I will update the description
> accordingly. 

For HKDF, it would be better to stick to the terminology used in RFC 5869
(https://tools.ietf.org/html/rfc5869), as generally that's what people are most
familiar with for HKDF.  It also matches the HKDF paper
(https://eprint.iacr.org/2010/264.pdf) more closely.

- Eric


Re: [PATCH 5/5] fs: use HKDF implementation from kernel crypto API

2021-01-07 Thread Eric Biggers
On Thu, Jan 07, 2021 at 08:49:52AM +0100, Stephan Mueller wrote:
> > > -int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
> > > +int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, u8 *master_key,
> > >   unsigned int master_key_size);
> > 
> > It shouldn't be necessary to remove const here.
> 
> Unfortunately it is when adding the pointer to struct kvec
> > 
> > >  
> > >  /*
> > > @@ -323,7 +323,7 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const
> > > u8 *master_key,
> > >  #define HKDF_CONTEXT_INODE_HASH_KEY7 /* info=   */
> > >  
> > >  int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
> > > -   const u8 *info, unsigned int infolen,
> > > +   u8 *info, unsigned int infolen,
> > > u8 *okm, unsigned int okmlen);
> > 
> > Likewise.  In fact some callers rely on 'info' not being modified.
> 
> Same here.

If the HKDF API will have a quirk like this, it's better not to "leak" it into
the prototypes of these fscrypt functions.  Just add the needed casts in
fscrypt_init_hkdf() and fscrypt_hkdf_expand().

> > > -   err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk));
> > > +   err = crypto_hkdf_setkey(hmac_tfm, seed, ARRAY_SIZE(seed));
> > > if (err)
> > > goto err_free_tfm;
> > 
> > It's weird that the salt and key have to be passed in a kvec.
> > Why not just have normal function parameters like:
> > 
> > int crypto_hkdf_setkey(struct crypto_shash *hmac_tfm,
> >    const u8 *key, size_t keysize,
> >    const u8 *salt, size_t saltsize);
> 
> I wanted to have an identical interface for all types of KDFs to allow turning
> them into a template eventually. For example, SP800-108 KDFs only have one
> parameter. Hence the use of a kvec.

But the API being provided is a library function specifically for HKDF.
So there's no need to make it conform to some other API.

- Eric


Re: [PATCH 3/5] crypto: add RFC5869 HKDF

2021-01-06 Thread Eric Biggers
On Mon, Jan 04, 2021 at 10:49:13PM +0100, Stephan Müller wrote:
> RFC5869 specifies an extract and expand two-step key derivation
> function. The HKDF implementation is provided as a service function that
> operates on a caller-provided HMAC cipher handle.

HMAC isn't a "cipher".

> The extract function is invoked via the crypto_hkdf_setkey call.

Any reason not to call this crypto_hkdf_extract(), to match the specification?

> RFC5869
> allows two optional parameters to be provided to the extract operation:
> the salt and additional information. Both are to be provided with the
> seed parameter where the salt is the first entry of the seed parameter
> and all subsequent entries are handled as additional information. If
> the caller intends to invoke the HKDF without salt, it has to provide a
> NULL/0 entry as first entry in seed.

Where does "additional information" for extract come from?  RFC 5869 has:

HKDF-Extract(salt, IKM) -> PRK

Inputs:
  salt optional salt value (a non-secret random value);
   if not provided, it is set to a string of HashLen zeros.
  IKM  input keying material

There's no "additional information".

> 
> The expand function is invoked via the crypto_hkdf_generate and can be
> invoked multiple times. This function allows the caller to provide a
> context for the key derivation operation. As specified in RFC5869, it is
> optional. In case such context is not provided, the caller must provide
> NULL / 0 for the info / info_nvec parameters.

Any reason not to call this crypto_hkdf_expand() to match the specification?

- Eric


Re: [PATCH 5/5] fs: use HKDF implementation from kernel crypto API

2021-01-06 Thread Eric Biggers
struct kvec info_iov[] = { {
> + .iov_base = "fscrypt\0",
> + .iov_len = 8,
> + }, {
> + .iov_base = &context,
> + .iov_len = 1,
> + }, {
> + .iov_base = info,
> + .iov_len = infolen,
> + } };
> + int err = crypto_hkdf_generate(hkdf->hmac_tfm,
> +info_iov, ARRAY_SIZE(info_iov),
> +okm, okmlen);
>  
> - err = crypto_shash_init(desc);
> - if (err)
> - goto out;
> -
> - if (prev) {
> - err = crypto_shash_update(desc, prev, HKDF_HASHLEN);
> - if (err)
> - goto out;
> - }
> -
> - err = crypto_shash_update(desc, prefix, sizeof(prefix));
> - if (err)
> - goto out;
> -
> - err = crypto_shash_update(desc, info, infolen);
> - if (err)
> - goto out;
> -
> - BUILD_BUG_ON(sizeof(counter) != 1);
> - if (okmlen - i < HKDF_HASHLEN) {
> - err = crypto_shash_finup(desc, &counter, 1, tmp);
> - if (err)
> - goto out;
> - memcpy(&okm[i], tmp, okmlen - i);
> - memzero_explicit(tmp, sizeof(tmp));
> - } else {
> - err = crypto_shash_finup(desc, &counter, 1, &okm[i]);
> - if (err)
> - goto out;
> - }
> - counter++;
> - prev = &okm[i];
> - }
> - err = 0;
> -out:
>   if (unlikely(err))
>   memzero_explicit(okm, okmlen); /* so caller doesn't need to */
> - shash_desc_zero(desc);

Shouldn't crypto_hkdf_generate() handle the above memzero_explicit() of the
output buffer on error, so that all callers don't need to do it?

- Eric


Re: [PATCH 0/5] Add KDF implementations to crypto API

2021-01-06 Thread Eric Biggers
On Wed, Jan 06, 2021 at 10:59:24PM -0800, Eric Biggers wrote:
> On Thu, Jan 07, 2021 at 07:37:05AM +0100, Stephan Mueller wrote:
> > Am Montag, dem 04.01.2021 um 14:20 -0800 schrieb Eric Biggers:
> > > On Mon, Jan 04, 2021 at 10:45:57PM +0100, Stephan Müller wrote:
> > > > The HKDF addition is used to replace the implementation in the 
> > > > filesystem
> > > > crypto extension. This code was tested by using an EXT4 encrypted file
> > > > system that was created and contains files written to by the current
> > > > implementation. Using the new implementation a successful read of the
> > > > existing files was possible and new files / directories were created
> > > > and read successfully. These newly added file system objects could be
> > > > successfully read using the current code. Yet if there is a test suite
> > > > to validate whether the invokcation of the HKDF calculates the same
> > > > result as the existing implementation, I would be happy to validate
> > > > the implementation accordingly.
> > > 
> > > See https://www.kernel.org/doc/html/latest/filesystems/fscrypt.html#tests
> > > for how to run the fscrypt tests.  'kvm-xfstests -c ext4 generic/582' 
> > > should
> > > be
> > > enough for this, though you could run all the tests if you want.
> > 
> > I ran the $(kvm-xfstests -c encrypt -g auto) on 5.11-rc2 with and without my
> > HKDF changes. I.e. the testing shows the same results for both kernels which
> > seems to imply that my HKDF changes do not change the behavior.
> > 
> > I get the following errors in both occasions - let me know if I should dig a
> > bit more.
> 
> The command you ran runs almost all xfstests with the test_dummy_encryption
> mount option enabled, which is different from running the encryption tests --
> and in fact it skips the real encryption tests, so it doesn't test the
> correctness of HKDF at all.  It looks like you saw some unrelated test 
> failures.
> Sorry if I wasn't clear -- by "all tests" I meant all encryption tests, i.e.
> 'kvm-xfstests -c ext4 -g encrypt'.  Also, even the single test generic/582
> should be sufficient to test HKDF, as I mentioned.
> 

I just did it myself and the tests pass.

- Eric


Re: [PATCH 0/5] Add KDF implementations to crypto API

2021-01-06 Thread Eric Biggers
On Thu, Jan 07, 2021 at 07:37:05AM +0100, Stephan Mueller wrote:
> Am Montag, dem 04.01.2021 um 14:20 -0800 schrieb Eric Biggers:
> > On Mon, Jan 04, 2021 at 10:45:57PM +0100, Stephan Müller wrote:
> > > The HKDF addition is used to replace the implementation in the filesystem
> > > crypto extension. This code was tested by using an EXT4 encrypted file
> > > system that was created and contains files written to by the current
> > > implementation. Using the new implementation a successful read of the
> > > existing files was possible and new files / directories were created
> > > and read successfully. These newly added file system objects could be
> > > successfully read using the current code. Yet if there is a test suite
> > > to validate whether the invokcation of the HKDF calculates the same
> > > result as the existing implementation, I would be happy to validate
> > > the implementation accordingly.
> > 
> > See https://www.kernel.org/doc/html/latest/filesystems/fscrypt.html#tests
> > for how to run the fscrypt tests.  'kvm-xfstests -c ext4 generic/582' should
> > be
> > enough for this, though you could run all the tests if you want.
> 
> I ran the $(kvm-xfstests -c encrypt -g auto) on 5.11-rc2 with and without my
> HKDF changes. I.e. the testing shows the same results for both kernels which
> seems to imply that my HKDF changes do not change the behavior.
> 
> I get the following errors in both occasions - let me know if I should dig a
> bit more.

The command you ran runs almost all xfstests with the test_dummy_encryption
mount option enabled, which is different from running the encryption tests --
and in fact it skips the real encryption tests, so it doesn't test the
correctness of HKDF at all.  It looks like you saw some unrelated test failures.
Sorry if I wasn't clear -- by "all tests" I meant all encryption tests, i.e.
'kvm-xfstests -c ext4 -g encrypt'.  Also, even the single test generic/582
should be sufficient to test HKDF, as I mentioned.

- Eric


Re: in_compat_syscall() on x86

2021-01-05 Thread Eric W. Biederman
Al Viro  writes:

> On Mon, Jan 04, 2021 at 06:47:38PM -0600, Eric W. Biederman wrote:
>> >> It is defined in the Ubuntu kernel configs I've got lurking:
>> >> Both 3.8.0-19_generic (Ubuntu 13.04) and 5.4.0-56_generic (probably 
>> >> 20.04).
>> >> Which is probably why it is in my test builds (I've just cut out
>> >> a lot of modules).
>> 
>> Interesting.  That sounds like something a gentle prod to the Ubuntu
>> kernel team might get them to disable.  Especially if there are not any
>> x32 binaries in sight.
>
> What for?

Any code that no one uses is better off disabled or deleted.

There are maintenance and support costs to such code as they cause extra
work when maintaining the kernel, and because the code is practically
never tested inevitably bugs some of which turn into security issues
slip through.

Maybe I am wrong and there are interesting users of x32.  All I remember
is that last time this was discussed someone found a distro that
actually shipped an x32 build to users.  Which was just enough users to
keep x32 alive.  Given that distros are in the process of dropping 32bit
support I suspect that distro may be going if it is not already gone.

There are a lot of weird x32 special cases that it would be nice to get
rid of if no one uses x32, and could certainly be made less of an issue
if distro's that don't actually care about x32 simply stopped compiling
it in.

>> The core dump code is currently tied to what binary you exec.
>> The code in exec sets mm->binfmt, and the coredump code uses mm->binfmt
>> to pick the coredump handler.
>> 
>> An x32 binary will make all kinds of 64bit calls where it doesn't need
>> the compat handling.  And of course x32 binaries run in 64bit mode with
>> 32bit pointers so looking at the current execution mode doesn't help.
>> 
>> Further fun compat_binfmt_elf is shared between x32 and ia32, because
>> except for a few stray places they do exactly the same thing.
>
> FWIW, there's a series cleaning that crap up nicely; as a side benefit,
> it converts both compats on mips (o32 and n32) to regular compat_binfmt_elf.c
> Yes, the current mainline is bloody awful in that area (PRSTATUS_SIZE and
> SET_PR_FPVALID are not for weak stomach), but that's really not hard to
> get into sane shape - -next had that done in last cycle and I'm currently
> testing (well, building the test kernel) of port of that to 5.11-rc1.

That does sound interesting.  Anytime we can clean up arch specific
weirdness so that it simply becomes generic weirdness and it can be
tested and maintained by more people is always nice.

> I really don't see the point of getting rid of x32 - mips n32 is *not*
> going away, and that's an exact parallel.

>From what I maintain x32 and n32 are *not* exact parallels.  The change
in size of the timestamps of SIGCHLD is not present on any other
architecture.

The truth is someone years ago royallying messed up struct siginfo and
took a structure that should have been the same on 32bit and 64bit and
would up with a horrible monstrosity of unions.

> PS: if anything, I wonder if we would better off with binfmt_elf{64,32}.o,
> built from fs/binfmt_elf.c; it's not that hard to do.  With arseloads
> of weirdness going away if we do that...

It sounds like we are already on our way to having that.  I suspect you
would need an binfmt_elf_ops to handle the architecture specific bits,
but it should not be too bad.

Eric


Re: in_compat_syscall() on x86

2021-01-04 Thread Eric W. Biederman
Andy Lutomirski  writes:

>> On Jan 4, 2021, at 2:36 PM, David Laight  wrote:
>> 
>> From: Eric W. Biederman
>>> Sent: 04 January 2021 20:41
>>> 
>>> Al Viro  writes:
>>> 
>>>> On Mon, Jan 04, 2021 at 12:16:56PM +, David Laight wrote:
>>>>> On x86 in_compat_syscall() is defined as:
>>>>>in_ia32_syscall() || in_x32_syscall()
>>>>> 
>>>>> Now in_ia32_syscall() is a simple check of the TS_COMPAT flag.
>>>>> However in_x32_syscall() is a horrid beast that has to indirect
>>>>> through to the original %eax value (ie the syscall number) and
>>>>> check for a bit there.
>>>>> 
>>>>> So on a kernel with x32 support (probably most distro kernels)
>>>>> the in_compat_syscall() check is rather more expensive than
>>>>> one might expect.
>>> 
>>> I suggest you check the distro kernels.  I suspect they don't compile in
>>> support for x32.  As far as I can tell x32 is an undead beast of a
>>> subarchitecture that just enough people use that it can't be removed,
>>> but few enough people use it likely has a few lurking scary bugs.
>> 
>> It is defined in the Ubuntu kernel configs I've got lurking:
>> Both 3.8.0-19_generic (Ubuntu 13.04) and 5.4.0-56_generic (probably 20.04).
>> Which is probably why it is in my test builds (I've just cut out
>> a lot of modules).

Interesting.  That sounds like something a gentle prod to the Ubuntu
kernel team might get them to disable.  Especially if there are not any
x32 binaries in sight.

Maybe Ubuntu has a reason or maybe someone just enabled the option
because it was there and they could.

>>>>> It would be muck better if both checks could be done together.
>>>>> I think this would require the syscall entry code to set a
>>>>> value in both the 64bit and x32 entry paths.
>>>>> (Can a process make both 64bit and x32 system calls?)
>>>> 
>>>> Yes, it bloody well can.
>>>> 
>>>> And I see no benefit in pushing that logics into syscall entry,
>>>> since anything that calls in_compat_syscall() more than once
>>>> per syscall execution is doing the wrong thing.  Moreover,
>>>> in quite a few cases we don't call the sucker at all, and for
>>>> all of those pushing that crap into syscall entry logics is
>>>> pure loss.
>>> 
>>> The x32 system calls have their own system call table and it would be
>>> trivial to set a flag like TS_COMPAT when looking up a system call from
>>> that table.  I expect such a change would be purely in the noise.
>> 
>> Certainly a write of 0/1/2 into a dirtied cache line of 'current'
>> could easily cost absolutely nothing.
>> Especially if current has already been read.
>> 
>> I also wondered about resetting it to zero when an x32 system call
>> exits (rather than entry to a 64bit one).
>> 
>> For ia32 the flag is set (with |=) on every syscall entry.
>> Even though I'm pretty sure it can only change during exec.
>
> It can change for every syscall. I have tests that do this.
>
>>>> What's the point, really?
>>> 
>>> Before we came up with the current games with __copy_siginfo_to_user
>>> and x32_copy_siginfo_to_user I was wondering if we should make such
>>> a change.  The delivery of compat signal frames and core dumps which
>>> do not go through the system call entry path could almost benefit from
>>> a flag that could be set/tested when on those paths.
>> 
>> For signal delivery it should (probably) depend on the system call
>> that setup the signal handler.
>
> I think it has worked this way for some time now.

It always has, but there is code that called as part of signal delivery
that needs to know if it is ia32 or x32 code (namely
copy_siginfo_to_user32).  The code paths are short enough we don't
strictly need the runtime test on that path and we have been able to
remove it, but it is an example of the kind of path that is not a
syscall entry where it would be nice to set the flag.

>> Although I'm sure I remember one kernel where some of it was done
>> in libc (with a single entrypoint for all hadlers).
>> 
>>> The fact that only SIGCHLD (which can not trigger a coredump) is
>>> different saves the coredump code from needing such a test.
>>> 
>>> The fact that the signal frame code is simple enough it can directly
>>> call x32_copy_siginfo_to_user or __copy_siginfo

Re: [PATCH 0/5] Add KDF implementations to crypto API

2021-01-04 Thread Eric Biggers
On Mon, Jan 04, 2021 at 10:45:57PM +0100, Stephan Müller wrote:
> The HKDF addition is used to replace the implementation in the filesystem
> crypto extension. This code was tested by using an EXT4 encrypted file
> system that was created and contains files written to by the current
> implementation. Using the new implementation a successful read of the
> existing files was possible and new files / directories were created
> and read successfully. These newly added file system objects could be
> successfully read using the current code. Yet if there is a test suite
> to validate whether the invokcation of the HKDF calculates the same
> result as the existing implementation, I would be happy to validate
> the implementation accordingly.

See https://www.kernel.org/doc/html/latest/filesystems/fscrypt.html#tests
for how to run the fscrypt tests.  'kvm-xfstests -c ext4 generic/582' should be
enough for this, though you could run all the tests if you want.

- Eric


Re: in_compat_syscall() on x86

2021-01-04 Thread Eric W. Biederman
Al Viro  writes:

> On Mon, Jan 04, 2021 at 12:16:56PM +, David Laight wrote:
>> On x86 in_compat_syscall() is defined as:
>> in_ia32_syscall() || in_x32_syscall()
>> 
>> Now in_ia32_syscall() is a simple check of the TS_COMPAT flag.
>> However in_x32_syscall() is a horrid beast that has to indirect
>> through to the original %eax value (ie the syscall number) and
>> check for a bit there.
>> 
>> So on a kernel with x32 support (probably most distro kernels)
>> the in_compat_syscall() check is rather more expensive than
>> one might expect.

I suggest you check the distro kernels.  I suspect they don't compile in
support for x32.  As far as I can tell x32 is an undead beast of a
subarchitecture that just enough people use that it can't be removed,
but few enough people use it likely has a few lurking scary bugs.

>> It would be muck better if both checks could be done together.
>> I think this would require the syscall entry code to set a
>> value in both the 64bit and x32 entry paths.
>> (Can a process make both 64bit and x32 system calls?)
>
> Yes, it bloody well can.
>
> And I see no benefit in pushing that logics into syscall entry,
> since anything that calls in_compat_syscall() more than once
> per syscall execution is doing the wrong thing.  Moreover,
> in quite a few cases we don't call the sucker at all, and for
> all of those pushing that crap into syscall entry logics is
> pure loss.

The x32 system calls have their own system call table and it would be
trivial to set a flag like TS_COMPAT when looking up a system call from
that table.  I expect such a change would be purely in the noise.

> What's the point, really?

Before we came up with the current games with __copy_siginfo_to_user
and x32_copy_siginfo_to_user I was wondering if we should make such
a change.  The delivery of compat signal frames and core dumps which
do not go through the system call entry path could almost benefit from
a flag that could be set/tested when on those paths.

The fact that only SIGCHLD (which can not trigger a coredump) is
different saves the coredump code from needing such a test.

The fact that the signal frame code is simple enough it can directly
call x32_copy_siginfo_to_user or __copy_siginfo_to_user saves us there.

So I don't think we have any cases where we actually need a flag that
is independent of the system call but we have come very close.


For people who want to optimize I suggest tracking down the handful of
users of x32 and see if x32 can be made to just go away.

Eric



Re: [PATCH] random: initialize ChaCha20 constants with correct endianness

2021-01-04 Thread Eric Biggers
On Fri, Nov 20, 2020 at 10:52:54AM -0800, Eric Biggers wrote:
> On Mon, Oct 26, 2020 at 09:33:54AM -0700, Eric Biggers wrote:
> > On Tue, Oct 06, 2020 at 08:51:45PM -0700, Eric Biggers wrote:
> > > On Fri, Sep 18, 2020 at 02:57:05PM -0700, Eric Biggers wrote:
> > > > On Fri, Sep 18, 2020 at 04:42:07PM -0400, Theodore Y. Ts'o wrote:
> > > 
> > > Ted, any further feedback on this?  Are you planning to apply this patch?
> > > 
> > > - Eric
> > 
> > Ping.
> 
> Ping.

Ping.


Re: [PATCH] random: remove dead code left over from blocking pool

2021-01-04 Thread Eric Biggers
On Fri, Nov 20, 2020 at 10:52:35AM -0800, Eric Biggers wrote:
> On Mon, Oct 26, 2020 at 09:34:03AM -0700, Eric Biggers wrote:
> > On Tue, Oct 06, 2020 at 08:50:58PM -0700, Eric Biggers wrote:
> > > On Tue, Sep 15, 2020 at 09:36:52PM -0700, Eric Biggers wrote:
> > > > From: Eric Biggers 
> > > > 
> > > > Remove some dead code that was left over following commit 90ea1c6436d2
> > > > ("random: remove the blocking pool").
> > > > 
> > > > Signed-off-by: Eric Biggers 
> > > 
> > > Ping?
> > 
> > Ping.
> 
> Ping.

Ping.


Re: [PATCH] random: fix the RNDRESEEDCRNG ioctl

2021-01-04 Thread Eric Biggers
On Fri, Nov 20, 2020 at 10:52:14AM -0800, Eric Biggers wrote:
> On Mon, Oct 26, 2020 at 09:33:43AM -0700, Eric Biggers wrote:
> > On Tue, Oct 06, 2020 at 08:50:21PM -0700, Eric Biggers wrote:
> > > On Tue, Sep 15, 2020 at 09:19:08PM -0700, Eric Biggers wrote:
> > > > From: Eric Biggers 
> > > > 
> > > > The RNDRESEEDCRNG ioctl reseeds the primary_crng from itself, which
> > > > doesn't make sense.  Reseed it from the input_pool instead.
> > > > 
> > > > Fixes: d848e5f8e1eb ("random: add new ioctl RNDRESEEDCRNG")
> > > > Cc: sta...@vger.kernel.org
> > > > Signed-off-by: Eric Biggers 
> > > 
> > > Ping?
> > 
> > Ping.
> 
> Ping.

Ping.


Re: [PATCH v3] vfs: don't unnecessarily clone write access for writable fds

2021-01-04 Thread Eric Biggers
On Tue, Sep 22, 2020 at 09:44:18AM -0700, Eric Biggers wrote:
> From: Eric Biggers 
> 
> There's no need for mnt_want_write_file() to increment mnt_writers when
> the file is already open for writing, provided that
> mnt_drop_write_file() is changed to conditionally decrement it.
> 
> We seem to have ended up in the current situation because
> mnt_want_write_file() used to be paired with mnt_drop_write(), due to
> mnt_drop_write_file() not having been added yet.  So originally
> mnt_want_write_file() had to always increment mnt_writers.
> 
> But later mnt_drop_write_file() was added, and all callers of
> mnt_want_write_file() were paired with it.  This makes the compatibility
> between mnt_want_write_file() and mnt_drop_write() no longer necessary.
> 
> Therefore, make __mnt_want_write_file() and __mnt_drop_write_file() skip
> incrementing mnt_writers on files already open for writing.  This
> removes the only caller of mnt_clone_write(), so remove that too.
> 
> Signed-off-by: Eric Biggers 
> ---
> 
> v3: added note to porting file.
> v2: keep the check for emergency r/o remounts.
> 
>  Documentation/filesystems/porting.rst |  7 
>  fs/namespace.c| 53 ++-
>  include/linux/mount.h |  1 -
>  3 files changed, 27 insertions(+), 34 deletions(-)

Ping.


[PATCH 2/2] Bluetooth: hci_h5: Add support for binding RTL8723DS with device tree

2021-01-02 Thread John-Eric Kamps
RTL8723DS could be handled by btrtl-driver, so add ability to bind it
using device tree.

Signed-off-by: John-Eric Kamps 
---
 drivers/bluetooth/hci_h5.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/bluetooth/hci_h5.c b/drivers/bluetooth/hci_h5.c
index 7be16a7f653b..fb9817f97d45 100644
--- a/drivers/bluetooth/hci_h5.c
+++ b/drivers/bluetooth/hci_h5.c
@@ -1022,6 +1022,8 @@ static const struct of_device_id rtl_bluetooth_of_match[] 
= {
  .data = (const void *)&rtl_vnd },
{ .compatible = "realtek,rtl8723bs-bt",
  .data = (const void *)&rtl_vnd },
+   { .compatible = "realtek,rtl8723ds-bt",
+ .data = (const void *)&rtl_vnd },
 #endif
{ },
 };
--
2.25.1



[PATCH 1/2] dt-bindings: net: bluetooth: Add rtl8723ds-bluetooth

2021-01-02 Thread John-Eric Kamps
Add binding documentation for bluetooth part of RTL8723DS

Signed-off-by: John-Eric Kamps 
---
 Documentation/devicetree/bindings/net/realtek-bluetooth.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/realtek-bluetooth.yaml 
b/Documentation/devicetree/bindings/net/realtek-bluetooth.yaml
index 4f485df69ac3..3cc21dc056e0 100644
--- a/Documentation/devicetree/bindings/net/realtek-bluetooth.yaml
+++ b/Documentation/devicetree/bindings/net/realtek-bluetooth.yaml
@@ -4,14 +4,14 @@
 $id: http://devicetree.org/schemas/net/realtek-bluetooth.yaml#
 $schema: http://devicetree.org/meta-schemas/core.yaml#

-title: RTL8723BS/RTL8723CS/RTL8822CS Bluetooth Device Tree Bindings
+title: RTL8723BS/RTL8723CS/RTL8723DS/RTL8822CS Bluetooth Device Tree Bindings

 maintainers:
   - Vasily Khoruzhick 
   - Alistair Francis 

 description:
-  RTL8723CS/RTL8723CS/RTL8822CS is WiFi + BT chip. WiFi part is connected over
+  RTL8723CS/RTL8723CS/RTL8723DS/RTL8822CS is WiFi + BT chip. WiFi part is 
connected over
   SDIO, while BT is connected over serial. It speaks H5 protocol with few
   extra commands to upload firmware and change module speed.

@@ -20,6 +20,7 @@ properties:
 oneOf:
   - const: "realtek,rtl8723bs-bt"
   - const: "realtek,rtl8723cs-bt"
+  - const: "realtek,rtl8723ds-bt"
   - const: "realtek,rtl8822cs-bt"

   device-wake-gpios:
--
2.25.1



Re: [PATCH v2 01/10] vfs: move cap_convert_nscap() call into vfs_setxattr()

2021-01-01 Thread Eric W. Biederman
Miklos Szeredi  writes:

> cap_convert_nscap() does permission checking as well as conversion of the
> xattr value conditionally based on fs's user-ns.
>
> This is needed by overlayfs and probably other layered fs (ecryptfs) and is
> what vfs_foo() is supposed to do anyway.

Well crap.

I just noticed this and it turns out this change is wrong.

The problem is that it reads the rootid from the v3 fscap, using
current_user_ns() and then writes it using the sb->s_user_ns.

So any time the stacked filesystems sb->s_user_ns do not match or
current_user_ns does not match sb->s_user_ns this could be a problem.

In a stacked filesystem a second pass through vfs_setxattr will result
in the rootid being translated a second time (with potentially the wrong
namespaces).  I think because of the security checks a we won't write
something we shouldn't be able to write to the filesystem.  Still we
will be writing the wrong v3 fscap which can go quite badly.

This doesn't look terribly difficult to fix.

Probably convert this into a fs independent form using uids in
init_user_ns at input and have cap_convert_nscap convert the v3 fscap
into the filesystem dependent form.  With some way for stackable
filesystems to just skip converting it from the filesystem independent
format.

Uids in xattrs that are expected to go directly to disk, but aren't
always suitable for going directly to disk are tricky.

Eric

> Signed-off-by: Miklos Szeredi 
> ---
>  fs/xattr.c | 17 +++--
>  include/linux/capability.h |  2 +-
>  security/commoncap.c   |  3 +--
>  3 files changed, 13 insertions(+), 9 deletions(-)
>
> diff --git a/fs/xattr.c b/fs/xattr.c
> index cd7a563e8bcd..fd57153b1f61 100644
> --- a/fs/xattr.c
> +++ b/fs/xattr.c
> @@ -276,8 +276,16 @@ vfs_setxattr(struct dentry *dentry, const char *name, 
> const void *value,
>  {
>   struct inode *inode = dentry->d_inode;
>   struct inode *delegated_inode = NULL;
> + const void  *orig_value = value;
>   int error;
>  
> + if (size && strcmp(name, XATTR_NAME_CAPS) == 0) {
> + error = cap_convert_nscap(dentry, &value, size);
> + if (error < 0)
> + return error;
> + size = error;
> + }
> +
>  retry_deleg:
>   inode_lock(inode);
>   error = __vfs_setxattr_locked(dentry, name, value, size, flags,
> @@ -289,6 +297,9 @@ vfs_setxattr(struct dentry *dentry, const char *name, 
> const void *value,
>   if (!error)
>   goto retry_deleg;
>   }
> + if (value != orig_value)
> + kfree(value);
> +
>   return error;
>  }
>  EXPORT_SYMBOL_GPL(vfs_setxattr);
> @@ -537,12 +548,6 @@ setxattr(struct dentry *d, const char __user *name, 
> const void __user *value,
>   if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
>   (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
>   posix_acl_fix_xattr_from_user(kvalue, size);
> - else if (strcmp(kname, XATTR_NAME_CAPS) == 0) {
> - error = cap_convert_nscap(d, &kvalue, size);
> - if (error < 0)
> - goto out;
> - size = error;
> - }
>   }
>  
>   error = vfs_setxattr(d, kname, kvalue, size, flags);
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index 1e7fe311cabe..b2f698915c0f 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -270,6 +270,6 @@ static inline bool checkpoint_restore_ns_capable(struct 
> user_namespace *ns)
>  /* audit system wants to get cap info from files as well */
>  extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
> cpu_vfs_cap_data *cpu_caps);
>  
> -extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
> size);
> +extern int cap_convert_nscap(struct dentry *dentry, const void **ivalue, 
> size_t size);
>  
>  #endif /* !_LINUX_CAPABILITY_H */
> diff --git a/security/commoncap.c b/security/commoncap.c
> index 59bf3c1674c8..baccd871 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -473,7 +473,7 @@ static bool validheader(size_t size, const struct 
> vfs_cap_data *cap)
>   *
>   * If all is ok, we return the new size, on error return < 0.
>   */
> -int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size)
> +int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t 
> size)
>  {
>   struct vfs_ns_cap_data *nscap;
>   uid_t nsrootid;
> @@ -516,7 +516,6 @@ int cap_convert_nscap(struct dentry *dentry, void 
> **ivalue, size_t size)
>   nscap->magic_etc = cpu_to_le32(nsmagic);
>   memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
>  
> - kvfree(*ivalue);
>   *ivalue = nscap;
>   return newsize;
>  }


<    1   2   3   4   5   6   7   8   9   10   >