Re: [PATCH v3 4/5] KVM: arm64: selftests: get-reg-list: Remove get-reg-list-sve

2021-06-02 Thread Ricardo Koller
On Mon, May 31, 2021 at 12:33:43PM +0200, Andrew Jones wrote:
> Now that we can easily run the test for multiple vcpu configs, let's
> merge get-reg-list and get-reg-list-sve into just get-reg-list. We
> also make a final change that makes running multiple tests more
> practical: fork each test rather than running it directly, so that
> one test may fail while the remaining tests still run.
> 
> Signed-off-by: Andrew Jones 

Reviewed-by: Ricardo Koller 
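
For readers skimming the archive, a minimal stand-alone sketch of the
fork-per-config pattern the commit message describes (illustrative only;
the names below are placeholders, not the selftest's API): each config
runs in a child process, so one failing config cannot stop the others.

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

/* Placeholder for the real per-config test body. */
static void run_one_config(int cfg)
{
	if (cfg == 1)
		abort();	/* model one config failing */
	printf("config %d: PASS\n", cfg);
}

int main(void)
{
	int ret = 0;

	for (int cfg = 0; cfg < 3; ++cfg) {
		pid_t pid = fork();

		if (!pid) {
			run_one_config(cfg);
			exit(0);
		} else {
			int wstatus;
			pid_t wpid = wait(&wstatus);

			/* Record the failure, but keep running the rest. */
			if (wpid != pid || !WIFEXITED(wstatus) || WEXITSTATUS(wstatus))
				ret = 1;
		}
	}
	return ret;
}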

> ---
>  tools/testing/selftests/kvm/.gitignore|  1 -
>  tools/testing/selftests/kvm/Makefile  |  1 -
>  .../selftests/kvm/aarch64/get-reg-list-sve.c  |  3 --
>  .../selftests/kvm/aarch64/get-reg-list.c  | 31 +--
>  4 files changed, 21 insertions(+), 15 deletions(-)
>  delete mode 100644 tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c
> 
> diff --git a/tools/testing/selftests/kvm/.gitignore 
> b/tools/testing/selftests/kvm/.gitignore
> index 524c857a049c..dd36575b732a 100644
> --- a/tools/testing/selftests/kvm/.gitignore
> +++ b/tools/testing/selftests/kvm/.gitignore
> @@ -1,6 +1,5 @@
>  # SPDX-License-Identifier: GPL-2.0-only
>  /aarch64/get-reg-list
> -/aarch64/get-reg-list-sve
>  /aarch64/vgic_init
>  /s390x/memop
>  /s390x/resets
> diff --git a/tools/testing/selftests/kvm/Makefile 
> b/tools/testing/selftests/kvm/Makefile
> index daaee1888b12..5c8f3725a7f0 100644
> --- a/tools/testing/selftests/kvm/Makefile
> +++ b/tools/testing/selftests/kvm/Makefile
> @@ -79,7 +79,6 @@ TEST_GEN_PROGS_x86_64 += set_memory_region_test
>  TEST_GEN_PROGS_x86_64 += steal_time
>  
>  TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
> -TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve
>  TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
>  TEST_GEN_PROGS_aarch64 += demand_paging_test
>  TEST_GEN_PROGS_aarch64 += dirty_log_test
> diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c 
> b/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c
> deleted file mode 100644
> index efba76682b4b..
> --- a/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -// SPDX-License-Identifier: GPL-2.0
> -#define REG_LIST_SVE
> -#include "get-reg-list.c"
> diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c 
> b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
> index 03e041d97a18..b46b8a1fdc0c 100644
> --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c
> +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
> @@ -27,16 +27,13 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
> +#include 
>  #include "kvm_util.h"
>  #include "test_util.h"
>  #include "processor.h"
>  
> -#ifdef REG_LIST_SVE
> -#define reg_list_sve() (true)
> -#else
> -#define reg_list_sve() (false)
> -#endif
> -
>  static struct kvm_reg_list *reg_list;
>  static __u64 *blessed_reg, blessed_n;
>  
> @@ -588,7 +585,8 @@ static struct vcpu_config *parse_config(const char 
> *config)
>  int main(int ac, char **av)
>  {
>   struct vcpu_config *c, *sel = NULL;
> - int i;
> + int i, ret = 0;
> + pid_t pid;
>  
>   for (i = 1; i < ac; ++i) {
>   if (strcmp(av[i], "--core-reg-fixup") == 0)
> @@ -617,10 +615,22 @@ int main(int ac, char **av)
>   c = vcpu_configs[i];
>   if (sel && c != sel)
>   continue;
> - run_test(c);
> +
> + pid = fork();
> +
> + if (!pid) {
> + run_test(c);
> + exit(0);
> + } else {
> + int wstatus;
> + pid_t wpid = wait(&wstatus);
> + TEST_ASSERT(wpid == pid && WIFEXITED(wstatus), "wait: Unexpected return");
> + if (WEXITSTATUS(wstatus) && WEXITSTATUS(wstatus) != KSFT_SKIP)
> + ret = KSFT_FAIL;
> + }
>   }
>  
> - return 0;
> + return ret;
>  }
>  
>  /*
> @@ -1026,6 +1036,7 @@ static struct vcpu_config sve_config = {
>  };
>  
>  static struct vcpu_config *vcpu_configs[] = {
> - reg_list_sve() ? &sve_config : &vregs_config,
> + &vregs_config,
> + &sve_config,
>  };
>  static int vcpu_configs_n = ARRAY_SIZE(vcpu_configs);
> -- 
> 2.31.1
> 


Re: [PATCH v3 3/5] KVM: arm64: selftests: get-reg-list: Provide config selection option

2021-06-02 Thread Ricardo Koller
On Mon, May 31, 2021 at 12:33:42PM +0200, Andrew Jones wrote:
> Add a new command line option that allows the user to select a specific
> configuration, e.g. --config=sve will give the sve config. Also provide
> help text and the --help/-h options.
> 
> Signed-off-by: Andrew Jones 

Reviewed-by: Ricardo Koller 
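
As a side note for readers: selection works by matching everything after
"--config=" against the known config names. A stand-alone sketch of that
idea (hypothetical names, not the patch's code):

#include <stdio.h>
#include <string.h>

static const char *known_configs[] = { "vregs", "sve" };

/* Return the matching config name, or NULL if the argument is not a
 * well-formed "--config=<name>" option naming a known config. */
static const char *parse_config_arg(const char *arg)
{
	size_t i;

	if (strncmp(arg, "--config=", 9) != 0)
		return NULL;
	for (i = 0; i < sizeof(known_configs) / sizeof(known_configs[0]); ++i)
		if (strcmp(known_configs[i], arg + 9) == 0)
			return known_configs[i];
	return NULL;
}

int main(int argc, char **argv)
{
	int i;

	for (i = 1; i < argc; ++i) {
		const char *name = parse_config_arg(argv[i]);

		printf("%s -> %s\n", argv[i], name ? name : "unknown");
	}
	return 0;
}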

> ---
>  .../selftests/kvm/aarch64/get-reg-list.c  | 56 ++-
>  1 file changed, 53 insertions(+), 3 deletions(-)
> 
> diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c 
> b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
> index 14fc8d82e30f..03e041d97a18 100644
> --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c
> +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
> @@ -539,6 +539,52 @@ static void run_test(struct vcpu_config *c)
>   kvm_vm_free(vm);
>  }
>  
> +static void help(void)
> +{
> + struct vcpu_config *c;
> + int i;
> +
> + printf(
> + "\n"
> + "usage: get-reg-list [--config=] [--list] [--list-filtered] 
> [--core-reg-fixup]\n\n"
> + " --config=Used to select a specific vcpu 
> configuration for the test/listing\n"
> + " '' may be\n");
> +
> + for (i = 0; i < vcpu_configs_n; ++i) {
> + c = vcpu_configs[i];
> + printf(
> + "   '%s'\n", config_name(c));
> + }
> +
> + printf(
> + "\n"
> + " --list  Print the register list rather than test 
> it (requires --config)\n"
> + " --list-filtered Print registers that would normally be 
> filtered out (requires --config)\n"
> + " --core-reg-fixupNeeded when running on old kernels with 
> broken core reg listings\n"
> + "\n"
> + );
> +}
> +
> +static struct vcpu_config *parse_config(const char *config)
> +{
> + struct vcpu_config *c;
> + int i;
> +
> + if (config[8] != '=')
> + help(), exit(1);
> +
> + for (i = 0; i < vcpu_configs_n; ++i) {
> + c = vcpu_configs[i];
> + if (strcmp(config_name(c), &config[9]) == 0)
> + break;
> + }
> +
> + if (i == vcpu_configs_n)
> + help(), exit(1);
> +
> + return c;
> +}
> +
>  int main(int ac, char **av)
>  {
>   struct vcpu_config *c, *sel = NULL;
> @@ -547,20 +593,24 @@ int main(int ac, char **av)
>   for (i = 1; i < ac; ++i) {
>   if (strcmp(av[i], "--core-reg-fixup") == 0)
>   fixup_core_regs = true;
> + else if (strncmp(av[i], "--config", 8) == 0)
> + sel = parse_config(av[i]);
>   else if (strcmp(av[i], "--list") == 0)
>   print_list = true;
>   else if (strcmp(av[i], "--list-filtered") == 0)
>   print_filtered = true;
> + else if (strcmp(av[i], "--help") == 0 || strcmp(av[i], "-h") == 0)
> + help(), exit(0);
>   else
> - TEST_FAIL("Unknown option: %s\n", av[i]);
> + help(), exit(1);
>   }
>  
>   if (print_list || print_filtered) {
>   /*
>* We only want to print the register list of a single config.
> -  * TODO: Add command line support to pick which config.
>*/
> - sel = vcpu_configs[0];
> + if (!sel)
> + help(), exit(1);
>   }
>  
>   for (i = 0; i < vcpu_configs_n; ++i) {
> -- 
> 2.31.1
> 


Re: [PATCH v3 2/5] KVM: arm64: selftests: get-reg-list: Prepare to run multiple configs at once

2021-06-02 Thread Ricardo Koller
On Mon, May 31, 2021 at 12:33:41PM +0200, Andrew Jones wrote:
> We don't want to have to create a new binary for each vcpu config, so
> prepare to run the test for multiple vcpu configs in a single binary.
> We do this by factoring out the test from main() and then looping over
> configs. When given '--list' we still never print more than a single
> reg-list for a single vcpu config though, because it would be confusing
> otherwise.
> 
> No functional change intended.
> 
> Signed-off-by: Andrew Jones 

Reviewed-by: Ricardo Koller 

> ---
>  .../selftests/kvm/aarch64/get-reg-list.c  | 68 ++-
>  1 file changed, 51 insertions(+), 17 deletions(-)
> 
> diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c 
> b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
> index 7bb09ce20dde..14fc8d82e30f 100644
> --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c
> +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
> @@ -56,8 +56,8 @@ struct vcpu_config {
>   struct reg_sublist sublists[];
>  };
>  
> -static struct vcpu_config vregs_config;
> -static struct vcpu_config sve_config;
> +static struct vcpu_config *vcpu_configs[];
> +static int vcpu_configs_n;
>  
>  #define for_each_sublist(c, s)   
> \
>   for ((s) = &(c)->sublists[0]; (s)->regs; ++(s))
> @@ -400,29 +400,20 @@ static void check_supported(struct vcpu_config *c)
>   }
>  }
>  
> -int main(int ac, char **av)
> +static bool print_list;
> +static bool print_filtered;
> +static bool fixup_core_regs;
> +
> +static void run_test(struct vcpu_config *c)
>  {
> - struct vcpu_config *c = reg_list_sve() ? &sve_config : &vregs_config;
>   struct kvm_vcpu_init init = { .target = -1, };
>   int new_regs = 0, missing_regs = 0, i, n;
>   int failed_get = 0, failed_set = 0, failed_reject = 0;
> - bool print_list = false, print_filtered = false, fixup_core_regs = 
> false;
>   struct kvm_vm *vm;
>   struct reg_sublist *s;
>  
>   check_supported(c);
>  
> - for (i = 1; i < ac; ++i) {
> - if (strcmp(av[i], "--core-reg-fixup") == 0)
> - fixup_core_regs = true;
> - else if (strcmp(av[i], "--list") == 0)
> - print_list = true;
> - else if (strcmp(av[i], "--list-filtered") == 0)
> - print_filtered = true;
> - else
> - TEST_FAIL("Unknown option: %s\n", av[i]);
> - }
> -
>   vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR);
>   prepare_vcpu_init(c, &init);
>   aarch64_vcpu_add_default(vm, 0, &init, NULL);
> @@ -442,7 +433,7 @@ int main(int ac, char **av)
>   print_reg(c, id);
>   }
>   putchar('\n');
> - return 0;
> + return;
>   }
>  
>   /*
> @@ -541,6 +532,44 @@ int main(int ac, char **av)
>   "%d registers failed get; %d registers failed set; %d 
> registers failed reject",
>   config_name(c), missing_regs, failed_get, failed_set, 
> failed_reject);
>  
> + pr_info("%s: PASS\n", config_name(c));
> + blessed_n = 0;
> + free(blessed_reg);
> + free(reg_list);
> + kvm_vm_free(vm);
> +}
> +
> +int main(int ac, char **av)
> +{
> + struct vcpu_config *c, *sel = NULL;
> + int i;
> +
> + for (i = 1; i < ac; ++i) {
> + if (strcmp(av[i], "--core-reg-fixup") == 0)
> + fixup_core_regs = true;
> + else if (strcmp(av[i], "--list") == 0)
> + print_list = true;
> + else if (strcmp(av[i], "--list-filtered") == 0)
> + print_filtered = true;
> + else
> + TEST_FAIL("Unknown option: %s\n", av[i]);
> + }
> +
> + if (print_list || print_filtered) {
> + /*
> +  * We only want to print the register list of a single config.
> +  * TODO: Add command line support to pick which config.
> +  */
> + sel = vcpu_configs[0];
> + }
> +
> + for (i = 0; i < vcpu_configs_n; ++i) {
> + c = vcpu_configs[i];
> + if (sel && c != sel)
> + continue;
> + run_test(c);
> + }
> +
>   return 0;
>  }
>  
> @@ -945,3 +974,8 @@ static struct vcpu_config sve_config = {
>   {0},
>   },
>  };
> +
> +static struct vcpu_config *vcpu_configs[] = {
> + reg_list_sve() ? &sve_config : &vregs_config,
> +};
> +static int vcpu_configs_n = ARRAY_SIZE(vcpu_configs);
> -- 
> 2.31.1
> 


Re: [PATCH v3 1/5] KVM: arm64: selftests: get-reg-list: Introduce vcpu configs

2021-06-02 Thread Ricardo Koller
On Mon, May 31, 2021 at 12:33:40PM +0200, Andrew Jones wrote:
> We already break register lists into sublists that get selected based
> on vcpu config. However, since we only had two configs (vregs and sve),
> we didn't structure the code very well to manage them. Restructure it
> now to more cleanly handle register sublists that are dependent on the
> vcpu config.
> 
> This patch has no intended functional change (except for the vcpu
> config name now being prepended to all output).
> 
> Signed-off-by: Andrew Jones 

Reviewed-by: Ricardo Koller 
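
For context, the restructuring below hangs each config off an array of
register sublists terminated by an all-zero entry. A self-contained
sketch of that shape (hypothetical field names, dummy register values):

#include <stdio.h>

struct sublist {
	const char *name;
	const int *regs;
	int regs_n;
};

struct config {
	const char *name;
	struct sublist sublists[4];
};

/* Iterate until the all-zero sentinel entry (regs == NULL). */
#define for_each_sublist(c, s) \
	for ((s) = &(c)->sublists[0]; (s)->regs; ++(s))

static const int base_regs[] = { 1, 2, 3 };
static const int vregs[] = { 4, 5 };

static struct config vregs_config = {
	.sublists = {
		{ "base",  base_regs, 3 },
		{ "vregs", vregs, 2 },
		{0},	/* sentinel */
	},
};

int main(void)
{
	struct sublist *s;
	int total = 0;

	for_each_sublist(&vregs_config, s)
		total += s->regs_n;
	printf("%d registers in %td sublists\n",
	       total, s - vregs_config.sublists);
	return 0;
}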

> ---
>  .../selftests/kvm/aarch64/get-reg-list.c  | 265 --
>  1 file changed, 175 insertions(+), 90 deletions(-)
> 
> diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c 
> b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
> index 486932164cf2..7bb09ce20dde 100644
> --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c
> +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
> @@ -37,7 +37,30 @@
>  #define reg_list_sve() (false)
>  #endif
>  
> -#define REG_MASK (KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | 
> KVM_REG_ARM_COPROC_MASK)
> +static struct kvm_reg_list *reg_list;
> +static __u64 *blessed_reg, blessed_n;
> +
> +struct reg_sublist {
> + const char *name;
> + long capability;
> + int feature;
> + bool finalize;
> + __u64 *regs;
> + __u64 regs_n;
> + __u64 *rejects_set;
> + __u64 rejects_set_n;
> +};
> +
> +struct vcpu_config {
> + char *name;
> + struct reg_sublist sublists[];
> +};
> +
> +static struct vcpu_config vregs_config;
> +static struct vcpu_config sve_config;
> +
> +#define for_each_sublist(c, s)   
> \
> + for ((s) = &(c)->sublists[0]; (s)->regs; ++(s))
>  
>  #define for_each_reg(i)  
> \
>   for ((i) = 0; (i) < reg_list->n; ++(i))
> @@ -54,12 +77,41 @@
>   for_each_reg_filtered(i)
> \
>   if (!find_reg(blessed_reg, blessed_n, reg_list->reg[i]))
>  
> +static const char *config_name(struct vcpu_config *c)
> +{
> + struct reg_sublist *s;
> + int len = 0;
>  
> -static struct kvm_reg_list *reg_list;
> + if (c->name)
> + return c->name;
>  
> -static __u64 base_regs[], vregs[], sve_regs[], rejects_set[];
> -static __u64 base_regs_n, vregs_n, sve_regs_n, rejects_set_n;
> -static __u64 *blessed_reg, blessed_n;
> + for_each_sublist(c, s)
> + len += strlen(s->name) + 1;
> +
> + c->name = malloc(len);
> +
> + len = 0;
> + for_each_sublist(c, s) {
> + if (!strcmp(s->name, "base"))
> + continue;
> + strcat(c->name + len, s->name);
> + len += strlen(s->name) + 1;
> + c->name[len - 1] = '+';
> + }
> + c->name[len - 1] = '\0';
> +
> + return c->name;
> +}
> +
> +static bool has_cap(struct vcpu_config *c, long capability)
> +{
> + struct reg_sublist *s;
> +
> + for_each_sublist(c, s)
> + if (s->capability == capability)
> + return true;
> + return false;
> +}
>  
>  static bool filter_reg(__u64 reg)
>  {
> @@ -96,11 +148,13 @@ static const char *str_with_index(const char *template, 
> __u64 index)
>   return (const char *)str;
>  }
>  
> +#define REG_MASK (KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | 
> KVM_REG_ARM_COPROC_MASK)
> +
>  #define CORE_REGS_XX_NR_WORDS2
>  #define CORE_SPSR_XX_NR_WORDS2
>  #define CORE_FPREGS_XX_NR_WORDS  4
>  
> -static const char *core_id_to_str(__u64 id)
> +static const char *core_id_to_str(struct vcpu_config *c, __u64 id)
>  {
>   __u64 core_off = id & ~REG_MASK, idx;
>  
> @@ -111,7 +165,7 @@ static const char *core_id_to_str(__u64 id)
>   case KVM_REG_ARM_CORE_REG(regs.regs[0]) ...
>KVM_REG_ARM_CORE_REG(regs.regs[30]):
>   idx = (core_off - KVM_REG_ARM_CORE_REG(regs.regs[0])) / 
> CORE_REGS_XX_NR_WORDS;
> - TEST_ASSERT(idx < 31, "Unexpected regs.regs index: %lld", idx);
> + TEST_ASSERT(idx < 31, "%s: Unexpected regs.regs index: %lld", 
> config_name(c), idx);
>   return str_with_index("KVM_REG_ARM_CORE_REG(regs.regs[##])", 
> idx);
>   case KVM_REG_ARM_CORE_REG(regs.sp):
>   return "KVM_REG_ARM_CORE_REG(regs.sp)";
> @@ -126,12 +180,12 @@ static const char *core_id_to_str(__u64 id)
>   case KVM_REG_ARM_CORE_REG(spsr[0]) ...
>KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]):
>   idx = (core_off - KVM_REG_ARM_CORE_REG(spsr[0])) / 
> CORE_SPSR_XX_NR_WORDS;
> - TEST_ASSERT(idx < KVM_NR_SPSR, "Unexpected spsr index: %lld", 
> idx);
> + TEST_ASSERT(idx < KVM_NR_SPSR, "%s: Unexpected spsr index: %lld", config_name(c), idx);
>   return str_with_index("KVM_REG_ARM_CORE_REG(spsr[##])", idx);
>   case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) 

Re: [PATCH v3 5/5] KVM: arm64: selftests: get-reg-list: Split base and pmu registers

2021-06-02 Thread Ricardo Koller
On Mon, May 31, 2021 at 12:33:44PM +0200, Andrew Jones wrote:
> Since KVM commit 11663111cd49 ("KVM: arm64: Hide PMU registers from
> userspace when not available") the get-reg-list* tests have been
> failing with
> 
>   ...
>   ... There are 74 missing registers.
>   The following lines are missing registers:
>   ...
> 
> where the 74 missing registers are all PMU registers. This isn't a
> KVM bug that the selftest found. It's true that a KVM userspace
> which wasn't setting the KVM_ARM_VCPU_PMU_V3 vcpu flag, but still
> expected the PMU registers to be in the reg-list, would suddenly no
> longer have its expectations met; those expectations were wrong,
> though, so that KVM userspace needs to be fixed, and so does this
> selftest. The fix for this selftest is to
> pull the PMU registers out of the base register sublist into their
> own sublist and then create new, pmu-enabled vcpu configs which can
> be tested.
> 
> Signed-off-by: Andrew Jones 

Reviewed-by: Ricardo Koller 

> ---
>  .../selftests/kvm/aarch64/get-reg-list.c  | 39 +++
>  1 file changed, 31 insertions(+), 8 deletions(-)
> 
> diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c 
> b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
> index b46b8a1fdc0c..a16c8f05366c 100644
> --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c
> +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
> @@ -637,7 +637,7 @@ int main(int ac, char **av)
>   * The current blessed list was primed with the output of kernel version
>   * v4.15 with --core-reg-fixup and then later updated with new registers.
>   *
> - * The blessed list is up to date with kernel version v5.10-rc5
> + * The blessed list is up to date with kernel version v5.13-rc3
>   */
>  static __u64 base_regs[] = {
>   KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | 
> KVM_REG_ARM_CORE_REG(regs.regs[0]),
> @@ -829,8 +829,6 @@ static __u64 base_regs[] = {
>   ARM64_SYS_REG(3, 0, 5, 2, 0),   /* ESR_EL1 */
>   ARM64_SYS_REG(3, 0, 6, 0, 0),   /* FAR_EL1 */
>   ARM64_SYS_REG(3, 0, 7, 4, 0),   /* PAR_EL1 */
> - ARM64_SYS_REG(3, 0, 9, 14, 1),  /* PMINTENSET_EL1 */
> - ARM64_SYS_REG(3, 0, 9, 14, 2),  /* PMINTENCLR_EL1 */
>   ARM64_SYS_REG(3, 0, 10, 2, 0),  /* MAIR_EL1 */
>   ARM64_SYS_REG(3, 0, 10, 3, 0),  /* AMAIR_EL1 */
>   ARM64_SYS_REG(3, 0, 12, 0, 0),  /* VBAR_EL1 */
> @@ -839,6 +837,16 @@ static __u64 base_regs[] = {
>   ARM64_SYS_REG(3, 0, 13, 0, 4),  /* TPIDR_EL1 */
>   ARM64_SYS_REG(3, 0, 14, 1, 0),  /* CNTKCTL_EL1 */
>   ARM64_SYS_REG(3, 2, 0, 0, 0),   /* CSSELR_EL1 */
> + ARM64_SYS_REG(3, 3, 13, 0, 2),  /* TPIDR_EL0 */
> + ARM64_SYS_REG(3, 3, 13, 0, 3),  /* TPIDRRO_EL0 */
> + ARM64_SYS_REG(3, 4, 3, 0, 0),   /* DACR32_EL2 */
> + ARM64_SYS_REG(3, 4, 5, 0, 1),   /* IFSR32_EL2 */
> + ARM64_SYS_REG(3, 4, 5, 3, 0),   /* FPEXC32_EL2 */
> +};
> +
> +static __u64 pmu_regs[] = {
> + ARM64_SYS_REG(3, 0, 9, 14, 1),  /* PMINTENSET_EL1 */
> + ARM64_SYS_REG(3, 0, 9, 14, 2),  /* PMINTENCLR_EL1 */
>   ARM64_SYS_REG(3, 3, 9, 12, 0),  /* PMCR_EL0 */
>   ARM64_SYS_REG(3, 3, 9, 12, 1),  /* PMCNTENSET_EL0 */
>   ARM64_SYS_REG(3, 3, 9, 12, 2),  /* PMCNTENCLR_EL0 */
> @@ -848,8 +856,6 @@ static __u64 base_regs[] = {
>   ARM64_SYS_REG(3, 3, 9, 13, 0),  /* PMCCNTR_EL0 */
>   ARM64_SYS_REG(3, 3, 9, 14, 0),  /* PMUSERENR_EL0 */
>   ARM64_SYS_REG(3, 3, 9, 14, 3),  /* PMOVSSET_EL0 */
> - ARM64_SYS_REG(3, 3, 13, 0, 2),  /* TPIDR_EL0 */
> - ARM64_SYS_REG(3, 3, 13, 0, 3),  /* TPIDRRO_EL0 */
>   ARM64_SYS_REG(3, 3, 14, 8, 0),
>   ARM64_SYS_REG(3, 3, 14, 8, 1),
>   ARM64_SYS_REG(3, 3, 14, 8, 2),
> @@ -913,9 +919,6 @@ static __u64 base_regs[] = {
>   ARM64_SYS_REG(3, 3, 14, 15, 5),
>   ARM64_SYS_REG(3, 3, 14, 15, 6),
>   ARM64_SYS_REG(3, 3, 14, 15, 7), /* PMCCFILTR_EL0 */
> - ARM64_SYS_REG(3, 4, 3, 0, 0),   /* DACR32_EL2 */
> - ARM64_SYS_REG(3, 4, 5, 0, 1),   /* IFSR32_EL2 */
> - ARM64_SYS_REG(3, 4, 5, 3, 0),   /* FPEXC32_EL2 */
>  };
>  
>  static __u64 vregs[] = {
> @@ -1015,6 +1018,8 @@ static __u64 sve_rejects_set[] = {
>   { "base", .regs = base_regs, .regs_n = ARRAY_SIZE(base_regs), }
>  #define VREGS_SUBLIST \
>   { "vregs", .regs = vregs, .regs_n = ARRAY_SIZE(vregs), }
> +#define PMU_SUBLIST \
> + { "pmu", .regs = pmu_regs, .regs_n = ARRAY_SIZE(pmu_regs), }
>  #define SVE_SUBLIST \
>   { "sve", .capability = KVM_CAP_ARM_SVE, .feature = KVM_ARM_VCPU_SVE, 
> .finalize = true, \
> .regs = sve_regs, .regs_n = ARRAY_SIZE(sve_regs), \
> @@ -1027,6 +1032,14 @@ static struct vcpu_config vregs_config = {
>   {0},
>   },
>  };
> +static struct vcpu_config vregs_pmu_config = {
> + .sublists = {
> + BASE_SUBLIST,
> + VREGS_SUBLIST,
> + PMU_SUBLIST,
> + {0},
> + },
> +};
>  static struct vcpu_config sve_config = {
>   .sublists = {
>   BASE_SUBLIST,
> @@ -1034

Re: [PATCH v5 6/6] KVM: arm64: Distinguish cases of memcache allocations completely

2021-06-02 Thread Quentin Perret
On Thursday 15 Apr 2021 at 19:50:32 (+0800), Yanan Wang wrote:
> With a guest translation fault, the memcache pages are not needed if KVM
> is only about to install a new leaf entry into the existing page table.
> And with a guest permission fault, the memcache pages are also not needed
> for a write fault during dirty logging if KVM is only about to update
> the existing leaf entry instead of collapsing a block entry into a table.
> 
> By comparing fault_granule and vma_pagesize, cases that require allocations
> from memcache and cases that don't can be distinguished completely.
> 
> Signed-off-by: Yanan Wang 
> ---
>  arch/arm64/kvm/mmu.c | 25 -
>  1 file changed, 12 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index aa536392b308..9e35aa5d29f2 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -895,19 +895,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
> phys_addr_t fault_ipa,
>   gfn = fault_ipa >> PAGE_SHIFT;
>   mmap_read_unlock(current->mm);
>  
> - /*
> -  * Permission faults just need to update the existing leaf entry,
> -  * and so normally don't require allocations from the memcache. The
> -  * only exception to this is when dirty logging is enabled at runtime
> -  * and a write fault needs to collapse a block entry into a table.
> -  */
> - if (fault_status != FSC_PERM || (logging_active && write_fault)) {
> - ret = kvm_mmu_topup_memory_cache(memcache,
> -  kvm_mmu_cache_min_pages(kvm));
> - if (ret)
> - return ret;
> - }
> -
>   mmu_seq = vcpu->kvm->mmu_notifier_seq;
>   /*
>* Ensure the read of mmu_notifier_seq happens before we call
> @@ -970,6 +957,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
> phys_addr_t fault_ipa,
>   else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
>   prot |= KVM_PGTABLE_PROT_X;
>  
> + /*
> +  * Allocations from the memcache are required only when the granule of
> +  * the lookup level where the guest fault happened exceeds vma_pagesize,
> +  * which means new page tables will be created in the fault handlers.
> +  */
> + if (fault_granule > vma_pagesize) {
> + ret = kvm_mmu_topup_memory_cache(memcache,
> +  kvm_mmu_cache_min_pages(kvm));
> + if (ret)
> + return ret;
> + }

You're now doing the top-up in the kvm->mmu_lock critical section. Isn't
this more or less what we try to avoid by using a memory cache?

Thanks,
Quentin
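
For readers unfamiliar with the pattern being referred to: the point of
such a cache is to do any allocation that may sleep before entering the
critical section, and to only consume pre-allocated objects while the
lock is held. A purely illustrative userspace analogue (not the KVM
code):

#include <pthread.h>
#include <stdlib.h>

#define CACHE_MIN 4

struct obj_cache {
	void *objs[16];
	int nr;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* May allocate (and, in a kernel, sleep): call with the lock NOT held. */
static int cache_topup(struct obj_cache *c, int min)
{
	while (c->nr < min) {
		void *p = malloc(64);

		if (!p)
			return -1;
		c->objs[c->nr++] = p;
	}
	return 0;
}

/* Never allocates: safe to call with the lock held. */
static void *cache_pop(struct obj_cache *c)
{
	return c->nr ? c->objs[--c->nr] : NULL;
}

int main(void)
{
	struct obj_cache cache = { .nr = 0 };
	void *obj;

	if (cache_topup(&cache, CACHE_MIN))
		return 1;

	pthread_mutex_lock(&lock);
	obj = cache_pop(&cache);	/* draw from the cache, no malloc here */
	pthread_mutex_unlock(&lock);

	free(obj);
	while (cache.nr)
		free(cache.objs[--cache.nr]);
	return 0;
}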


Re: [PATCH v5 5/6] KVM: arm64: Move I-cache flush to the fault handlers

2021-06-02 Thread Quentin Perret
On Thursday 15 Apr 2021 at 19:50:31 (+0800), Yanan Wang wrote:
> In this patch, we move invalidation of I-cache to the fault handlers to

Nit: please avoid using 'This patch' in commit messages, see
Documentation/process/submitting-patches.rst.

> avoid unnecessary I-cache maintenances. On the map path, invalidate the
> I-cache if we are going to create an executable stage-2 mapping for guest.
> And on the permission path, invalidate the I-cache if we are going to add
> an executable permission to the existing guest stage-2 mapping.
> 
> Signed-off-by: Yanan Wang 
> ---
>  arch/arm64/include/asm/kvm_mmu.h | 15 --
>  arch/arm64/kvm/hyp/pgtable.c | 35 +++-
>  arch/arm64/kvm/mmu.c |  9 +---
>  3 files changed, 35 insertions(+), 24 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_mmu.h 
> b/arch/arm64/include/asm/kvm_mmu.h
> index e9b163c5f023..155492fe5b15 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -187,21 +187,6 @@ static inline bool vcpu_has_cache_enabled(struct 
> kvm_vcpu *vcpu)
>   return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
>  }
>  
> -static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
> -   unsigned long size)
> -{
> - if (icache_is_aliasing()) {
> - /* any kind of VIPT cache */
> - __flush_icache_all();
> - } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {
> - /* PIPT or VPIPT at EL2 (see comment in 
> __kvm_tlb_flush_vmid_ipa) */
> - void *va = page_address(pfn_to_page(pfn));
> -
> - invalidate_icache_range((unsigned long)va,
> - (unsigned long)va + size);
> - }
> -}
> -
>  void kvm_set_way_flush(struct kvm_vcpu *vcpu);
>  void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
>  
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index b480f6d1171e..9f4429d80df0 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -568,6 +568,26 @@ static bool stage2_pte_cacheable(struct kvm_pgtable 
> *pgt, kvm_pte_t pte)
>   return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
>  }
>  
> +static bool stage2_pte_executable(kvm_pte_t pte)
> +{
> + return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
> +}
> +
> +static void stage2_invalidate_icache(void *addr, u64 size)
> +{
> + if (icache_is_aliasing()) {
> + /* Any kind of VIPT cache */
> + __flush_icache_all();
> + } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {


> + /*
> +  * See comment in __kvm_tlb_flush_vmid_ipa().
> +  * Invalidate PIPT, or VPIPT at EL2.
> +  */
> + invalidate_icache_range((unsigned long)addr,
> + (unsigned long)addr + size);
> + }
> +}
> +
>  static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr,
>  u32 level, struct kvm_pgtable_mm_ops *mm_ops)
>  {
> @@ -618,6 +638,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, 
> u32 level,
>   if (stage2_pte_cacheable(pgt, new) && !stage2_has_fwb(pgt))
>   __flush_dcache_area(mm_ops->phys_to_virt(phys),
>   granule);
> +
> + if (stage2_pte_executable(new))
> + stage2_invalidate_icache(mm_ops->phys_to_virt(phys),
> +  granule);
>   }
>  
>   smp_store_release(ptep, new);
> @@ -896,8 +920,17 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 
> level, kvm_pte_t *ptep,
>* but worst-case the access flag update gets lost and will be
>* set on the next access instead.
>*/
> - if (data->pte != pte)
> + if (data->pte != pte) {
> + /*
> +  * Invalidate the instruction cache before updating
> +  * if we are going to add the executable permission
> +  * for the guest stage-2 PTE.
> +  */
> + if (!stage2_pte_executable(*ptep) && stage2_pte_executable(pte))
> + stage2_invalidate_icache(kvm_pte_follow(pte, 
> data->mm_ops),
> +  kvm_granule_size(level));
>   WRITE_ONCE(*ptep, pte);
> + }

As for the dcache stuff, it seems like this would be best placed in an
optional mm_ops callback, and have the kernel implement it.

Thanks,
Quentin
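
For illustration only (hypothetical structure and names, not the actual
kvm_pgtable_mm_ops definition), the kind of optional hook being
suggested could look roughly like this: the page-table code calls the
callback only if the backend provides one, and the host kernel supplies
the real I-cache maintenance.

#include <stdio.h>

struct mm_ops_sketch {
	/*
	 * Optional hook: invalidate the I-cache for a range about to
	 * become executable at stage-2. NULL means nothing to do.
	 */
	void (*icache_inval)(void *addr, unsigned long size);
};

static void map_executable_range(const struct mm_ops_sketch *ops,
				 void *addr, unsigned long size)
{
	if (ops->icache_inval)
		ops->icache_inval(addr, size);
	printf("mapped %lu bytes at %p as executable\n", size, addr);
}

static void host_icache_inval(void *addr, unsigned long size)
{
	printf("icache invalidate: %p + %lu\n", addr, size);
}

int main(void)
{
	struct mm_ops_sketch host_ops = { .icache_inval = host_icache_inval };
	struct mm_ops_sketch bare_ops = { .icache_inval = NULL };
	char buf[64];

	map_executable_range(&host_ops, buf, sizeof(buf));
	map_executable_range(&bare_ops, buf, sizeof(buf));
	return 0;
}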


Re: [PATCH v5 2/6] KVM: arm64: Move D-cache flush to the fault handlers

2021-06-02 Thread Quentin Perret
On Wednesday 02 Jun 2021 at 11:19:49 (+0100), Marc Zyngier wrote:
> On Thu, 15 Apr 2021 12:50:28 +0100,
> > @@ -583,6 +589,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 
> > end, u32 level,
> >  {
> > kvm_pte_t new, old = *ptep;
> > u64 granule = kvm_granule_size(level), phys = data->phys;
> > +   struct kvm_pgtable *pgt = data->mmu->pgt;
> > struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
> >  
> > if (!kvm_block_mapping_supported(addr, end, phys, level))
> > @@ -606,6 +613,13 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 
> > end, u32 level,
> > stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
> > }
> >  
> > +   /* Perform CMOs before installation of the guest stage-2 PTE */
> > +   if (pgt->flags & KVM_PGTABLE_S2_GUEST) {
> > +   if (stage2_pte_cacheable(pgt, new) && !stage2_has_fwb(pgt))
> > +   __flush_dcache_area(mm_ops->phys_to_virt(phys),
> > +   granule);
> > +   }
> 
> Rather than this, why not provide new callbacks in mm_ops, even if we
> have to provide one that is specific to guests (and let the protected
> stuff do its own thing)?

Ack, an optional callback in the mm_ops sounds much nicer.

> One thing I really dislike though is that the page-table code is
> starting to be littered with things that are not directly related to
> page tables. We are re-creating the user_mem_abort() mess in a
> different place.

+1, we should probably keep the page-table code as close as possible
to a standalone and architecturally-compliant library as opposed to a
mess of unrelated logic, simply because that will lead to a cleaner and
more maintainable design in the long run, and because that will ease the
interoperability with EL2 in protected mode.

Thanks,
Quentin


Re: [PATCH v5 1/6] KVM: arm64: Introduce KVM_PGTABLE_S2_GUEST stage-2 flag

2021-06-02 Thread Quentin Perret
Hi Yanan,

On Thursday 15 Apr 2021 at 19:50:27 (+0800), Yanan Wang wrote:
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
> b/arch/arm64/include/asm/kvm_pgtable.h
> index c3674c47d48c..a43cbe697b37 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -61,10 +61,12 @@ struct kvm_pgtable_mm_ops {
>   * @KVM_PGTABLE_S2_NOFWB:Don't enforce Normal-WB even if the CPUs have
>   *   ARM64_HAS_STAGE2_FWB.
>   * @KVM_PGTABLE_S2_IDMAP:Only use identity mappings.
> + * @KVM_PGTABLE_S2_GUEST:Whether the page-tables are guest stage-2.
>   */
>  enum kvm_pgtable_stage2_flags {
>   KVM_PGTABLE_S2_NOFWB= BIT(0),
>   KVM_PGTABLE_S2_IDMAP= BIT(1),
> + KVM_PGTABLE_S2_GUEST= BIT(2),

Assuming that we need this flag (maybe not given Marc's suggestion on
another patch), I'd recommend re-naming it to explain _what_ it does,
rather than _who_ is using it.

That's the principle we followed for e.g. KVM_PGTABLE_S2_IDMAP, so we
should be consistent here as well.

Thanks,
Quentin


[PATCH v2 3/7] KVM: arm64: Remove list_head from hyp_page

2021-06-02 Thread Quentin Perret
The list_head member of struct hyp_page is only needed when the page is
attached to a free-list, which by definition implies the page is free.
As such, nothing prevents us from using the page itself to store the
list_head, hence reducing the size of the vmemmap.
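
As a purely illustrative aside (toy userspace code, not the hyp
allocator): the trick is that a free page's own memory holds the list
node, so the per-page metadata entry needs no list field.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TOY_PAGE_SIZE 4096

struct node { struct node *prev, *next; };

static struct node free_list = { &free_list, &free_list };

static void toy_free_page(void *page)
{
	struct node *n = page;		/* reuse the free page as the node */

	n->next = &free_list;
	n->prev = free_list.prev;
	free_list.prev->next = n;
	free_list.prev = n;
}

static void *toy_alloc_page(void)
{
	struct node *n = free_list.next;

	if (n == &free_list)
		return NULL;		/* free list is empty */
	n->prev->next = n->next;
	n->next->prev = n->prev;
	memset(n, 0, sizeof(*n));	/* clean up the node on the way out */
	return n;
}

int main(void)
{
	void *p = aligned_alloc(TOY_PAGE_SIZE, TOY_PAGE_SIZE);

	if (!p)
		return 1;
	toy_free_page(p);
	printf("reallocated the same page: %s\n",
	       toy_alloc_page() == p ? "yes" : "no");
	free(p);
	return 0;
}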

Signed-off-by: Quentin Perret 
---
 arch/arm64/kvm/hyp/include/nvhe/memory.h |  1 -
 arch/arm64/kvm/hyp/nvhe/page_alloc.c | 39 
 2 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h 
b/arch/arm64/kvm/hyp/include/nvhe/memory.h
index fd78bde939ee..7691ab495eb4 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/memory.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h
@@ -12,7 +12,6 @@ struct hyp_page {
unsigned int refcount;
unsigned int order;
struct hyp_pool *pool;
-   struct list_head node;
 };
 
 extern u64 __hyp_vmemmap;
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c 
b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
index 2602577daa00..34f0eb026dd2 100644
--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -62,6 +62,34 @@ static struct hyp_page *__find_buddy_avail(struct hyp_pool 
*pool,
 
 }
 
+/*
+ * Pages that are available for allocation are tracked in free-lists, so we use
+ * the pages themselves to store the list nodes to avoid wasting space. As the
+ * allocator always returns zeroed pages (which are zeroed on the hyp_put_page()
+ * path to optimize allocation speed), we also need to clean-up the list node in
+ * each page when we take it out of the list.
+ */
+static inline void page_remove_from_list(struct hyp_page *p)
+{
+   struct list_head *node = hyp_page_to_virt(p);
+
+   __list_del_entry(node);
+   memset(node, 0, sizeof(*node));
+}
+
+static inline void page_add_to_list(struct hyp_page *p, struct list_head *head)
+{
+   struct list_head *node = hyp_page_to_virt(p);
+
+   INIT_LIST_HEAD(node);
+   list_add_tail(node, head);
+}
+
+static inline struct hyp_page *node_to_page(struct list_head *node)
+{
+   return hyp_virt_to_page(node);
+}
+
 static void __hyp_attach_page(struct hyp_pool *pool,
  struct hyp_page *p)
 {
@@ -83,14 +111,14 @@ static void __hyp_attach_page(struct hyp_pool *pool,
break;
 
/* Take the buddy out of its list, and coallesce with @p */
-   list_del_init(&buddy->node);
+   page_remove_from_list(buddy);
buddy->order = HYP_NO_ORDER;
p = min(p, buddy);
}
 
/* Mark the new head, and insert it */
p->order = order;
-   list_add_tail(&p->node, &pool->free_area[order]);
+   page_add_to_list(p, &pool->free_area[order]);
 }
 
 static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
@@ -99,7 +127,7 @@ static struct hyp_page *__hyp_extract_page(struct hyp_pool 
*pool,
 {
struct hyp_page *buddy;
 
-   list_del_init(&p->node);
+   page_remove_from_list(p);
while (p->order > order) {
/*
 * The buddy of order n - 1 currently has HYP_NO_ORDER as it
@@ -110,7 +138,7 @@ static struct hyp_page *__hyp_extract_page(struct hyp_pool 
*pool,
p->order--;
buddy = __find_buddy_nocheck(pool, p, p->order);
buddy->order = p->order;
-   list_add_tail(&buddy->node, &pool->free_area[buddy->order]);
+   page_add_to_list(buddy, &pool->free_area[buddy->order]);
}
 
return p;
@@ -182,7 +210,7 @@ void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int 
order)
}
 
/* Extract it from the tree at the right order */
-   p = list_first_entry(&pool->free_area[i], struct hyp_page, node);
+   p = node_to_page(pool->free_area[i].next);
p = __hyp_extract_page(pool, p, order);
 
hyp_set_page_refcounted(p);
@@ -210,7 +238,6 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned 
int nr_pages,
for (i = 0; i < nr_pages; i++) {
p[i].pool = pool;
p[i].order = 0;
-   INIT_LIST_HEAD(&p[i].node);
hyp_set_page_refcounted(&p[i]);
}
 
-- 
2.32.0.rc0.204.g9fa02ecfa5-goog



[PATCH v2 6/7] KVM: arm64: Use less bits for hyp_page order

2021-06-02 Thread Quentin Perret
The hyp_page order is currently encoded in 4 bytes even though it is
guaranteed to be smaller than this. Make it 2 bytes to reduce the hyp
vmemmap overhead.

Signed-off-by: Quentin Perret 
---
 arch/arm64/kvm/hyp/include/nvhe/gfp.h|  6 +++---
 arch/arm64/kvm/hyp/include/nvhe/memory.h |  2 +-
 arch/arm64/kvm/hyp/nvhe/page_alloc.c | 12 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h 
b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
index 3ea7bfb6c380..fb0f523d1492 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
@@ -7,7 +7,7 @@
 #include 
 #include 
 
-#define HYP_NO_ORDER   UINT_MAX
+#define HYP_NO_ORDER   USHRT_MAX
 
 struct hyp_pool {
/*
@@ -19,11 +19,11 @@ struct hyp_pool {
struct list_head free_area[MAX_ORDER];
phys_addr_t range_start;
phys_addr_t range_end;
-   unsigned int max_order;
+   unsigned short max_order;
 };
 
 /* Allocation */
-void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order);
+void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order);
 void hyp_get_page(struct hyp_pool *pool, void *addr);
 void hyp_put_page(struct hyp_pool *pool, void *addr);
 
diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h 
b/arch/arm64/kvm/hyp/include/nvhe/memory.h
index 991636be2f46..3fe34fa30ea4 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/memory.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h
@@ -9,7 +9,7 @@
 
 struct hyp_page {
unsigned int refcount;
-   unsigned int order;
+   unsigned short order;
 };
 
 extern u64 __hyp_vmemmap;
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c 
b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
index e3689def7033..be07055bbc10 100644
--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -32,7 +32,7 @@ u64 __hyp_vmemmap;
  */
 static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool,
 struct hyp_page *p,
-unsigned int order)
+unsigned short order)
 {
phys_addr_t addr = hyp_page_to_phys(p);
 
@@ -51,7 +51,7 @@ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool 
*pool,
 /* Find a buddy page currently available for allocation */
 static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool,
   struct hyp_page *p,
-  unsigned int order)
+  unsigned short order)
 {
struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order);
 
@@ -93,7 +93,7 @@ static inline struct hyp_page *node_to_page(struct list_head 
*node)
 static void __hyp_attach_page(struct hyp_pool *pool,
  struct hyp_page *p)
 {
-   unsigned int order = p->order;
+   unsigned short order = p->order;
struct hyp_page *buddy;
 
memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order);
@@ -123,7 +123,7 @@ static void __hyp_attach_page(struct hyp_pool *pool,
 
 static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
   struct hyp_page *p,
-  unsigned int order)
+  unsigned short order)
 {
struct hyp_page *buddy;
 
@@ -192,9 +192,9 @@ void hyp_get_page(struct hyp_pool *pool, void *addr)
hyp_spin_unlock(&pool->lock);
 }
 
-void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order)
+void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order)
 {
-   unsigned int i = order;
+   unsigned short i = order;
struct hyp_page *p;
 
hyp_spin_lock(&pool->lock);
-- 
2.32.0.rc0.204.g9fa02ecfa5-goog



[PATCH v2 5/7] KVM: arm64: Remove hyp_pool pointer from struct hyp_page

2021-06-02 Thread Quentin Perret
Each struct hyp_page currently contains a pointer to a hyp_pool struct
where the page should be freed if its refcount reaches 0. However, this
information can always be inferred from the context in the EL2 code, so
drop the pointer to save a few bytes in the vmemmap.

Signed-off-by: Quentin Perret 
---
 arch/arm64/kvm/hyp/include/nvhe/gfp.h|  4 ++--
 arch/arm64/kvm/hyp/include/nvhe/memory.h |  2 --
 arch/arm64/kvm/hyp/nvhe/mem_protect.c| 14 --
 arch/arm64/kvm/hyp/nvhe/page_alloc.c |  7 ++-
 arch/arm64/kvm/hyp/nvhe/setup.c  | 14 --
 5 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h 
b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
index f2c84e4fa40f..3ea7bfb6c380 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
@@ -24,8 +24,8 @@ struct hyp_pool {
 
 /* Allocation */
 void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order);
-void hyp_get_page(void *addr);
-void hyp_put_page(void *addr);
+void hyp_get_page(struct hyp_pool *pool, void *addr);
+void hyp_put_page(struct hyp_pool *pool, void *addr);
 
 /* Used pages cannot be freed */
 int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h 
b/arch/arm64/kvm/hyp/include/nvhe/memory.h
index 7691ab495eb4..991636be2f46 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/memory.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h
@@ -7,11 +7,9 @@
 
 #include 
 
-struct hyp_pool;
 struct hyp_page {
unsigned int refcount;
unsigned int order;
-   struct hyp_pool *pool;
 };
 
 extern u64 __hyp_vmemmap;
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c 
b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index fdd5b5702e8a..7727252890d8 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -43,6 +43,16 @@ static void *host_s2_zalloc_page(void *pool)
return hyp_alloc_pages(pool, 0);
 }
 
+static void host_s2_get_page(void *addr)
+{
+   hyp_get_page(&host_s2_pool, addr);
+}
+
+static void host_s2_put_page(void *addr)
+{
+   hyp_put_page(&host_s2_pool, addr);
+}
+
 static int prepare_s2_pool(void *pgt_pool_base)
 {
unsigned long nr_pages, pfn;
@@ -60,8 +70,8 @@ static int prepare_s2_pool(void *pgt_pool_base)
.phys_to_virt = hyp_phys_to_virt,
.virt_to_phys = hyp_virt_to_phys,
.page_count = hyp_page_count,
-   .get_page = hyp_get_page,
-   .put_page = hyp_put_page,
+   .get_page = host_s2_get_page,
+   .put_page = host_s2_put_page,
};
 
return 0;
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c 
b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
index 34f0eb026dd2..e3689def7033 100644
--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -174,20 +174,18 @@ static void __hyp_put_page(struct hyp_pool *pool, struct 
hyp_page *p)
  * section to guarantee transient states (e.g. a page with null refcount but
  * not yet attached to a free list) can't be observed by well-behaved readers.
  */
-void hyp_put_page(void *addr)
+void hyp_put_page(struct hyp_pool *pool, void *addr)
 {
struct hyp_page *p = hyp_virt_to_page(addr);
-   struct hyp_pool *pool = hyp_page_to_pool(p);
 
hyp_spin_lock(&pool->lock);
__hyp_put_page(pool, p);
hyp_spin_unlock(&pool->lock);
 }
 
-void hyp_get_page(void *addr)
+void hyp_get_page(struct hyp_pool *pool, void *addr)
 {
struct hyp_page *p = hyp_virt_to_page(addr);
-   struct hyp_pool *pool = hyp_page_to_pool(p);
 
hyp_spin_lock(&pool->lock);
hyp_page_ref_inc(p);
@@ -236,7 +234,6 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned 
int nr_pages,
/* Init the vmemmap portion */
p = hyp_phys_to_page(phys);
for (i = 0; i < nr_pages; i++) {
-   p[i].pool = pool;
p[i].order = 0;
hyp_set_page_refcounted(&p[i]);
}
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index 709cb3d19eb7..dee099871865 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -137,6 +137,16 @@ static void *hyp_zalloc_hyp_page(void *arg)
return hyp_alloc_pages(&hpool, 0);
 }
 
+static void hpool_get_page(void *addr)
+{
+   hyp_get_page(&hpool, addr);
+}
+
+static void hpool_put_page(void *addr)
+{
+   hyp_put_page(&hpool, addr);
+}
+
 void __noreturn __pkvm_init_finalise(void)
 {
struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
@@ -160,8 +170,8 @@ void __noreturn __pkvm_init_finalise(void)
.zalloc_page = hyp_zalloc_hyp_page,
.phys_to_virt = hyp_phys_to_virt,
.virt_to_phys = hyp_virt_to_phys,
-   .get_page = hyp_get_page,
-   .put_page = hyp_put_page,
+   .

[PATCH v2 4/7] KVM: arm64: Unify MMIO and mem host stage-2 pools

2021-06-02 Thread Quentin Perret
We currently maintain two separate memory pools for the host stage-2,
one for pages used in the page-table when mapping memory regions, and
the other to map MMIO regions. The former is large enough to map all of
memory with page granularity and the latter can cover an arbitrary
portion of IPA space, but allows pages to be 'recycled'.

However, this split makes accounting difficult to manage as pages at
intermediate levels of the page-table may be used to map both memory and
MMIO regions. Simplify the scheme by merging both pools into one. This
means we can now hit the -ENOMEM case in the memory abort path, but
we're still guaranteed forward-progress in the worst case by unmapping
MMIO regions. On the plus side this also means we can usually map a lot
more MMIO space at once if memory ranges happen to be mapped with block
mappings.

Signed-off-by: Quentin Perret 
---
 arch/arm64/kvm/hyp/include/nvhe/mem_protect.h |  2 +-
 arch/arm64/kvm/hyp/include/nvhe/mm.h  | 13 +++---
 arch/arm64/kvm/hyp/nvhe/mem_protect.c | 46 ---
 arch/arm64/kvm/hyp/nvhe/setup.c   | 16 ++-
 arch/arm64/kvm/hyp/reserved_mem.c |  3 +-
 5 files changed, 32 insertions(+), 48 deletions(-)

diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h 
b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index 42d81ec739fa..9c227d87c36d 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -23,7 +23,7 @@ extern struct host_kvm host_kvm;
 int __pkvm_prot_finalize(void);
 int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end);
 
-int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool);
+int kvm_host_prepare_stage2(void *pgt_pool_base);
 void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
 
 static __always_inline void __load_host_stage2(void)
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h 
b/arch/arm64/kvm/hyp/include/nvhe/mm.h
index 0095f6289742..8ec3a5a7744b 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h
@@ -78,19 +78,20 @@ static inline unsigned long hyp_s1_pgtable_pages(void)
return res;
 }
 
-static inline unsigned long host_s2_mem_pgtable_pages(void)
+static inline unsigned long host_s2_pgtable_pages(void)
 {
+   unsigned long res;
+
/*
 * Include an extra 16 pages to safely upper-bound the worst case of
 * concatenated pgds.
 */
-   return __hyp_pgtable_total_pages() + 16;
-}
+   res = __hyp_pgtable_total_pages() + 16;
 
-static inline unsigned long host_s2_dev_pgtable_pages(void)
-{
/* Allow 1 GiB for MMIO mappings */
-   return __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);
+   res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);
+
+   return res;
 }
 
 #endif /* __KVM_HYP_MM_H */
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c 
b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index e342f7f4f4fb..fdd5b5702e8a 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -23,8 +23,7 @@
 extern unsigned long hyp_nr_cpus;
 struct host_kvm host_kvm;
 
-struct hyp_pool host_s2_mem;
-struct hyp_pool host_s2_dev;
+struct hyp_pool host_s2_pool;
 
 /*
  * Copies of the host's CPU features registers holding sanitized values.
@@ -36,7 +35,7 @@ static const u8 pkvm_hyp_id = 1;
 
 static void *host_s2_zalloc_pages_exact(size_t size)
 {
-   return hyp_alloc_pages(&host_s2_mem, get_order(size));
+   return hyp_alloc_pages(&host_s2_pool, get_order(size));
 }
 
 static void *host_s2_zalloc_page(void *pool)
@@ -44,20 +43,14 @@ static void *host_s2_zalloc_page(void *pool)
return hyp_alloc_pages(pool, 0);
 }
 
-static int prepare_s2_pools(void *mem_pgt_pool, void *dev_pgt_pool)
+static int prepare_s2_pool(void *pgt_pool_base)
 {
unsigned long nr_pages, pfn;
int ret;
 
-   pfn = hyp_virt_to_pfn(mem_pgt_pool);
-   nr_pages = host_s2_mem_pgtable_pages();
-   ret = hyp_pool_init(&host_s2_mem, pfn, nr_pages, 0);
-   if (ret)
-   return ret;
-
-   pfn = hyp_virt_to_pfn(dev_pgt_pool);
-   nr_pages = host_s2_dev_pgtable_pages();
-   ret = hyp_pool_init(&host_s2_dev, pfn, nr_pages, 0);
+   pfn = hyp_virt_to_pfn(pgt_pool_base);
+   nr_pages = host_s2_pgtable_pages();
+   ret = hyp_pool_init(&host_s2_pool, pfn, nr_pages, 0);
if (ret)
return ret;
 
@@ -86,7 +79,7 @@ static void prepare_host_vtcr(void)
  id_aa64mmfr1_el1_sys_val, phys_shift);
 }
 
-int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool)
+int kvm_host_prepare_stage2(void *pgt_pool_base)
 {
struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
int ret;
@@ -94,7 +87,7 @@ int kvm_host_prepare_stage2(void *mem_pgt_pool, void 
*dev_pgt_pool)
prepare_host_vtcr();
hyp_spin_lock_init(&host_kvm.lock);
 
-   ret = prepare_s2_pools(mem_pgt_pool, dev_pgt

[PATCH v2 1/7] KVM: arm64: Move hyp_pool locking out of refcount helpers

2021-06-02 Thread Quentin Perret
The hyp_page refcount helpers currently rely on the hyp_pool lock for
serialization. However, this means the refcounts can't be changed from
the buddy allocator core as it already holds the lock, which means pages
have to go through odd transient states.

For example, when a page is freed, its refcount is set to 0, and the
lock is transiently released before the page can be attached to a free
list in the buddy tree. This is currently harmless as the allocator
checks the list node of each page to see if it is available for
allocation or not, but it means the page refcount can't be trusted to
represent the state of the page even if the pool lock is held.

In order to fix this, remove the pool locking from the refcount helpers,
and move all the logic to the buddy allocator. This will simplify the
removal of the list node from struct hyp_page in a later patch.
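
As a purely illustrative aside (toy userspace code, not the hyp
allocator): the rule being established is that the refcount update and
any resulting free-list update happen under the same lock, in the same
critical section, so a page can never be observed with a zero refcount
while not yet attached to a free list.

#include <pthread.h>
#include <stdio.h>

struct toy_page {
	int refcount;
	int on_free_list;
};

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

static void toy_get_page(struct toy_page *p)
{
	pthread_mutex_lock(&pool_lock);
	p->refcount++;
	pthread_mutex_unlock(&pool_lock);
}

static void toy_put_page(struct toy_page *p)
{
	pthread_mutex_lock(&pool_lock);
	if (--p->refcount == 0)
		p->on_free_list = 1;	/* "attach" in the same critical section */
	pthread_mutex_unlock(&pool_lock);
}

int main(void)
{
	struct toy_page pg = { .refcount = 1, .on_free_list = 0 };

	toy_get_page(&pg);
	toy_put_page(&pg);
	toy_put_page(&pg);
	printf("refcount=%d on_free_list=%d\n", pg.refcount, pg.on_free_list);
	return 0;
}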

Signed-off-by: Quentin Perret 
---
 arch/arm64/kvm/hyp/include/nvhe/gfp.h | 35 --
 arch/arm64/kvm/hyp/nvhe/page_alloc.c  | 43 ---
 2 files changed, 32 insertions(+), 46 deletions(-)

diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h 
b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
index 18a4494337bd..f2c84e4fa40f 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
@@ -22,41 +22,6 @@ struct hyp_pool {
unsigned int max_order;
 };
 
-static inline void hyp_page_ref_inc(struct hyp_page *p)
-{
-   struct hyp_pool *pool = hyp_page_to_pool(p);
-
-   hyp_spin_lock(&pool->lock);
-   p->refcount++;
-   hyp_spin_unlock(&pool->lock);
-}
-
-static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
-{
-   struct hyp_pool *pool = hyp_page_to_pool(p);
-   int ret;
-
-   hyp_spin_lock(&pool->lock);
-   p->refcount--;
-   ret = (p->refcount == 0);
-   hyp_spin_unlock(&pool->lock);
-
-   return ret;
-}
-
-static inline void hyp_set_page_refcounted(struct hyp_page *p)
-{
-   struct hyp_pool *pool = hyp_page_to_pool(p);
-
-   hyp_spin_lock(&pool->lock);
-   if (p->refcount) {
-   hyp_spin_unlock(&pool->lock);
-   BUG();
-   }
-   p->refcount = 1;
-   hyp_spin_unlock(&pool->lock);
-}
-
 /* Allocation */
 void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order);
 void hyp_get_page(void *addr);
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c 
b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
index 237e03bf0cb1..d666f4789e31 100644
--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -93,15 +93,6 @@ static void __hyp_attach_page(struct hyp_pool *pool,
list_add_tail(&p->node, &pool->free_area[order]);
 }
 
-static void hyp_attach_page(struct hyp_page *p)
-{
-   struct hyp_pool *pool = hyp_page_to_pool(p);
-
-   hyp_spin_lock(&pool->lock);
-   __hyp_attach_page(pool, p);
-   hyp_spin_unlock(&pool->lock);
-}
-
 static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
   struct hyp_page *p,
   unsigned int order)
@@ -125,19 +116,49 @@ static struct hyp_page *__hyp_extract_page(struct 
hyp_pool *pool,
return p;
 }
 
+static inline void hyp_page_ref_inc(struct hyp_page *p)
+{
+   p->refcount++;
+}
+
+static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
+{
+   p->refcount--;
+   return (p->refcount == 0);
+}
+
+static inline void hyp_set_page_refcounted(struct hyp_page *p)
+{
+   BUG_ON(p->refcount);
+   p->refcount = 1;
+}
+
+/*
+ * Changes to the buddy tree and page refcounts must be done with the hyp_pool
+ * lock held. If a refcount change requires an update to the buddy tree (e.g.
+ * hyp_put_page()), both operations must be done within the same critical
+ * section to guarantee transient states (e.g. a page with null refcount but
+ * not yet attached to a free list) can't be observed by well-behaved readers.
+ */
 void hyp_put_page(void *addr)
 {
struct hyp_page *p = hyp_virt_to_page(addr);
+   struct hyp_pool *pool = hyp_page_to_pool(p);
 
+   hyp_spin_lock(&pool->lock);
if (hyp_page_ref_dec_and_test(p))
-   hyp_attach_page(p);
+   __hyp_attach_page(pool, p);
+   hyp_spin_unlock(&pool->lock);
 }
 
 void hyp_get_page(void *addr)
 {
struct hyp_page *p = hyp_virt_to_page(addr);
+   struct hyp_pool *pool = hyp_page_to_pool(p);
 
+   hyp_spin_lock(&pool->lock);
hyp_page_ref_inc(p);
+   hyp_spin_unlock(&pool->lock);
 }
 
 void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order)
@@ -159,8 +180,8 @@ void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int 
order)
p = list_first_entry(&pool->free_area[i], struct hyp_page, node);
p = __hyp_extract_page(pool, p, order);
 
-   hyp_spin_unlock(&pool->lock);
hyp_set_page_refcounted(p);
+   hyp_spin_unlock(&pool->lock);
 
return hyp_page_to_virt(p);

[PATCH v2 7/7] KVM: arm64: Use less bits for hyp_page refcount

2021-06-02 Thread Quentin Perret
The hyp_page refcount is currently encoded in 4 bytes even though we
never need to count that many objects in a page. Make it 2 bytes to save
some space in the vmemmap.

As overflows are more likely to happen as well, make sure to catch those
with a BUG in the increment function.
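
Together with the earlier switch of 'order' to an unsigned short, this
halves the per-page metadata. A trivial stand-alone check of the saving
(struct names here are illustrative, not the hyp code):

#include <stdio.h>

struct hyp_page_before { unsigned int refcount; unsigned int order; };
struct hyp_page_after { unsigned short refcount; unsigned short order; };

int main(void)
{
	printf("before: %zu bytes per page, after: %zu bytes per page\n",
	       sizeof(struct hyp_page_before),
	       sizeof(struct hyp_page_after));
	return 0;
}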

Signed-off-by: Quentin Perret 
---
 arch/arm64/kvm/hyp/include/nvhe/memory.h | 2 +-
 arch/arm64/kvm/hyp/nvhe/page_alloc.c | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h 
b/arch/arm64/kvm/hyp/include/nvhe/memory.h
index 3fe34fa30ea4..592b7edb3edb 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/memory.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h
@@ -8,7 +8,7 @@
 #include 
 
 struct hyp_page {
-   unsigned int refcount;
+   unsigned short refcount;
unsigned short order;
 };
 
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c 
b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
index be07055bbc10..41fc25bdfb34 100644
--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -146,6 +146,7 @@ static struct hyp_page *__hyp_extract_page(struct hyp_pool 
*pool,
 
 static inline void hyp_page_ref_inc(struct hyp_page *p)
 {
+   BUG_ON(p->refcount == USHRT_MAX);
p->refcount++;
 }
 
-- 
2.32.0.rc0.204.g9fa02ecfa5-goog



[PATCH v2 0/7] KVM: arm64: Reduce hyp_vmemmap overhead

2021-06-02 Thread Quentin Perret
Hi all,

This is a v2 of the patch series previously posted here:

  https://lore.kernel.org/r/20210527125134.2116404-1-qper...@google.com/

Please refer to the cover letter of v1 for the context and motivation
behind the series.

Changes since v1:
 - Added comment/doc in page_alloc.c explaining the locking expectations;
 - Removed unnecessary casts in the 'page-to-list-node' helpers;
 - A few other cosmetic changes in the allocator.

Thanks,
Quentin

Quentin Perret (7):
  KVM: arm64: Move hyp_pool locking out of refcount helpers
  KVM: arm64: Use refcount at hyp to check page availability
  KVM: arm64: Remove list_head from hyp_page
  KVM: arm64: Unify MMIO and mem host stage-2 pools
  KVM: arm64: Remove hyp_pool pointer from struct hyp_page
  KVM: arm64: Use less bits for hyp_page order
  KVM: arm64: Use less bits for hyp_page refcount

 arch/arm64/kvm/hyp/include/nvhe/gfp.h |  45 +--
 arch/arm64/kvm/hyp/include/nvhe/mem_protect.h |   2 +-
 arch/arm64/kvm/hyp/include/nvhe/memory.h  |   7 +-
 arch/arm64/kvm/hyp/include/nvhe/mm.h  |  13 +-
 arch/arm64/kvm/hyp/nvhe/mem_protect.c |  60 +-
 arch/arm64/kvm/hyp/nvhe/page_alloc.c  | 112 +-
 arch/arm64/kvm/hyp/nvhe/setup.c   |  30 +++--
 arch/arm64/kvm/hyp/reserved_mem.c |   3 +-
 8 files changed, 145 insertions(+), 127 deletions(-)

-- 
2.32.0.rc0.204.g9fa02ecfa5-goog



[PATCH v2 2/7] KVM: arm64: Use refcount at hyp to check page availability

2021-06-02 Thread Quentin Perret
The hyp buddy allocator currently checks the struct hyp_page list node
to see if a page is available for allocation or not when trying to
coalesce memory. Now that decrementing the refcount and attaching to
the buddy tree is done in the same critical section, we can rely on the
refcount of the buddy page being in sync, which allows the list node
check to be replaced by a refcount check. This will ease removing the list
node from struct hyp_page later on.

Signed-off-by: Quentin Perret 
---
 arch/arm64/kvm/hyp/nvhe/page_alloc.c | 16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c 
b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
index d666f4789e31..2602577daa00 100644
--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -55,7 +55,7 @@ static struct hyp_page *__find_buddy_avail(struct hyp_pool 
*pool,
 {
struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order);
 
-   if (!buddy || buddy->order != order || list_empty(&buddy->node))
+   if (!buddy || buddy->order != order || buddy->refcount)
return NULL;
 
return buddy;
@@ -133,6 +133,12 @@ static inline void hyp_set_page_refcounted(struct hyp_page 
*p)
p->refcount = 1;
 }
 
+static void __hyp_put_page(struct hyp_pool *pool, struct hyp_page *p)
+{
+   if (hyp_page_ref_dec_and_test(p))
+   __hyp_attach_page(pool, p);
+}
+
 /*
  * Changes to the buddy tree and page refcounts must be done with the hyp_pool
  * lock held. If a refcount change requires an update to the buddy tree (e.g.
@@ -146,8 +152,7 @@ void hyp_put_page(void *addr)
struct hyp_pool *pool = hyp_page_to_pool(p);
 
hyp_spin_lock(&pool->lock);
-   if (hyp_page_ref_dec_and_test(p))
-   __hyp_attach_page(pool, p);
+   __hyp_put_page(pool, p);
hyp_spin_unlock(&pool->lock);
 }
 
@@ -202,15 +207,16 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
 
/* Init the vmemmap portion */
p = hyp_phys_to_page(phys);
-   memset(p, 0, sizeof(*p) * nr_pages);
for (i = 0; i < nr_pages; i++) {
p[i].pool = pool;
+   p[i].order = 0;
INIT_LIST_HEAD(&p[i].node);
+   hyp_set_page_refcounted(&p[i]);
}
 
/* Attach the unused pages to the buddy tree */
for (i = reserved_pages; i < nr_pages; i++)
-   __hyp_attach_page(pool, &p[i]);
+   __hyp_put_page(pool, &p[i]);
 
return 0;
 }
-- 
2.32.0.rc0.204.g9fa02ecfa5-goog



Re: [PATCH 3/7] KVM: arm64: Remove list_head from hyp_page

2021-06-02 Thread Quentin Perret
On Tuesday 01 Jun 2021 at 18:41:49 (+0100), Marc Zyngier wrote:
> On Tue, 01 Jun 2021 16:48:06 +0100,
> Quentin Perret  wrote:
> > 
> > On Tuesday 01 Jun 2021 at 15:38:22 (+0100), Marc Zyngier wrote:
> > > On Thu, 27 May 2021 13:51:30 +0100,
> > > Quentin Perret  wrote:
> > > > +/*
> > > > + * Pages that are available for allocation are tracked in free-lists, so we use
> > > > + * the pages themselves to store the list nodes to avoid wasting space. As the
> > > > + * allocator always returns zeroed pages (which are zeroed on the hyp_put_page()
> > > > + * path to optimize allocation speed), we also need to clean-up the list node in
> > > > + * each page when we take it out of the list.
> > > > + */
> > > > +static inline void page_remove_from_list(struct hyp_page *p)
> > > > +{
> > > > +   struct list_head *node = (struct list_head *)hyp_page_to_virt(p);
> > > 
> > > Nit: How about changing hyp_page_to_virt() so that it returns a
> > > convenient 'void *', and get rid of the ugly casts?
> > 
> > It should already return void *, but I kind of liked the explicit cast
> > here for documentation purpose. We're turning a 'random' piece of unused
> > memory into a typed object, so that felt like a useful annotation. Happy
> > to get rid of it though.
> 
> My expectations were that using hyp_page_to_virt() already serves as a
> pretty big warning that we're doing something unusual.
> 
> I guess that if we want to be really careful about those, we should
> then be consistent and make it return a uintptr_t (or unsigned long)
> instead, actively making use of the cast, consistently, everywhere.

Right, so I'd prefer keeping it void * for consistency with the EL1
equivalent -- much of this aims to be API compatible with the Linux
stuff. I'll get rid of the cast and post a v2 shortly.
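
For illustration only, a minimal sketch of what the cast-free helper
could look like, assuming hyp_page_to_virt() keeps returning void *;
the node clean-up simply mirrors the comment quoted above, and this is
not the actual v2 being posted:

static inline void page_remove_from_list(struct hyp_page *p)
{
	struct list_head *node = hyp_page_to_virt(p);

	/* Detach the node and re-zero it, so the allocator keeps handing
	 * out zeroed pages as the comment above requires. */
	list_del(node);
	memset(node, 0, sizeof(*node));
}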

Cheers,
Quentin


Re: [PATCH v5 4/6] KVM: arm64: Provide invalidate_icache_range at non-VHE EL2

2021-06-02 Thread Marc Zyngier
On Thu, 15 Apr 2021 12:50:30 +0100,
Yanan Wang  wrote:
> 
> We want to move I-cache maintenance for the guest into the stage-2
> page table code for a performance improvement. Before that can work,
> we first need to make the function invalidate_icache_range available
> to non-VHE EL2, to avoid compile or runtime errors, as pgtable.c is
> now linked into the non-VHE EL2 code for pKVM mode.
> 
> In this patch, we only introduce the symbol invalidate_icache_range,
> with no real functionality, in nvhe/cache.S, because no situation has
> been found so far where I-cache maintenance is also needed at non-VHE
> EL2 in pKVM mode.
> 
> Signed-off-by: Yanan Wang 
> ---
>  arch/arm64/kvm/hyp/nvhe/cache.S | 11 +++
>  1 file changed, 11 insertions(+)
> 
> diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S
> index 36cef6915428..a125ec9aeed2 100644
> --- a/arch/arm64/kvm/hyp/nvhe/cache.S
> +++ b/arch/arm64/kvm/hyp/nvhe/cache.S
> @@ -11,3 +11,14 @@ SYM_FUNC_START_PI(__flush_dcache_area)
>   dcache_by_line_op civac, sy, x0, x1, x2, x3
>   ret
>  SYM_FUNC_END_PI(__flush_dcache_area)
> +
> +/*
> + *   invalidate_icache_range(start,end)
> + *
> + *   Ensure that the I cache is invalid within specified region.
> + *
> + *   - start   - virtual start address of region
> + *   - end - virtual end address of region
> + */
> +SYM_FUNC_START(invalidate_icache_range)
> +SYM_FUNC_END(invalidate_icache_range)

This is a good indication that something is really wrong.

If you were to provide cache management callbacks as part of the
mm_ops themselves (or a similar abstraction), you wouldn't have to do
these things.
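
Roughly this kind of shape, as an illustration only (the member name and
the hook itself are made up here, not an existing interface):

struct kvm_pgtable_mm_ops {
	/* ... existing callbacks (zalloc_page, phys_to_virt, ...) ... */

	/*
	 * Hypothetical callback: I-cache invalidation for a VA range,
	 * provided by whoever instantiates the page-table code. The
	 * nVHE EL2 user would simply leave it NULL until it actually
	 * needs it, instead of carrying an empty
	 * invalidate_icache_range symbol.
	 */
	void	(*icache_inval_pou)(void *addr, size_t size);
};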

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH v5 2/6] KVM: arm64: Move D-cache flush to the fault handlers

2021-06-02 Thread Marc Zyngier
On Thu, 15 Apr 2021 12:50:28 +0100,
Yanan Wang  wrote:
> 
> We currently uniformly perform CMOs on the D-cache and I-cache in
> user_mem_abort() before calling the fault handlers. If we get
> concurrent guest faults (e.g. translation faults, permission faults)
> or some really unnecessary guest faults caused by BBM, the CMOs for
> the first vcpu are necessary while the later ones are not.
> 
> By moving the CMOs to the fault handlers, we can easily identify the
> conditions where they are really needed and avoid the unnecessary
> ones. As performing CMOs is a time-consuming process, especially when
> flushing a block range, this reduces the load on KVM and improves the
> efficiency of the page table code.
> 
> This patch only moves the D-cache clean to the map path, and drops
> the original D-cache maintenance APIs in mmu.c/mmu.h in favour of
> what we already have in pgtable.c. Changes for the I-side will come
> in a later patch.

But this means that until patch #5, this is broken (invalidation on
the i-side will happen before the clean to PoU on the d-side). You
need to keep the series bisectable and not break things in the middle.
It would be OK to have two sets of D-cache CMOs in the interval, for
example.

> 
> Signed-off-by: Yanan Wang 
> ---
>  arch/arm64/include/asm/kvm_mmu.h | 16 
>  arch/arm64/kvm/hyp/pgtable.c | 20 ++--
>  arch/arm64/kvm/mmu.c | 14 +++---
>  3 files changed, 17 insertions(+), 33 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index 25ed956f9af1..e9b163c5f023 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -187,22 +187,6 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
>   return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
>  }
>  
> -static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
> -{
> - void *va = page_address(pfn_to_page(pfn));
> -
> - /*
> -  * With FWB, we ensure that the guest always accesses memory using
> -  * cacheable attributes, and we don't have to clean to PoC when
> -  * faulting in pages. Furthermore, FWB implies IDC, so cleaning to
> -  * PoU is not required either in this case.
> -  */
> - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
> - return;
> -
> - kvm_flush_dcache_to_poc(va, size);
> -}
> -
>  static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
> unsigned long size)
>  {
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index c37c1dc4feaf..e3606c9dcec7 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -562,6 +562,12 @@ static bool stage2_pte_is_counted(kvm_pte_t pte)
>   return !!pte;
>  }
>  
> +static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
> +{
> + u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
> + return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
> +}
> +
>  static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr,
>  u32 level, struct kvm_pgtable_mm_ops *mm_ops)
>  {
> @@ -583,6 +589,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
>  {
>   kvm_pte_t new, old = *ptep;
>   u64 granule = kvm_granule_size(level), phys = data->phys;
> + struct kvm_pgtable *pgt = data->mmu->pgt;
>   struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
>  
>   if (!kvm_block_mapping_supported(addr, end, phys, level))
> @@ -606,6 +613,13 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
>   stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
>   }
>  
> + /* Perform CMOs before installation of the guest stage-2 PTE */
> + if (pgt->flags & KVM_PGTABLE_S2_GUEST) {
> + if (stage2_pte_cacheable(pgt, new) && !stage2_has_fwb(pgt))
> + __flush_dcache_area(mm_ops->phys_to_virt(phys),
> + granule);
> + }

Rather than this, why not provide new callbacks in mm_ops, even if we
have to provide one that is specific to guests (and let the protected
stuff do its own thing)?
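
As an illustration of that shape (the member name below is an
assumption, not an existing field; everything else is taken from the
hunk above):

	/* Hypothetical kvm_pgtable_mm_ops member: */
	void	(*dcache_clean_inval_poc)(void *addr, size_t size);

	/* ... which stage2_map_walker_try_leaf() would call instead of
	 * invoking __flush_dcache_area() directly: */
	if (stage2_pte_cacheable(pgt, new) && !stage2_has_fwb(pgt) &&
	    mm_ops->dcache_clean_inval_poc)
		mm_ops->dcache_clean_inval_poc(mm_ops->phys_to_virt(phys),
					       granule);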

One thing I really dislike though is that the page-table code is
starting to be littered with things that are not directly related to
page tables. We are re-creating the user_mem_abort() mess in a
different place.

> +
>   smp_store_release(ptep, new);
>   if (stage2_pte_is_counted(new))
>   mm_ops->get_page(ptep);
> @@ -798,12 +812,6 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
>   return ret;
>  }
>  
> -static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
> -{
> - u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
> - return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
> -}
> -
>  s

Re: [RFC 1/2] vfio/pci: keep the prefetchable attribute of a BAR region in VMA

2021-06-02 Thread Marc Zyngier
Hi Shanker,

On Sat, 08 May 2021 17:33:11 +0100,
Shanker R Donthineni  wrote:
> 
> Hi Marc,
> 
> On 5/5/21 1:02 PM, Catalin Marinas wrote:
> >>> Will/Catalin, perhaps you could explain your thought process on why you 
> >>> chose
> >>> Normal NC for ioremap_wc on the armv8 linux port instead of Device GRE or 
> >>> other
> >>> Device Gxx.
> >> I think a combination of: compatibility with 32-bit Arm, the need to
> >> support unaligned accesses and the potential for higher performance.
> > IIRC the _wc suffix also matches the pgprot_writecombine() used by some
> > drivers to map a video framebuffer into user space. Accesses to the
> > framebuffer are not guaranteed to be aligned (memset/memcpy don't ensure
> > alignment on arm64 and the user doesn't have a memset_io or memcpy_toio).
> >
> >> Furthermore, ioremap() already gives you a Device memory type, and we're
> >> tight on MAIR space.
> > We have MT_DEVICE_GRE currently reserved though no in-kernel user, we
> > might as well remove it.
> @Marc, could you provide your thoughts/guidance on the next step? The
> proposal of getting hints for prefetchable regions from VFIO/QEMU is
> not recommended, so the only option left is to implement arm64-specific
> logic in KVM.
> 
> Option-1: I think we could take advantage of the stage-1/2 combining
> rules to allow the Normal-NC memory type for device memory in the VM:
> always map device memory at stage-2 as Normal-NC and trust the VM's
> stage-1 memory type.
> 
> ---
> Stage-2 MT    Stage-1 MT    Resultant MT (combining-rules/FWB)
> ---
> Normal-NC     Normal-WT     Normal-NC
>     -         Normal-WB         -
>     -         Normal-NC         -
>     -         Device-       Device-
> ---

I think this is unwise.

Will recently debugged a pretty horrible situation when doing exactly
that: when S1 is off and S2 is on, the I-side is allowed to generate
speculative accesses (see ARMv8 ARM G.a D5.2.9 for the details). And
yes, implementations definitely do that. Add side-effect reads to the
mix, and you're in for a treat.

> We've been using this option internally for testing purposes and have
> validated it with NVMe/Mellanox/GPU pass-through devices on the
> Marvell ThunderX2 platform.

See above. It *will* break eventually.

> Option-2: Get resource properties associated with MMIO using lookup_resource()
> and map at stage-2 as Normal-NC if IORESOURCE_PREFETCH is set in flags.
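
For illustration, the option described above would boil down to
something like the sketch below at fault time (lookup_resource(),
iomem_resource and IORESOURCE_PREFETCH are existing kernel symbols; the
helper itself and its use are hypothetical):

static bool host_mmio_is_prefetchable(kvm_pfn_t pfn)
{
	struct resource *res;

	res = lookup_resource(&iomem_resource,
			      (resource_size_t)pfn << PAGE_SHIFT);

	return res && (res->flags & IORESOURCE_PREFETCH);
}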

That's a pretty roundabout way of doing exactly the same thing you
initially proposed. And it suffers from the exact same problems, which
is that you change the semantics of the mapping without knowing what
the guest's intent is.

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [RFC 1/2] vfio/pci: keep the prefetchable attribute of a BAR region in VMA

2021-06-02 Thread Marc Zyngier
On Tue, 04 May 2021 19:03:48 +0100,
Alex Williamson  wrote:
> 
> On Mon, 3 May 2021 22:03:59 +
> Vikram Sethi  wrote:
> 
> > Hi Alex,
> > > From: Alex Williamson 
> > > On Mon, 3 May 2021 13:59:43 +
> > > Vikram Sethi  wrote:  
> > > > > From: Mark Kettenis   
> > > > > > From: Marc Zyngier   
> > > >
> > > > snip  
> > > > > > If, by enumerating the properties of Prefetchable, you can show
> > > > > > that they are a strict superset of Normal_NC, I'm on board. I
> > > > > > haven't seen such an enumeration so far.
> > > > > >  
> > > > snip  
> > > > > > Right, so we have made a small step in the direction of mapping
> > > > > > "prefetchable" onto "Normal_NC", thanks for that. What about all
> > > > > > the other properties (unaligned accesses, ordering, gathering)?  
> > > > >  
> > > > Regarding gathering/write combining, that is also allowed for
> > > > prefetchable ranges per the PCI spec.
> > > 
> > > As others have stated, gather/write combining itself is not well defined.
> > >   
> > > > From 1.3.2.2 of the 5.0 base spec:
> > > > A PCI Express Endpoint requesting memory resources through a BAR must
> > > > set the BAR's Prefetchable bit unless the range contains locations
> > > > with read side-effects or locations in which the Function does not
> > > > tolerate write merging.
> > > 
> > > "write merging"  This is a very specific thing, per PCI 3.0, 3.2.6:
> > > 
> > >   Byte Merging – occurs when a sequence of individual memory writes
> > >   (bytes or words) are merged into a single DWORD.
> > > 
> > > The semantics suggest quadword support in addition to dword, but
> > > don't require it.  Writes to bytes within a dword can be merged,
> > > but duplicate writes cannot.
> > > 
> > > It seems like an extremely liberal application to suggest that
> > > this one write semantic encompasses full write combining
> > > semantics, which itself is not clearly defined.
> > >  
> > Talking to our PCIe SIG representative, PCIe switches are not
> > allowed to do any of the byte merging/combining defined in the
> > PCI spec, and a rather poorly worded Implementation Note in the
> > spec says that no known PCIe Host Bridges/Root Ports do it either.
> > So for PCIe we don't believe any byte merging happens in the PCIe
> > fabric; it's really a matter of what happens in the CPU core and
> > interconnect before the traffic gets to the PCIe hierarchy.
> 
> Yes, but merged writes, no matter where they happen, are still the only
> type of write combining that a prefetchable BAR on an endpoint is
> required to support.
> 
> > Stepping back from this patchset, do you agree that it is
> > desirable for write combining, as understood by ioremap_wc, to
> > work in guests on all ISAs, including ARMv8?
> 
> Yes, a userspace vfio driver should be able to take advantage of the
> hardware capabilities.  I think where we disagree is whether it's
> universally safe to assume write combining based on the PCI
> prefetchable capability of a BAR.  If that's something that can be
> assumed universally for ARMv8 based on the architecture specification
> compatibility with the PCI definition of a prefetchable BAR, then I
> would expect a helper somewhere in arch code that returns the right
> page protection flags, so that arch maintainers don't need to scour
> device drivers for architecture hacks.  Otherwise, it needs to be
> exposed through the vfio uAPI to allow the userspace device driver
> itself to select these semantics.
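
As a sketch of the kind of arch helper described above (illustration
only: pci_resource_flags(), IORESOURCE_PREFETCH, pgprot_writecombine()
and pgprot_noncached() exist, the helper itself does not):

static pgprot_t vfio_pci_bar_pgprot(struct pci_dev *pdev, int bar,
				    pgprot_t prot)
{
	/* Map prefetchable BARs write-combined, everything else uncached. */
	if (pci_resource_flags(pdev, bar) & IORESOURCE_PREFETCH)
		return pgprot_writecombine(prot);

	return pgprot_noncached(prot);
}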
> 
> > You note that x86 virtualization doesn't have this issue, but
> > KVM-ARM does because KVM maps all device BARs as Device Memory
> > type nGnRE which doesn't allow ioremap_wc from within the guest to
> > get the actual semantics desired.
> > 
> > Marc and others have suggested that userspace should provide the
> > hints, but the question is: how would QEMU/VFIO do this either? We
> > would be stuck in the same argument as here, namely what the
> > correct way is to determine the desired attributes for a given BAR,
> > so that when a driver in the guest eventually asks for ioremap_wc
> > it actually has a chance of working in the guest, on all ISAs. Do
> > you have any suggestions on how to make progress here?
> 
> We do need some way for userspace drivers to also make use of WC
> semantics; there were some discussions in the past, which I think
> others have referenced as well, but nothing has been proposed for a
> vfio API.
> 
> If we had that API, QEMU deciding to universally enable WC for all
> vfio prefetchable BARs seems only marginally better than this approach.
> Ultimately the mapping should be based on the guest driver semantics,
> and if you don't have any visibility to that on KVM/arm like we have on
> KVM/x86, then it seems like there's nothing to trigger a vfio API here
> anyway.

There isn't much KVM/arm64 can do here unless it is being told what to
do. We don't have visibility on the guest's page tables in a reliable
way, and trusting them is not something I