[RFC PATCH 10/10] KVM: x86/mmu: make FNAME(fetch) receive single argument

2021-04-20 Thread Isaku Yamahata
Convert FNAME(fetch) to receive a single argument, struct kvm_page_fault,
instead of many arguments.

No functional change is intended.

Signed-off-by: Isaku Yamahata 
---
 arch/x86/kvm/mmu/paging_tmpl.h | 36 +++---
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 7df68b5fdd10..ad01d933f2b7 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -631,20 +631,19 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
  * If the guest tries to write a write-protected page, we need to
  * emulate this operation, return 1 to indicate this case.
  */
-static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
-struct guest_walker *gw, u32 error_code,
-int max_level, kvm_pfn_t pfn, bool map_writable,
-bool prefault)
+static int FNAME(fetch)(struct kvm_page_fault *kpf, struct guest_walker *gw)
 {
+   struct kvm_vcpu *vcpu = kpf->vcpu;
+   gpa_t addr = kpf->cr2_or_gpa;
bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
-   bool write_fault = error_code & PFERR_WRITE_MASK;
-   bool exec = error_code & PFERR_FETCH_MASK;
+   bool exec = kpf->error_code & PFERR_FETCH_MASK;
bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
int top_level, level, req_level, ret;
gfn_t base_gfn = gw->gfn;
+   WARN_ON(kpf->gfn != gw->gfn);
 
direct_access = gw->pte_access;
 
@@ -689,10 +688,10 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
link_shadow_page(vcpu, it.sptep, sp);
}
 
-   level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn,
+   level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, kpf->max_level, &kpf->pfn,
huge_page_disallowed, &req_level);
 
-   trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
+   trace_kvm_mmu_spte_requested(addr, gw->level, kpf->pfn);
 
for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
clear_sp_write_flooding_count(it.sptep);
@@ -703,7 +702,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
 */
if (nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(*it.sptep, gw->gfn, it.level,
-  &pfn, &level);
+  &kpf->pfn, &level);
 
base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
if (it.level == level)
@@ -722,8 +721,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
}
}
 
-   ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
-  it.level, base_gfn, pfn, prefault, map_writable);
+   ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, kpf->write_fault,
+  it.level, base_gfn, kpf->pfn, kpf->prefault,
+  kpf->map_writable);
if (ret == RET_PF_SPURIOUS)
return ret;
 
@@ -794,14 +794,11 @@ static int FNAME(page_fault)(struct kvm_page_fault *kpf)
struct kvm_vcpu *vcpu = kpf->vcpu;
gpa_t addr = kpf->cr2_or_gpa;
u32 error_code = kpf->error_code;
-   bool prefault = kpf->prefault;
-   bool write_fault = kpf->write_fault;
bool user_fault = error_code & PFERR_USER_MASK;
struct guest_walker walker;
int r;
unsigned long mmu_seq;
bool is_self_change_mapping;
-   int max_level;
 
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
@@ -821,7 +818,7 @@ static int FNAME(page_fault)(struct kvm_page_fault *kpf)
 */
if (!r) {
pgprintk("%s: guest page fault\n", __func__);
-   if (!prefault)
+   if (!kpf->prefault)
kvm_inject_emulated_page_fault(vcpu, &walker.fault);
 
return RET_PF_RETRY;
@@ -843,9 +840,9 @@ static int FNAME(page_fault)(struct kvm_page_fault *kpf)
  &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
 
if (is_self_change_mapping)
-   max_level = PG_LEVEL_4K;
+   kpf->max_level = PG_LEVEL_4K;
else
-   max_level = walker.level;
+   kpf->max_level = walker.level;
 
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -860,7 +857,7 @@ static int FNAME(page_faul

[RFC PATCH 09/10] KVM: x86/mmu: make kvm_tdp_mmu_map() receive single argument

2021-04-20 Thread Isaku Yamahata
Convert kvm_tdp_mmu_map() to receive a single argument, struct kvm_page_fault,
instead of many arguments.

No functional change is intended.

Signed-off-by: Isaku Yamahata 
---
 arch/x86/kvm/mmu/mmu.c |  8 +---
 arch/x86/kvm/mmu/tdp_mmu.c | 21 +++--
 arch/x86/kvm/mmu/tdp_mmu.h |  4 +---
 3 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b58afb58430e..ebac766839a9 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3692,11 +3692,6 @@ static bool try_async_pf(struct kvm_page_fault *kpf)
 static int direct_page_fault(struct kvm_page_fault *kpf)
 {
struct kvm_vcpu *vcpu = kpf->vcpu;
-   gpa_t gpa = kpf->cr2_or_gpa;
-   u32 error_code = kpf->error_code;
-   bool prefault = kpf->prefault;
-   int max_level = kpf->max_level;
-
unsigned long mmu_seq;
int r;
 
@@ -3737,8 +3732,7 @@ static int direct_page_fault(struct kvm_page_fault *kpf)
goto out_unlock;
 
if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
-   r = kvm_tdp_mmu_map(vcpu, gpa, error_code, kpf->map_writable,
-   max_level, kpf->pfn, prefault);
+   r = kvm_tdp_mmu_map(kpf);
else
r = __direct_map(kpf);
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 018d82e73e31..13ae4735fc25 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -793,12 +793,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
  * page tables and SPTEs to translate the faulting guest physical address.
  */
-int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
-   int map_writable, int max_level, kvm_pfn_t pfn,
-   bool prefault)
+int kvm_tdp_mmu_map(struct kvm_page_fault *kpf)
 {
+   struct kvm_vcpu *vcpu = kpf->vcpu;
+   u32 error_code = kpf->error_code;
bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
-   bool write = error_code & PFERR_WRITE_MASK;
bool exec = error_code & PFERR_FETCH_MASK;
bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -807,7 +806,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
u64 *child_pt;
u64 new_spte;
int ret;
-   gfn_t gfn = gpa >> PAGE_SHIFT;
+   gpa_t gpa = kpf->cr2_or_gpa;
+   gfn_t gfn = kpf->gfn;
int level;
int req_level;
 
@@ -816,17 +816,17 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
return RET_PF_RETRY;
 
-   level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
+   level = kvm_mmu_hugepage_adjust(vcpu, gfn, kpf->max_level, &kpf->pfn,
huge_page_disallowed, &req_level);
 
-   trace_kvm_mmu_spte_requested(gpa, level, pfn);
+   trace_kvm_mmu_spte_requested(gpa, level, kpf->pfn);
 
rcu_read_lock();
 
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
if (nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(iter.old_spte, gfn,
-  iter.level, &pfn, &level);
+  iter.level, &kpf->pfn, &level);
 
if (iter.level == level)
break;
@@ -875,8 +875,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
return RET_PF_RETRY;
}
 
-   ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
- pfn, prefault);
+   ret = tdp_mmu_map_handle_target_level(
+   vcpu, kpf->write_fault, kpf->map_writable, &iter, kpf->pfn,
+   kpf->prefault);
rcu_read_unlock();
 
return ret;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 31096ece9b14..cbf63791603d 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -33,9 +33,7 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 }
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
 
-int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
-   int map_writable, int max_level, kvm_pfn_t pfn,
-   bool prefault);
+int kvm_tdp_mmu_map(struct kvm_page_fault *kpf);
 
 int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
  unsigned long end);
-- 
2.25.1



[RFC PATCH 08/10] KVM: x86/mmu: make __direct_map() receive single argument

2021-04-20 Thread Isaku Yamahata
Convert __direct_map() to receive a single argument, struct kvm_page_fault,
instead of many arguments.

No functional change is intended.

Signed-off-by: Isaku Yamahata 
---
 arch/x86/kvm/mmu/mmu.c | 27 ---
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index ce48416380c3..b58afb58430e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2856,27 +2856,26 @@ void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
}
 }
 
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
-   int map_writable, int max_level, kvm_pfn_t pfn,
-   bool prefault, bool is_tdp)
+static int __direct_map(struct kvm_page_fault *kpf)
 {
+   struct kvm_vcpu *vcpu = kpf->vcpu;
bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
-   bool write = error_code & PFERR_WRITE_MASK;
-   bool exec = error_code & PFERR_FETCH_MASK;
+   bool exec = kpf->error_code & PFERR_FETCH_MASK;
bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
int level, req_level, ret;
-   gfn_t gfn = gpa >> PAGE_SHIFT;
+   gpa_t gpa = kpf->cr2_or_gpa;
+   gfn_t gfn = kpf->gfn;
gfn_t base_gfn = gfn;
 
if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
return RET_PF_RETRY;
 
-   level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
+   level = kvm_mmu_hugepage_adjust(vcpu, gfn, kpf->max_level, &kpf->pfn,
huge_page_disallowed, &req_level);
 
-   trace_kvm_mmu_spte_requested(gpa, level, pfn);
+   trace_kvm_mmu_spte_requested(gpa, level, kpf->pfn);
for_each_shadow_entry(vcpu, gpa, it) {
/*
 * We cannot overwrite existing page tables with an NX
@@ -2884,7 +2883,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 */
if (nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(*it.sptep, gfn, it.level,
-  &pfn, &level);
+  &kpf->pfn, &level);
 
base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
if (it.level == level)
@@ -2896,15 +2895,15 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
  it.level - 1, true, ACC_ALL);
 
link_shadow_page(vcpu, it.sptep, sp);
-   if (is_tdp && huge_page_disallowed &&
+   if (kpf->is_tdp && huge_page_disallowed &&
req_level >= it.level)
account_huge_nx_page(vcpu->kvm, sp);
}
}
 
ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
-  write, level, base_gfn, pfn, prefault,
-  map_writable);
+  kpf->write_fault, level, base_gfn, kpf->pfn, kpf->prefault,
+  kpf->map_writable);
if (ret == RET_PF_SPURIOUS)
return ret;
 
@@ -3697,7 +3696,6 @@ static int direct_page_fault(struct kvm_page_fault *kpf)
u32 error_code = kpf->error_code;
bool prefault = kpf->prefault;
int max_level = kpf->max_level;
-   bool is_tdp = kpf->is_tdp;
 
unsigned long mmu_seq;
int r;
@@ -3742,8 +3740,7 @@ static int direct_page_fault(struct kvm_page_fault *kpf)
r = kvm_tdp_mmu_map(vcpu, gpa, error_code, kpf->map_writable,
max_level, kpf->pfn, prefault);
else
-   r = __direct_map(vcpu, gpa, error_code, kpf->map_writable,
-max_level, kpf->pfn, prefault, is_tdp);
+   r = __direct_map(kpf);
 
 out_unlock:
if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
-- 
2.25.1



[RFC PATCH 07/10] KVM: x86/mmu: make fast_page_fault() receive single argument

2021-04-20 Thread Isaku Yamahata
Convert fast_page_fault() to receive a single argument, struct kvm_page_fault,
instead of many arguments.

No functional change is intended.

Signed-off-by: Isaku Yamahata 
---
 arch/x86/kvm/mmu/mmu.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a16e1b228ac2..ce48416380c3 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3043,9 +3043,11 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte)
 /*
  * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
  */
-static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
-  u32 error_code)
+static int fast_page_fault(struct kvm_page_fault *kpf)
 {
+   struct kvm_vcpu *vcpu = kpf->vcpu;
+   gpa_t cr2_or_gpa = kpf->cr2_or_gpa;
+   u32 error_code = kpf->error_code;
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
int ret = RET_PF_INVALID;
@@ -3704,7 +3706,7 @@ static int direct_page_fault(struct kvm_page_fault *kpf)
return RET_PF_EMULATE;
 
if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) {
-   r = fast_page_fault(vcpu, gpa, error_code);
+   r = fast_page_fault(kpf);
if (r != RET_PF_INVALID)
return r;
}
-- 
2.25.1



[RFC PATCH 06/10] KVM: x86/mmu: make handle_abnormal_pfn() receive single argument

2021-04-20 Thread Isaku Yamahata
Convert handle_abnormal_pfn() to receive a single argument,
struct kvm_page_fault, instead of many arguments.

No functional change is intended.

Signed-off-by: Isaku Yamahata 
---
 arch/x86/kvm/mmu/mmu.c | 14 --
 arch/x86/kvm/mmu/paging_tmpl.h |  2 +-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index dac022a79c57..a16e1b228ac2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2936,18 +2936,21 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
return -EFAULT;
 }
 
-static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
-   kvm_pfn_t pfn, unsigned int access,
+static bool handle_abnormal_pfn(struct kvm_page_fault *kpf, unsigned int access,
int *ret_val)
 {
+   struct kvm_vcpu *vcpu = kpf->vcpu;
+   gva_t gva = kpf->is_tdp ? 0 : kpf->cr2_or_gpa;
+   kvm_pfn_t pfn = kpf->pfn;
+
/* The pfn is invalid, report the error! */
if (unlikely(is_error_pfn(pfn))) {
-   *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
+   *ret_val = kvm_handle_bad_page(vcpu, kpf->gfn, pfn);
return true;
}
 
if (unlikely(is_noslot_pfn(pfn)))
-   vcpu_cache_mmio_info(vcpu, gva, gfn,
+   vcpu_cache_mmio_info(vcpu, gva, kpf->gfn,
 access & shadow_mmio_access_mask);
 
return false;
@@ -3694,7 +3697,6 @@ static int direct_page_fault(struct kvm_page_fault *kpf)
int max_level = kpf->max_level;
bool is_tdp = kpf->is_tdp;
 
-   gfn_t gfn = kpf->gfn;
unsigned long mmu_seq;
int r;
 
@@ -3717,7 +3719,7 @@ static int direct_page_fault(struct kvm_page_fault *kpf)
if (try_async_pf(kpf))
return RET_PF_RETRY;
 
-   if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, kpf->pfn, ACC_ALL, &r))
+   if (handle_abnormal_pfn(kpf, ACC_ALL, &r))
return r;
 
r = RET_PF_RETRY;
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 7965786418af..7df68b5fdd10 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -853,7 +853,7 @@ static int FNAME(page_fault)(struct kvm_page_fault *kpf)
if (try_async_pf(kpf))
return RET_PF_RETRY;
 
-   if (handle_abnormal_pfn(vcpu, addr, walker.gfn, kpf->pfn, walker.pte_access, &r))
+   if (handle_abnormal_pfn(kpf, walker.pte_access, &r))
return r;
 
/*
-- 
2.25.1



[RFC PATCH 05/10] KVM: x86/mmu: make page_fault_handle_page_track() receive single argument

2021-04-20 Thread Isaku Yamahata
Convert page_fault_handle_page_track() to receive a single argument,
struct kvm_page_fault, instead of many arguments.

No functional change is intended.

Signed-off-by: Isaku Yamahata 
---
 arch/x86/kvm/mmu/mmu.c | 9 +
 arch/x86/kvm/mmu/paging_tmpl.h | 4 ++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a2422bd9f59b..dac022a79c57 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3598,9 +3598,10 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
return RET_PF_RETRY;
 }
 
-static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
-u32 error_code, gfn_t gfn)
+static bool page_fault_handle_page_track(struct kvm_page_fault *kpf)
 {
+   u32 error_code = kpf->error_code;
+
if (unlikely(error_code & PFERR_RSVD_MASK))
return false;
 
@@ -3612,7 +3613,7 @@ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
 * guest is writing the page which is write tracked which can
 * not be fixed by page fault handler.
 */
-   if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+   if (kvm_page_track_is_active(kpf->vcpu, kpf->gfn, KVM_PAGE_TRACK_WRITE))
return true;
 
return false;
@@ -3697,7 +3698,7 @@ static int direct_page_fault(struct kvm_page_fault *kpf)
unsigned long mmu_seq;
int r;
 
-   if (page_fault_handle_page_track(vcpu, error_code, gfn))
+   if (page_fault_handle_page_track(kpf))
return RET_PF_EMULATE;
 
if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) {
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index f2beb7f7c378..7965786418af 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -827,7 +827,8 @@ static int FNAME(page_fault)(struct kvm_page_fault *kpf)
return RET_PF_RETRY;
}
 
-   if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
+   kpf->gfn = walker.gfn;
+   if (page_fault_handle_page_track(kpf)) {
shadow_page_table_clear_flood(vcpu, addr);
return RET_PF_EMULATE;
}
@@ -849,7 +850,6 @@ static int FNAME(page_fault)(struct kvm_page_fault *kpf)
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
 
-   kpf->gfn = walker.gfn;
if (try_async_pf(kpf))
return RET_PF_RETRY;
 
-- 
2.25.1



[RFC PATCH 04/10] KVM: x86/mmu: make try_async_pf() receive single argument

2021-04-20 Thread Isaku Yamahata
Convert try_async_pf() to receive a single struct kvm_page_fault argument
instead of many arguments.

No functional change is intended.

Signed-off-by: Isaku Yamahata 
---
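Not part of the patch: a stand-alone sketch of the shape this conversion gives
try_async_pf() -- the former *pfn/*hva/*writable out-parameters become struct
fields the caller reads back.  The types and values below are simplified
stand-ins, not the kernel's definitions.

  #include <stdbool.h>
  #include <stdio.h>

  struct page_fault {
      unsigned long gfn;
      bool write_fault;
      /* outputs; previously returned through *pfn, *hva, *writable */
      unsigned long pfn;
      unsigned long hva;
      bool map_writable;
  };

  /* returns true if the fault should be retried (async page fault queued) */
  static bool try_async_pf(struct page_fault *f)
  {
      f->pfn = f->gfn;            /* pretend an identity-mapped memslot */
      f->hva = f->gfn << 12;
      f->map_writable = f->write_fault;
      return false;
  }

  int main(void)
  {
      struct page_fault f = { .gfn = 0x42, .write_fault = true };

      if (!try_async_pf(&f))
          printf("pfn=%#lx hva=%#lx writable=%d\n",
                 f.pfn, f.hva, f.map_writable);
      return 0;
  }
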
 arch/x86/kvm/mmu.h |  9 +++
 arch/x86/kvm/mmu/mmu.c | 45 +-
 arch/x86/kvm/mmu/paging_tmpl.h | 23 +
 3 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index fa3b1df502e7..b60fcad7279c 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -111,12 +111,17 @@ struct kvm_page_fault {
struct kvm_vcpu *vcpu;
gpa_t cr2_or_gpa;
u32 error_code;
+   bool write_fault;
bool prefault;
 
/* internal state */
gfn_t gfn;
bool is_tdp;
int max_level;
+
+   kvm_pfn_t pfn;
+   hva_t hva;
+   bool map_writable;
 };
 
 static inline void kvm_page_fault_init(
@@ -126,12 +131,16 @@ static inline void kvm_page_fault_init(
kpf->vcpu = vcpu;
kpf->cr2_or_gpa = cr2_or_gpa;
kpf->error_code = error_code;
+   kpf->write_fault = error_code & PFERR_WRITE_MASK;
kpf->prefault = prefault;
 
/* default value */
kpf->is_tdp = false;
kpf->gfn = cr2_or_gpa >> PAGE_SHIFT;
kpf->max_level = PG_LEVEL_4K;
+   kpf->pfn = KVM_PFN_NOSLOT;
+   kpf->hva = KVM_HVA_ERR_BAD;
+   kpf->map_writable = false;
 }
 
 int kvm_tdp_page_fault(struct kvm_page_fault *kpf);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index cb90148f90af..a2422bd9f59b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3646,27 +3646,29 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 }
 
-static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
-bool write, bool *writable)
+static bool try_async_pf(struct kvm_page_fault *kpf)
 {
+   struct kvm_vcpu *vcpu = kpf->vcpu;
+   gfn_t gfn = kpf->gfn;
+   gpa_t cr2_or_gpa = kpf->cr2_or_gpa;
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
bool async;
 
/* Don't expose private memslots to L2. */
if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
-   *pfn = KVM_PFN_NOSLOT;
-   *writable = false;
+   kpf->pfn = KVM_PFN_NOSLOT;
+   kpf->map_writable = false;
return false;
}
 
async = false;
-   *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
-   write, writable, hva);
+   kpf->pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
+   kpf->write_fault, &kpf->map_writable,
+   &kpf->hva);
if (!async)
return false; /* *pfn has correct page already */
 
-   if (!prefault && kvm_can_do_async_pf(vcpu)) {
+   if (!kpf->prefault && kvm_can_do_async_pf(vcpu)) {
trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
@@ -3676,8 +3678,9 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
return true;
}
 
-   *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
-   write, writable, hva);
+   kpf->pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
+   kpf->write_fault, &kpf->map_writable,
+   &kpf->hva);
return false;
 }
 
@@ -3689,13 +3692,9 @@ static int direct_page_fault(struct kvm_page_fault *kpf)
bool prefault = kpf->prefault;
int max_level = kpf->max_level;
bool is_tdp = kpf->is_tdp;
-   bool write = error_code & PFERR_WRITE_MASK;
-   bool map_writable;
 
gfn_t gfn = kpf->gfn;
unsigned long mmu_seq;
-   kvm_pfn_t pfn;
-   hva_t hva;
int r;
 
if (page_fault_handle_page_track(vcpu, error_code, gfn))
@@ -3714,11 +3713,10 @@ static int direct_page_fault(struct kvm_page_fault *kpf)
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
 
-   if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
-write, &map_writable))
+   if (try_async_pf(kpf))
return RET_PF_RETRY;
 
-   if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
+   if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, kpf->pfn, ACC_ALL, &r))
return 

[RFC PATCH 02/10] KVM: x86/mmu: make kvm_mmu:page_fault receive single argument

2021-04-20 Thread Isaku Yamahata
Convert the kvm_mmu:page_fault callback to receive a single argument,
struct kvm_page_fault, instead of many arguments.
The following functions are converted by this patch:
kvm_tdp_page_fault(), nonpaging_page_fault(), and FNAME(page_fault).

No functional change is intended.

Signed-off-by: Isaku Yamahata 
---
 arch/x86/include/asm/kvm_host.h |  4 ++--
 arch/x86/kvm/mmu.h  |  9 +++--
 arch/x86/kvm/mmu/mmu.c  | 19 ++-
 arch/x86/kvm/mmu/paging_tmpl.h  |  7 +--
 4 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3768819693e5..97e72076f358 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -351,6 +351,7 @@ struct kvm_mmu_root_info {
 #define KVM_HAVE_MMU_RWLOCK
 
 struct kvm_mmu_page;
+struct kvm_page_fault;
 
 /*
  * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
@@ -360,8 +361,7 @@ struct kvm_mmu_page;
 struct kvm_mmu {
unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu);
u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
-   int (*page_fault)(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err,
- bool prefault);
+   int (*page_fault)(struct kvm_page_fault *kpf);
void (*inject_page_fault)(struct kvm_vcpu *vcpu,
  struct x86_exception *fault);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t gva_or_gpa,
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 245c5d7fd3dd..7fcd9c147e63 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -124,18 +124,15 @@ static inline void kvm_page_fault_init(
kpf->prefault = prefault;
 }
 
-int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
-  bool prefault);
+int kvm_tdp_page_fault(struct kvm_page_fault *kpf);
 
 static inline int kvm_mmu_do_page_fault(struct kvm_page_fault *kpf)
 {
 #ifdef CONFIG_RETPOLINE
if (likely(kpf->vcpu->arch.mmu->page_fault == kvm_tdp_page_fault))
-   return kvm_tdp_page_fault(kpf->vcpu, kpf->cr2_or_gpa,
- kpf->error_code, kpf->prefault);
+   return kvm_tdp_page_fault(kpf);
 #endif
-   return kpf->vcpu->arch.mmu->page_fault(kpf->vcpu, kpf->cr2_or_gpa,
-  kpf->error_code, kpf->prefault);
+   return kpf->vcpu->arch.mmu->page_fault(kpf);
 }
 
 /*
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8ea2afcb528c..46998cfabfd3 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3745,14 +3745,15 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
return r;
 }
 
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
-   u32 error_code, bool prefault)
+static int nonpaging_page_fault(struct kvm_page_fault *kpf)
 {
-   pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
+   pgprintk("%s: gva %lx error %x\n", __func__,
+kpf->cr2_or_gpa, kpf->error_code);
 
/* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
-   return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault,
-PG_LEVEL_2M, false);
+   return direct_page_fault(kpf->vcpu, kpf->cr2_or_gpa & PAGE_MASK,
+kpf->error_code,
+kpf->prefault, PG_LEVEL_2M, false);
 }
 
 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
@@ -3788,9 +3789,9 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 }
 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
 
-int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
-  bool prefault)
+int kvm_tdp_page_fault(struct kvm_page_fault *kpf)
 {
+   u32 gpa = kpf->cr2_or_gpa;
int max_level;
 
for (max_level = KVM_MAX_HUGEPAGE_LEVEL;
@@ -3799,11 +3800,11 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
int page_num = KVM_PAGES_PER_HPAGE(max_level);
gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1);
 
-   if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
+   if (kvm_mtrr_check_gfn_range_consistency(kpf->vcpu, base, 
page_num))
break;
}
 
-   return direct_page_fault(vcpu, gpa, error_code, prefault,
+   return direct_page_fault(kpf->vcpu, gpa, kpf->error_code, kpf->prefault,
 max_level, true);
 }
 
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 55d7b473ac44..dc814463a8df 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@

[RFC PATCH 03/10] KVM: x86/mmu: make direct_page_fault() receive single argument

2021-04-20 Thread Isaku Yamahata
Convert direct_page_fault() to receive a single argument, struct kvm_page_fault,
instead of many arguments.

No functional change is intended.

Signed-off-by: Isaku Yamahata 
---
 arch/x86/kvm/mmu.h | 10 ++
 arch/x86/kvm/mmu/mmu.c | 32 
 2 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 7fcd9c147e63..fa3b1df502e7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -112,6 +112,11 @@ struct kvm_page_fault {
gpa_t cr2_or_gpa;
u32 error_code;
bool prefault;
+
+   /* internal state */
+   gfn_t gfn;
+   bool is_tdp;
+   int max_level;
 };
 
 static inline void kvm_page_fault_init(
@@ -122,6 +127,11 @@ static inline void kvm_page_fault_init(
kpf->cr2_or_gpa = cr2_or_gpa;
kpf->error_code = error_code;
kpf->prefault = prefault;
+
+   /* default value */
+   kpf->is_tdp = false;
+   kpf->gfn = cr2_or_gpa >> PAGE_SHIFT;
+   kpf->max_level = PG_LEVEL_4K;
 }
 
 int kvm_tdp_page_fault(struct kvm_page_fault *kpf);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 46998cfabfd3..cb90148f90af 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3681,13 +3681,18 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
return false;
 }
 
-static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
-bool prefault, int max_level, bool is_tdp)
-{
+static int direct_page_fault(struct kvm_page_fault *kpf)
+{
+   struct kvm_vcpu *vcpu = kpf->vcpu;
+   gpa_t gpa = kpf->cr2_or_gpa;
+   u32 error_code = kpf->error_code;
+   bool prefault = kpf->prefault;
+   int max_level = kpf->max_level;
+   bool is_tdp = kpf->is_tdp;
bool write = error_code & PFERR_WRITE_MASK;
bool map_writable;
 
-   gfn_t gfn = gpa >> PAGE_SHIFT;
+   gfn_t gfn = kpf->gfn;
unsigned long mmu_seq;
kvm_pfn_t pfn;
hva_t hva;
@@ -3750,10 +3755,12 @@ static int nonpaging_page_fault(struct kvm_page_fault *kpf)
pgprintk("%s: gva %lx error %x\n", __func__,
 kpf->cr2_or_gpa, kpf->error_code);
 
+   kpf->cr2_or_gpa &= PAGE_MASK;
+   kpf->is_tdp = false;
+   kpf->max_level = PG_LEVEL_2M;
+
/* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
-   return direct_page_fault(kpf->vcpu, kpf->cr2_or_gpa & PAGE_MASK,
-kpf->error_code,
-kpf->prefault, PG_LEVEL_2M, false);
+   return direct_page_fault(kpf);
 }
 
 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
@@ -3791,21 +3798,22 @@ EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
 
 int kvm_tdp_page_fault(struct kvm_page_fault *kpf)
 {
-   u32 gpa = kpf->cr2_or_gpa;
+   struct kvm_vcpu *vcpu = kpf->vcpu;
int max_level;
+   kpf->is_tdp = true;
 
for (max_level = KVM_MAX_HUGEPAGE_LEVEL;
 max_level > PG_LEVEL_4K;
 max_level--) {
int page_num = KVM_PAGES_PER_HPAGE(max_level);
-   gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1);
+   gfn_t base = kpf->gfn & ~(page_num - 1);
 
-   if (kvm_mtrr_check_gfn_range_consistency(kpf->vcpu, base, page_num))
+   if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
break;
}
+   kpf->max_level = max_level;
 
-   return direct_page_fault(kpf->vcpu, gpa, kpf->error_code, kpf->prefault,
-max_level, true);
+   return direct_page_fault(kpf);
 }
 
 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
-- 
2.25.1



[RFC PATCH 01/10] KVM: x86/mmu: make kvm_mmu_do_page_fault() receive single argument

2021-04-20 Thread Isaku Yamahata
Introduce struct kvm_page_fault and its initialization function.
Make the caller of the kvm page fault handler allocate and initialize
struct kvm_page_fault, and pass it to kvm_mmu_do_page_fault() instead
of many arguments.

No functional change is intended.

Signed-off-by: Isaku Yamahata 
---
 arch/x86/kvm/mmu.h | 29 -
 arch/x86/kvm/mmu/mmu.c |  6 --
 arch/x86/kvm/x86.c |  4 +++-
 3 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index c68bfc3e2402..245c5d7fd3dd 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -106,17 +106,36 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
 vcpu->arch.mmu->shadow_root_level);
 }
 
+struct kvm_page_fault {
+   /* arguments to kvm page fault handler */
+   struct kvm_vcpu *vcpu;
+   gpa_t cr2_or_gpa;
+   u32 error_code;
+   bool prefault;
+};
+
+static inline void kvm_page_fault_init(
+   struct kvm_page_fault *kpf, struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+   u32 error_code, bool prefault)
+{
+   kpf->vcpu = vcpu;
+   kpf->cr2_or_gpa = cr2_or_gpa;
+   kpf->error_code = error_code;
+   kpf->prefault = prefault;
+}
+
 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
   bool prefault);
 
-static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
-   u32 err, bool prefault)
+static inline int kvm_mmu_do_page_fault(struct kvm_page_fault *kpf)
 {
 #ifdef CONFIG_RETPOLINE
-   if (likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault))
-   return kvm_tdp_page_fault(vcpu, cr2_or_gpa, err, prefault);
+   if (likely(kpf->vcpu->arch.mmu->page_fault == kvm_tdp_page_fault))
+   return kvm_tdp_page_fault(kpf->vcpu, kpf->cr2_or_gpa,
+ kpf->error_code, kpf->prefault);
 #endif
-   return vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, err, prefault);
+   return kpf->vcpu->arch.mmu->page_fault(kpf->vcpu, kpf->cr2_or_gpa,
+  kpf->error_code, kpf->prefault);
 }
 
 /*
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 951dae4e7175..8ea2afcb528c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5006,6 +5006,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
 {
int r, emulation_type = EMULTYPE_PF;
bool direct = vcpu->arch.mmu->direct_map;
+   struct kvm_page_fault kpf;
 
if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
return RET_PF_RETRY;
@@ -5018,8 +5019,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
}
 
if (r == RET_PF_INVALID) {
-   r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
- lower_32_bits(error_code), false);
+   kvm_page_fault_init(&kpf, vcpu, cr2_or_gpa,
+   lower_32_bits(error_code), false);
+   r = kvm_mmu_do_page_fault(&kpf);
if (WARN_ON_ONCE(r == RET_PF_INVALID))
return -EIO;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eca63625aee4..999ed561de64 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -11083,6 +11083,7 @@ EXPORT_SYMBOL_GPL(kvm_set_rflags);
 void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 {
int r;
+   struct kvm_page_fault kpf;
 
if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
  work->wakeup_all)
@@ -11096,7 +11097,8 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
  work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
return;
 
-   kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
+   kvm_page_fault_init(&kpf, vcpu, work->cr2_or_gpa, 0, true);
+   kvm_mmu_do_page_fault(&kpf);
 }
 
 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
-- 
2.25.1



[RFC PATCH 00/10] KVM: x86/mmu: simplify argument to kvm page fault handler

2021-04-20 Thread Isaku Yamahata
This is a preliminary cleanup for TDX, which complicates the KVM page fault
execution path; simplify that path as preparation.

The current KVM page fault handlers pass many arguments from function to
function.  To simplify those arguments and local variables, introduce a data
structure, struct kvm_page_fault, to hold them, and pass around a pointer to
it instead.

struct kvm_page_fault is allocated on the stack by the caller of the kvm
fault handler, kvm_mmu_do_page_fault(), and the pointer is then pushed down
to the inner functions step by step.
The conversion order is as follows:
. kvm_mmu_do_page_fault(), introducing struct kvm_page_fault
. kvm_mmu.page_fault(): kvm_tdp_page_fault(), nonpaging_page_fault(), FNAME(page_fault)
. direct_page_fault()
. try_async_pf()
. page_fault_handle_page_track()
. handle_abnormal_pfn()
. fast_page_fault()
. __direct_map()
. kvm_tdp_mmu_map()
. FNAME(fetch)

Probably more functions should be converted, or perhaps some of these should
not be.  Only code refactoring is intended; there is no functional change.
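
For readers skimming the series, a minimal user-space model of the pattern
(the struct members and values are simplified stand-ins, not the real
definitions introduced in patch 01):

  #include <stdbool.h>
  #include <stdio.h>

  /* simplified stand-in for struct kvm_page_fault (patch 01) */
  struct kvm_page_fault {
      unsigned long cr2_or_gpa;   /* arguments to the fault handler */
      unsigned int error_code;
      bool prefault;
      unsigned long gfn;          /* internal state filled in later */
      int max_level;
  };

  /* before: direct_page_fault(vcpu, gpa, error_code, prefault, max_level, is_tdp)
   * after:  every layer takes the one descriptor
   */
  static int direct_page_fault(struct kvm_page_fault *kpf)
  {
      printf("gpa=%#lx err=%#x max_level=%d\n",
             kpf->cr2_or_gpa, kpf->error_code, kpf->max_level);
      return 0;
  }

  int main(void)
  {
      /* allocated on the caller's stack, then the pointer is pushed down */
      struct kvm_page_fault kpf = {
          .cr2_or_gpa = 0x1234000, .error_code = 0x2,
          .gfn = 0x1234, .max_level = 2,
      };
      return direct_page_fault(&kpf);
  }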

Isaku Yamahata (10):
  KVM: x86/mmu: make kvm_mmu_do_page_fault() receive single argument
  KVM: x86/mmu: make kvm_mmu:page_fault receive single argument
  KVM: x86/mmu: make direct_page_fault() receive single argument
  KVM: x86/mmu: make try_async_pf() receive single argument
  KVM: x86/mmu: make page_fault_handle_page_track() receive single
argument
  KVM: x86/mmu: make handle_abnormal_pfn() receive single argument
  KVM: x86/mmu: make fast_page_fault() receive single argument
  KVM: x86/mmu: make __direct_map() receive single argument
  KVM: x86/mmu: make kvm_tdp_mmu_map() receive single argument
  KVM: x86/mmu: make FNAME(fetch) receive single argument

 arch/x86/include/asm/kvm_host.h |   4 +-
 arch/x86/kvm/mmu.h  |  49 ++--
 arch/x86/kvm/mmu/mmu.c  | 130 +---
 arch/x86/kvm/mmu/paging_tmpl.h  |  60 +++
 arch/x86/kvm/mmu/tdp_mmu.c  |  21 +++---
 arch/x86/kvm/mmu/tdp_mmu.h  |   4 +-
 arch/x86/kvm/x86.c  |   4 +-
 7 files changed, 156 insertions(+), 116 deletions(-)

-- 
2.25.1



[tip: x86/urgent] x86/mem_encrypt: Correct physical address calculation in __set_clr_pte_enc()

2021-03-23 Thread tip-bot2 for Isaku Yamahata
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 8249d17d3194eac064a8ca5bc5ca0abc86feecde
Gitweb:
https://git.kernel.org/tip/8249d17d3194eac064a8ca5bc5ca0abc86feecde
Author:Isaku Yamahata 
AuthorDate:Thu, 18 Mar 2021 13:26:57 -07:00
Committer: Borislav Petkov 
CommitterDate: Tue, 23 Mar 2021 11:59:45 +01:00

x86/mem_encrypt: Correct physical address calculation in __set_clr_pte_enc()

The pfn variable contains the page frame number as returned by the
pXX_pfn() functions, shifted to the right by PAGE_SHIFT to remove the
page bits. After page protection computations are done to it, it gets
shifted back to the physical address using page_level_shift().

That is wrong, of course, because that function determines the shift
length based on the level of the page in the page table but in all the
cases, it was shifted by PAGE_SHIFT before.

Therefore, shift it back using PAGE_SHIFT to get the correct physical
address.

 [ bp: Rewrite commit message. ]

Fixes: dfaaec9033b8 ("x86: Add support for changing memory encryption attribute in early boot")
Signed-off-by: Isaku Yamahata 
Signed-off-by: Borislav Petkov 
Reviewed-by: Kirill A. Shutemov 
Reviewed-by: Tom Lendacky 
Cc: 
Link: 
https://lkml.kernel.org/r/81abbae1657053eccc535c16151f63cd049dcb97.1616098294.git.isaku.yamah...@intel.com
---
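Not part of the commit: a small stand-alone example of the miscalculation,
assuming a 2M mapping whose physical address is 0x40200000 and 64-bit longs
(page_level_shift() would return 21 for that level):

  #include <stdio.h>

  #define PAGE_SHIFT 12
  #define PMD_SHIFT  21   /* what page_level_shift(PG_LEVEL_2M) returns */

  int main(void)
  {
      /* pXX_pfn() returns the frame number in 4K units regardless of level */
      unsigned long pfn = 0x40200000UL >> PAGE_SHIFT;

      printf("pfn << PMD_SHIFT  = %#lx (bogus)\n", pfn << PMD_SHIFT);
      printf("pfn << PAGE_SHIFT = %#lx (the mapped address)\n",
             pfn << PAGE_SHIFT);
      return 0;
  }
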
 arch/x86/mm/mem_encrypt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 4b01f7d..ae78cef 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -262,7 +262,7 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
if (pgprot_val(old_prot) == pgprot_val(new_prot))
return;
 
-   pa = pfn << page_level_shift(level);
+   pa = pfn << PAGE_SHIFT;
size = page_level_size(level);
 
/*


[PATCH] X86: __set_clr_pte_enc() miscalculates physical address

2021-03-18 Thread Isaku Yamahata
__set_clr_pte_enc() miscalculates the physical address to operate on.
pfn is in units of PG_LEVEL_4K, not PG_LEVEL_{2M, 1G}, so the shift used
to derive the physical address should be PAGE_SHIFT,
not page_level_shift().

Fixes: dfaaec9033b8 ("x86: Add support for changing memory encryption attribute in early boot")
Reviewed-by: Kirill A. Shutemov 
Signed-off-by: Isaku Yamahata 
---
 arch/x86/mm/mem_encrypt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 4b01f7dbaf30..ae78cef79980 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -262,7 +262,7 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
if (pgprot_val(old_prot) == pgprot_val(new_prot))
return;
 
-   pa = pfn << page_level_shift(level);
+   pa = pfn << PAGE_SHIFT;
size = page_level_size(level);
 
/*
-- 
2.25.1



[RFC PATCH 22/67] KVM: Add per-VM flag to mark read-only memory as unsupported

2020-11-16 Thread isaku . yamahata
From: Isaku Yamahata 

Add a per-VM flag with which TDX can mark read-only memory as unsupported,
and propagate it to the KVM_MEM_READONLY handling so that RO memory can be
reported as unsupported on a per-VM basis.  TDX1 doesn't expose permission
bits to the VMM in the SEPT tables, i.e. it doesn't support read-only
private memory.

Signed-off-by: Isaku Yamahata 
Co-developed-by: Sean Christopherson 
Signed-off-by: Sean Christopherson 
---
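Not part of the patch: a stand-alone model of the flag check (flag values
taken from the uapi header, everything else simplified), showing how the
per-VM bit narrows the set of valid memslot flags:

  #include <stdbool.h>
  #include <stdio.h>

  #define KVM_MEM_LOG_DIRTY_PAGES (1U << 0)
  #define KVM_MEM_READONLY        (1U << 1)

  struct kvm { bool readonly_mem_unsupported; };

  static int check_memory_region_flags(const struct kvm *kvm, unsigned int flags)
  {
      unsigned int valid_flags = KVM_MEM_LOG_DIRTY_PAGES;

      if (!kvm->readonly_mem_unsupported)
          valid_flags |= KVM_MEM_READONLY;
      return (flags & ~valid_flags) ? -22 /* -EINVAL */ : 0;
  }

  int main(void)
  {
      struct kvm legacy = { .readonly_mem_unsupported = false };
      struct kvm tdx = { .readonly_mem_unsupported = true };

      printf("legacy RO slot: %d\n",
             check_memory_region_flags(&legacy, KVM_MEM_READONLY));
      printf("tdx RO slot:    %d\n",
             check_memory_region_flags(&tdx, KVM_MEM_READONLY));
      return 0;
  }
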
 arch/x86/kvm/x86.c   | 4 +++-
 include/linux/kvm_host.h | 4 
 virt/kvm/kvm_main.c  | 8 +---
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 01380f057d9f..4060f3d91f74 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3695,7 +3695,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ASYNC_PF_INT:
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_KVMCLOCK_CTRL:
-   case KVM_CAP_READONLY_MEM:
case KVM_CAP_HYPERV_TIME:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
case KVM_CAP_TSC_DEADLINE_TIMER:
@@ -3785,6 +3784,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
if (kvm_x86_ops.is_vm_type_supported(KVM_X86_TDX_VM))
r |= BIT(KVM_X86_TDX_VM);
break;
+   case KVM_CAP_READONLY_MEM:
+   r = kvm && kvm->readonly_mem_unsupported ? 0 : 1;
+   break;
default:
break;
}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 95371750c23f..1a0df7b83fd0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -517,6 +517,10 @@ struct kvm {
pid_t userspace_pid;
unsigned int max_halt_poll_ns;
 
+#ifdef __KVM_HAVE_READONLY_MEM
+   bool readonly_mem_unsupported;
+#endif
+
bool vm_bugged;
 };
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3dc41b6e12a0..572a66a61c29 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1100,12 +1100,14 @@ static void update_memslots(struct kvm_memslots *slots,
}
 }
 
-static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
+static int check_memory_region_flags(struct kvm *kvm,
+const struct kvm_userspace_memory_region *mem)
 {
u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
 
 #ifdef __KVM_HAVE_READONLY_MEM
-   valid_flags |= KVM_MEM_READONLY;
+   if (!kvm->readonly_mem_unsupported)
+   valid_flags |= KVM_MEM_READONLY;
 #endif
 
if (mem->flags & ~valid_flags)
@@ -1278,7 +1280,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
int as_id, id;
int r;
 
-   r = check_memory_region_flags(mem);
+   r = check_memory_region_flags(kvm, mem);
if (r)
return r;
 
-- 
2.17.1



[RFC PATCH 17/67] KVM: x86: Introduce "protected guest" concept and block disallowed ioctls

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Add 'guest_state_protected' to mark a VM's state as being protected by
hardware/firmware, e.g. SEV-ES or TDX-SEAM.  Use the flag to disallow
ioctls() and/or flows that attempt to access protected state.

Return an error if userspace attempts to get/set register state for a
protected VM, e.g. a non-debug TDX guest.  KVM can't provide sane data,
it's userspace's responsibility to avoid attempting to read guest state
when it's known to be inaccessible.

Retrieving vCPU events is the one exception, as the userspace VMM is
allowed to inject NMIs.

Co-developed-by: Xiaoyao Li 
Signed-off-by: Xiaoyao Li 
Signed-off-by: Sean Christopherson 
---
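Not part of the patch: a stand-alone model of the idea -- handlers that touch
guest register state bail out when the per-VM flag is set.  The function name
and values are hypothetical stand-ins, not the kernel's code:

  #include <stdbool.h>
  #include <stdio.h>

  #define EINVAL 22

  struct kvm_arch { bool guest_state_protected; };

  /* hypothetical handler that reads guest register state */
  static int get_debugregs(const struct kvm_arch *arch, unsigned long *dr7)
  {
      if (arch->guest_state_protected)
          return -EINVAL;     /* state is inaccessible for TDX/SEV-ES */
      *dr7 = 0x400;           /* placeholder value */
      return 0;
  }

  int main(void)
  {
      struct kvm_arch legacy = { .guest_state_protected = false };
      struct kvm_arch tdx = { .guest_state_protected = true };
      unsigned long dr7 = 0;

      printf("legacy: %d\n", get_debugregs(&legacy, &dr7));
      printf("tdx:    %d\n", get_debugregs(&tdx, &dr7));
      return 0;
  }
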
 arch/x86/include/asm/kvm_host.h |   2 +
 arch/x86/kvm/x86.c  | 113 +++-
 2 files changed, 97 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1ff33efd6394..e687a8bd46ad 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -994,6 +994,8 @@ struct kvm_arch {
struct msr_bitmap_range ranges[16];
} msr_filter;
 
+   bool guest_state_protected;
+
struct kvm_pmu_event_filter *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1fa6a042984b..6154abecd546 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3966,7 +3966,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
int idx;
 
-   if (vcpu->preempted)
+   if (vcpu->preempted && !vcpu->kvm->arch.guest_state_protected)
vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
 
/*
@@ -4074,6 +4074,9 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
 
 static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
 {
+   if (vcpu->kvm->arch.guest_state_protected)
+   return -EINVAL;
+
kvm_make_request(KVM_REQ_SMI, vcpu);
 
return 0;
@@ -4120,6 +4123,9 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu 
*vcpu,
unsigned bank_num = mcg_cap & 0xff;
u64 *banks = vcpu->arch.mce_banks;
 
+   if (vcpu->kvm->arch.guest_state_protected)
+   return -EINVAL;
+
if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
return -EINVAL;
/*
@@ -4212,7 +4218,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
events->interrupt.soft = 0;
-   events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
+   if (!vcpu->kvm->arch.guest_state_protected)
+   events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
 
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = vcpu->arch.nmi_pending != 0;
@@ -4241,11 +4248,16 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu);
 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
  struct kvm_vcpu_events *events)
 {
-   if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
- | KVM_VCPUEVENT_VALID_SIPI_VECTOR
- | KVM_VCPUEVENT_VALID_SHADOW
- | KVM_VCPUEVENT_VALID_SMM
- | KVM_VCPUEVENT_VALID_PAYLOAD))
+   u32 allowed_flags = KVM_VCPUEVENT_VALID_NMI_PENDING |
+   KVM_VCPUEVENT_VALID_SIPI_VECTOR |
+   KVM_VCPUEVENT_VALID_SHADOW |
+   KVM_VCPUEVENT_VALID_SMM |
+   KVM_VCPUEVENT_VALID_PAYLOAD;
+
+   if (vcpu->kvm->arch.guest_state_protected)
+   allowed_flags = KVM_VCPUEVENT_VALID_NMI_PENDING;
+
+   if (events->flags & ~allowed_flags)
return -EINVAL;
 
if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
@@ -4326,17 +4338,22 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
return 0;
 }
 
-static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
-struct kvm_debugregs *dbgregs)
+static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+   struct kvm_debugregs *dbgregs)
 {
unsigned long val;
 
+   if (vcpu->kvm->arch.guest_state_protected)
+   return -EINVAL;
+
memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
kvm_get_dr(vcpu, 6, &val);
dbgregs->dr6 = val;
dbgregs->dr7 = vcpu->arch.dr7;
dbgregs->flags = 0;
memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
+
+   return 0;
 }
 
 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -4350,6 +4367,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct 
kvm

[RFC PATCH 30/67] KVM: x86: Check for pending APICv interrupt in kvm_vcpu_has_events()

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Return true for kvm_vcpu_has_events() if the vCPU has a pending APICv
interrupt to support TDX's usage of APICv.  Unlike VMX, TDX doesn't have
access to vmcs.GUEST_INTR_STATUS and so can't emulate posted interrupts,
i.e. needs to generate a posted interrupt and more importantly can't
manually move requested interrupts into the vIRR (which it also doesn't
have access to).

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/x86.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 09edc3ad..c233e7ef3366 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10813,7 +10813,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 
if (kvm_arch_interrupt_allowed(vcpu) &&
(kvm_cpu_has_interrupt(vcpu) ||
-   kvm_guest_apic_has_interrupt(vcpu)))
+kvm_guest_apic_has_interrupt(vcpu) ||
+(vcpu->arch.apicv_active &&
+ kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu))))
return true;
 
if (kvm_hv_has_stimer_pending(vcpu))
-- 
2.17.1



[RFC PATCH 04/67] KVM: Export kvm_io_bus_read for use by TDX for PV MMIO

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Signed-off-by: Sean Christopherson 
---
 virt/kvm/kvm_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2541a17ff1c4..65e1737c4354 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4288,6 +4288,7 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
r = __kvm_io_bus_read(vcpu, bus, &range, val);
return r < 0 ? r : 0;
 }
+EXPORT_SYMBOL_GPL(kvm_io_bus_read);
 
 /* Caller must hold slots_lock. */
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
-- 
2.17.1



[RFC PATCH 06/67] KVM: x86: Split core of hypercall emulation to helper function

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

By necessity, TDX will use a different register ABI for hypercalls.
Break out the core functionality so that it may be reused for TDX.

Signed-off-by: Sean Christopherson 
---
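Not part of the patch: a stand-alone model of why the core is worth factoring
out -- two calling conventions marshal the registers differently but share the
same core.  The TDX-style wrapper below is purely hypothetical:

  #include <stdio.h>

  /* core logic, shared by every calling convention */
  static long emulate_hypercall_core(unsigned long nr, unsigned long a0,
                                     unsigned long a1)
  {
      switch (nr) {
      case 0:
          return a0 + a1;     /* pretend hypercall */
      default:
          return -1;          /* -KVM_ENOSYS in the real code */
      }
  }

  /* legacy-style ABI: nr/args arrive in one register order ... */
  static long emulate_hypercall_legacy(const unsigned long regs[3])
  {
      return emulate_hypercall_core(regs[0], regs[1], regs[2]);
  }

  /* ... a hypothetical TDX-style ABI uses another, but shares the core */
  static long emulate_hypercall_tdx(const unsigned long regs[3])
  {
      return emulate_hypercall_core(regs[2], regs[0], regs[1]);
  }

  int main(void)
  {
      unsigned long regs[3] = { 1, 2, 0 };

      printf("%ld %ld\n", emulate_hypercall_legacy(regs),
             emulate_hypercall_tdx(regs));
      return 0;
  }
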
 arch/x86/include/asm/kvm_host.h |  4 +++
 arch/x86/kvm/x86.c  | 49 +
 2 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d44858b69353..c2639744ea09 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1549,6 +1549,10 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
 void kvm_request_apicv_update(struct kvm *kvm, bool activate,
  unsigned long bit);
 
+unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+ unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3,
+ int op_64_bit);
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f5ede41bf9e6..0f67f762717a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8020,23 +8020,15 @@ static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
kvm_vcpu_yield_to(target);
 }
 
-int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+ unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3,
+ int op_64_bit)
 {
-   unsigned long nr, a0, a1, a2, a3, ret;
-   int op_64_bit;
-
-   if (kvm_hv_hypercall_enabled(vcpu->kvm))
-   return kvm_hv_hypercall(vcpu);
-
-   nr = kvm_rax_read(vcpu);
-   a0 = kvm_rbx_read(vcpu);
-   a1 = kvm_rcx_read(vcpu);
-   a2 = kvm_rdx_read(vcpu);
-   a3 = kvm_rsi_read(vcpu);
+   unsigned long ret;
 
trace_kvm_hypercall(nr, a0, a1, a2, a3);
 
-   op_64_bit = is_64_bit_mode(vcpu);
if (!op_64_bit) {
nr &= 0xFFFFFFFF;
a0 &= 0xFFFFFFFF;
@@ -8045,11 +8037,6 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a3 &= 0xFFFFFFFF;
}
 
-   if (kvm_x86_ops.get_cpl(vcpu) != 0) {
-   ret = -KVM_EPERM;
-   goto out;
-   }
-
ret = -KVM_ENOSYS;
 
switch (nr) {
@@ -8086,6 +8073,32 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
ret = -KVM_ENOSYS;
break;
}
+   return ret;
+}
+EXPORT_SYMBOL_GPL(__kvm_emulate_hypercall);
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+{
+   unsigned long nr, a0, a1, a2, a3, ret;
+   int op_64_bit;
+
+   if (kvm_hv_hypercall_enabled(vcpu->kvm))
+   return kvm_hv_hypercall(vcpu);
+
+   op_64_bit = is_64_bit_mode(vcpu);
+
+   if (kvm_x86_ops.get_cpl(vcpu) != 0) {
+   ret = -KVM_EPERM;
+   goto out;
+   }
+
+   nr = kvm_rax_read(vcpu);
+   a0 = kvm_rbx_read(vcpu);
+   a1 = kvm_rcx_read(vcpu);
+   a2 = kvm_rdx_read(vcpu);
+   a3 = kvm_rsi_read(vcpu);
+
+   ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit);
 out:
if (!op_64_bit)
ret = (u32)ret;
-- 
2.17.1



[RFC PATCH 25/67] KVM: x86: Allow host-initiated WRMSR to set X2APIC regardless of CPUID

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Let userspace, or in the case of TDX, KVM itself, enable X2APIC even if
X2APIC is not reported as supported in the guest's CPU model.  KVM
generally does not force specific ordering between ioctls(), e.g. this
forces userspace to configure CPUID before MSRs.  And for TDX, vCPUs
will always run with X2APIC enabled, e.g. KVM will want/need to enable
X2APIC from time zero.

Signed-off-by: Sean Christopherson 
---
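Not part of the patch: a stand-alone sketch of the resulting reserved-bit
logic, assuming a 40-bit guest physical address width; everything else is a
simplified stand-in for the kernel's code:

  #include <stdbool.h>
  #include <stdio.h>

  #define X2APIC_ENABLE (1ULL << 10)

  /* simplified model; assumes a 40-bit guest physical address width */
  static unsigned long long apic_base_reserved_bits(bool host_initiated,
                                                    bool guest_has_x2apic)
  {
      unsigned long long reserved = (~0ULL << 40) | 0x2ff;

      if (!host_initiated && !guest_has_x2apic)
          reserved |= X2APIC_ENABLE;
      return reserved;
  }

  int main(void)
  {
      /* APIC base 0xfee00000, BSP, APIC global enable, x2APIC enable */
      unsigned long long data = 0xfee00000ULL | (1ULL << 8) | (1ULL << 11) |
                                X2APIC_ENABLE;

      /* guest CPUID lacks X2APIC: a guest-initiated write is rejected ... */
      printf("guest write ok: %d\n",
             !(data & apic_base_reserved_bits(false, false)));
      /* ... but the same host-initiated write is accepted */
      printf("host write ok:  %d\n",
             !(data & apic_base_reserved_bits(true, false)));
      return 0;
  }
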
 arch/x86/kvm/x86.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8d58141256c5..a1c57d1eb460 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -394,8 +394,11 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
-   u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
-   (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
+   u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff;
+
+   if (!msr_info->host_initiated &&
+   !guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
+   reserved_bits |= X2APIC_ENABLE;
 
	if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
return 1;
-- 
2.17.1



[RFC PATCH 10/67] KVM: Export kvm_make_all_cpus_request() for use in marking VMs as bugged

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Export kvm_make_all_cpus_request() and hoist the declarations of the
request helpers up to the KVM_REQ_* definitions in preparation
for adding a "VM bugged" framework.
and KVM_BUG_ON() as alternatives to full BUG()/BUG_ON() for cases where
KVM has definitely hit a bug (in itself or in silicon) and the VM is all
but guaranteed to be hosed.  Marking a VM bugged will trigger a request
to all vCPUs to allow arch code to forcefully evict each vCPU from its
run loop.

Signed-off-by: Sean Christopherson 
---
 include/linux/kvm_host.h | 18 +-
 virt/kvm/kvm_main.c  |  1 +
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 03c016ff1715..ad9b6963d19d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -155,6 +155,15 @@ static inline bool is_error_page(struct page *page)
 })
 #define KVM_ARCH_REQ(nr)   KVM_ARCH_REQ_FLAGS(nr, 0)
 
+bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
+struct kvm_vcpu *except,
+unsigned long *vcpu_bitmap, cpumask_var_t tmp);
+bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
+bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
+ struct kvm_vcpu *except);
+bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req,
+   unsigned long *vcpu_bitmap);
+
 #define KVM_USERSPACE_IRQ_SOURCE_ID0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID   1
 
@@ -874,15 +883,6 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 #endif
 
-bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
-struct kvm_vcpu *except,
-unsigned long *vcpu_bitmap, cpumask_var_t tmp);
-bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
-bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
- struct kvm_vcpu *except);
-bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req,
-   unsigned long *vcpu_bitmap);
-
 long kvm_arch_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg);
 long kvm_arch_vcpu_ioctl(struct file *filp,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 21af4f083674..b29b6c3484dd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -304,6 +304,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
 {
return kvm_make_all_cpus_request_except(kvm, req, NULL);
 }
+EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
 
 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
 void kvm_flush_remote_tlbs(struct kvm *kvm)
-- 
2.17.1



[RFC PATCH 09/67] KVM: Add infrastructure and macro to mark VM as bugged

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Signed-off-by: Sean Christopherson 
---
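Not part of the patch: a stand-alone model of the macro's intent (no
WARN_ONCE, no vCPU kick, GNU statement expressions as in the kernel); the
invalid-root check in main() is a hypothetical usage example:

  #include <stdbool.h>
  #include <stdio.h>

  struct kvm { bool vm_bugged; };

  static void kvm_vm_bugged(struct kvm *kvm)
  {
      kvm->vm_bugged = true;
      /* the real helper also sends KVM_REQ_VM_BUGGED to every vCPU */
  }

  #define KVM_BUG_ON(cond, kvm)                                \
  ({                                                           \
      bool __ret = (cond);                                     \
      if (__ret && !(kvm)->vm_bugged) {                        \
          fprintf(stderr, "KVM bug hit, marking VM bugged\n"); \
          kvm_vm_bugged(kvm);                                  \
      }                                                        \
      __ret;                                                   \
  })

  int main(void)
  {
      struct kvm vm = { .vm_bugged = false };
      unsigned long root = 0;     /* hypothetical invalid root */

      if (KVM_BUG_ON(root == 0, &vm))
          return 1;               /* bail out instead of BUG()ing the host */
      return 0;
  }
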
 include/linux/kvm_host.h | 27 +++
 virt/kvm/kvm_main.c  | 10 +-
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7f2e2a09ebbd..03c016ff1715 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -146,6 +146,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_MMU_RELOAD(1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_PENDING_TIMER 2
 #define KVM_REQ_UNHALT3
+#define KVM_REQ_VM_BUGGED(4 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQUEST_ARCH_BASE 8
 
 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \
@@ -505,6 +506,8 @@ struct kvm {
struct srcu_struct irq_srcu;
pid_t userspace_pid;
unsigned int max_halt_poll_ns;
+
+   bool vm_bugged;
 };
 
 #define kvm_err(fmt, ...) \
@@ -533,6 +536,30 @@ struct kvm {
 #define vcpu_err(vcpu, fmt, ...)   \
kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)
 
+static inline void kvm_vm_bugged(struct kvm *kvm)
+{
+   kvm->vm_bugged = true;
+   kvm_make_all_cpus_request(kvm, KVM_REQ_VM_BUGGED);
+}
+
+#define KVM_BUG(cond, kvm, fmt...) \
+({ \
+   int __ret = (cond); \
+   \
+   if (WARN_ONCE(__ret && !(kvm)->vm_bugged, fmt)) \
+   kvm_vm_bugged(kvm); \
+   unlikely(__ret);\
+})
+
+#define KVM_BUG_ON(cond, kvm)  \
+({ \
+   int __ret = (cond); \
+   \
+   if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged))   \
+   kvm_vm_bugged(kvm); \
+   unlikely(__ret);\
+})
+
 static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm)
 {
return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 11166e901582..21af4f083674 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3194,7 +3194,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
struct kvm_fpu *fpu = NULL;
struct kvm_sregs *kvm_sregs = NULL;
 
-   if (vcpu->kvm->mm != current->mm)
+   if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
return -EIO;
 
if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
@@ -3400,7 +3400,7 @@ static long kvm_vcpu_compat_ioctl(struct file *filp,
void __user *argp = compat_ptr(arg);
int r;
 
-   if (vcpu->kvm->mm != current->mm)
+   if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
return -EIO;
 
switch (ioctl) {
@@ -3466,7 +3466,7 @@ static long kvm_device_ioctl(struct file *filp, unsigned 
int ioctl,
 {
struct kvm_device *dev = filp->private_data;
 
-   if (dev->kvm->mm != current->mm)
+   if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged)
return -EIO;
 
switch (ioctl) {
@@ -3682,7 +3682,7 @@ static long kvm_vm_ioctl(struct file *filp,
void __user *argp = (void __user *)arg;
int r;
 
-   if (kvm->mm != current->mm)
+   if (kvm->mm != current->mm || kvm->vm_bugged)
return -EIO;
switch (ioctl) {
case KVM_CREATE_VCPU:
@@ -3877,7 +3877,7 @@ static long kvm_vm_compat_ioctl(struct file *filp,
struct kvm *kvm = filp->private_data;
int r;
 
-   if (kvm->mm != current->mm)
+   if (kvm->mm != current->mm || kvm->vm_bugged)
return -EIO;
switch (ioctl) {
case KVM_GET_DIRTY_LOG: {
-- 
2.17.1



[RFC PATCH 15/67] KVM: x86: Add vm_type to differentiate legacy VMs from protected VMs

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Add a capability to effectively allow userspace to query what VM types
are supported by KVM.
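
A minimal userspace sketch of how the capability could be consumed, assuming
the KVM_CAP_VM_TYPES and KVM_X86_*_VM definitions added by this patch are
visible through the uapi headers (illustrative only, error handling omitted):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Illustrative only: pick a TDX VM if supported, else a legacy VM. */
    static int create_vm(void)
    {
            int kvm_fd = open("/dev/kvm", O_RDWR);
            int types = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VM_TYPES);
            unsigned long type = (types > 0 && (types & (1 << KVM_X86_TDX_VM)))
                                 ? KVM_X86_TDX_VM : KVM_X86_LEGACY_VM;

            return ioctl(kvm_fd, KVM_CREATE_VM, type);
    }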

Co-developed-by: Xiaoyao Li 
Signed-off-by: Xiaoyao Li 
Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h   | 2 ++
 arch/x86/include/uapi/asm/kvm.h   | 4 
 arch/x86/kvm/svm/svm.c| 6 ++
 arch/x86/kvm/vmx/vmx.c| 6 ++
 arch/x86/kvm/x86.c| 9 -
 include/uapi/linux/kvm.h  | 2 ++
 tools/arch/x86/include/uapi/asm/kvm.h | 4 
 tools/include/uapi/linux/kvm.h| 2 ++
 8 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c2639744ea09..1ff33efd6394 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -897,6 +897,7 @@ enum kvm_irqchip_mode {
 #define APICV_INHIBIT_REASON_X2APIC5
 
 struct kvm_arch {
+   unsigned long vm_type;
unsigned long n_used_mmu_pages;
unsigned long n_requested_mmu_pages;
unsigned long n_max_mmu_pages;
@@ -1090,6 +1091,7 @@ struct kvm_x86_ops {
bool (*has_emulated_msr)(u32 index);
void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
 
+   bool (*is_vm_type_supported)(unsigned long vm_type);
unsigned int vm_size;
int (*vm_init)(struct kvm *kvm);
void (*vm_destroy)(struct kvm *kvm);
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 89e5f3d1bba8..29cdf262e516 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -486,4 +486,8 @@ struct kvm_pmu_event_filter {
 #define KVM_PMU_EVENT_ALLOW 0
 #define KVM_PMU_EVENT_DENY 1
 
+#define KVM_X86_LEGACY_VM  0
+#define KVM_X86_SEV_ES_VM  1
+#define KVM_X86_TDX_VM 2
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e001e3c9e4bc..11ab330a9b55 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4161,6 +4161,11 @@ static void svm_vm_destroy(struct kvm *kvm)
sev_vm_destroy(kvm);
 }
 
+static bool svm_is_vm_type_supported(unsigned long type)
+{
+   return type == KVM_X86_LEGACY_VM;
+}
+
 static int svm_vm_init(struct kvm *kvm)
 {
if (!pause_filter_count || !pause_filter_thresh)
@@ -4187,6 +4192,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_free = svm_free_vcpu,
.vcpu_reset = svm_vcpu_reset,
 
+   .is_vm_type_supported = svm_is_vm_type_supported,
.vm_size = sizeof(struct kvm_svm),
.vm_init = svm_vm_init,
.vm_destroy = svm_vm_destroy,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 0703d82e7bad..b3ecdb96789a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6966,6 +6966,11 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
return err;
 }
 
+static bool vmx_is_vm_type_supported(unsigned long type)
+{
+   return type == KVM_X86_LEGACY_VM;
+}
+
 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See 
CVE-2018-3646 and 
https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for 
details.\n"
 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation 
disabled, data leak possible. See CVE-2018-3646 and 
https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for 
details.\n"
 
@@ -7603,6 +7608,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.cpu_has_accelerated_tpr = report_flexpriority,
.has_emulated_msr = vmx_has_emulated_msr,
 
+   .is_vm_type_supported = vmx_is_vm_type_supported,
.vm_size = sizeof(struct kvm_vmx),
.vm_init = vmx_vm_init,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 19b53aedc6c8..346394d83672 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3771,6 +3771,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
ext)
case KVM_CAP_STEAL_TIME:
r = sched_info_on();
break;
+   case KVM_CAP_VM_TYPES:
+   r = BIT(KVM_X86_LEGACY_VM);
+   if (kvm_x86_ops.is_vm_type_supported(KVM_X86_TDX_VM))
+   r |= BIT(KVM_X86_TDX_VM);
+   break;
default:
break;
}
@@ -10249,9 +10254,11 @@ void kvm_arch_free_vm(struct kvm *kvm)
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
-   if (type)
+   if (!kvm_x86_ops.is_vm_type_supported(type))
return -EINVAL;
 
+   kvm->arch.vm_type = type;
+
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index ca41220b40b8..c603e9a004f1 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1054,6 +1054,8 @@ struct kvm_ppc_resize_hpt {

[RFC PATCH 13/67] KVM: VMX: Explicitly check for hv_remote_flush_tlb when loading pgd()

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Explicitly check that kvm_x86_ops.tlb_remote_flush() points at Hyper-V's
implementation for PV flushing instead of assuming that a non-NULL
implementation means running on Hyper-V.  Wrap the related logic in
ifdeffery as hv_remote_flush_tlb() is defined iff CONFIG_HYPERV!=n.

Short term, the explicit check makes it more obvious why a non-NULL
tlb_remote_flush() triggers EPTP shenanigans.  Long term, this will
allow TDX to define its own implementation of tlb_remote_flush() without
running afoul of Hyper-V.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/vmx/vmx.c | 7 +--
 arch/x86/kvm/vmx/vmx.h | 2 ++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1c9ad3103c87..0703d82e7bad 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3072,14 +3072,15 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, 
unsigned long pgd,
eptp = construct_eptp(vcpu, pgd, pgd_level);
vmcs_write64(EPT_POINTER, eptp);
 
-   if (kvm_x86_ops.tlb_remote_flush) {
+#if IS_ENABLED(CONFIG_HYPERV)
+   if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
to_vmx(vcpu)->ept_pointer = eptp;
to_kvm_vmx(kvm)->ept_pointers_match
= EPT_POINTERS_CHECK;
spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
}
-
+#endif
if (!enable_unrestricted_guest && !is_paging(vcpu))
guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
else if (test_bit(VCPU_EXREG_CR3, (ulong 
*)&vcpu->arch.regs_avail))
@@ -6970,7 +6971,9 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
 
 static int vmx_vm_init(struct kvm *kvm)
 {
+#if IS_ENABLED(CONFIG_HYPERV)
spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
+#endif
 
if (!ple_gap)
kvm->arch.pause_in_guest = true;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index f6f66e5c6510..e9cdb0fb7f56 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -301,8 +301,10 @@ struct kvm_vmx {
bool ept_identity_pagetable_done;
gpa_t ept_identity_map_addr;
 
+#if IS_ENABLED(CONFIG_HYPERV)
enum ept_pointers_status ept_pointers_match;
spinlock_t ept_pointer_lock;
+#endif
 };
 
 bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
-- 
2.17.1



[RFC PATCH 20/67] KVM: x86: Make KVM_CAP_X86_SMM a per-VM capability

2020-11-16 Thread isaku . yamahata
From: Isaku Yamahata 

TDX doesn't support SMM, whereas VMX conditionally supports SMM.  Rework
kvm_x86_ops.has_emulated_msr() to take a struct kvm so that TDX can
reject SMM by way of the MSR_IA32_SMBASE check.

This pairs with a QEMU change to query SMM support using a VM ioctl().
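
For instance, userspace could perform the per-VM query roughly as follows
(KVM_CHECK_EXTENSION is accepted on a VM fd when KVM_CAP_CHECK_EXTENSION_VM
is available; illustrative sketch only, error handling omitted):

    /*
     * Illustrative only: query SMM support on the VM fd so that the
     * answer reflects this VM's type (e.g. a TDX VM reports no SMM).
     */
    static int vm_supports_smm(int vm_fd)
    {
            return ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_SMM) > 0;
    }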

Signed-off-by: Isaku Yamahata 
Co-developed-by: Sean Christopherson 
Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h | 2 +-
 arch/x86/kvm/svm/svm.c  | 2 +-
 arch/x86/kvm/vmx/vmx.c  | 2 +-
 arch/x86/kvm/x86.c  | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 70528102d865..00b34d8f038b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1092,7 +1092,7 @@ struct kvm_x86_ops {
void (*hardware_disable)(void);
void (*hardware_unsetup)(void);
bool (*cpu_has_accelerated_tpr)(void);
-   bool (*has_emulated_msr)(u32 index);
+   bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
 
bool (*is_vm_type_supported)(unsigned long vm_type);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 11ab330a9b55..241a26e1fa71 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3720,7 +3720,7 @@ static bool svm_cpu_has_accelerated_tpr(void)
return false;
 }
 
-static bool svm_has_emulated_msr(u32 index)
+static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
 {
switch (index) {
case MSR_IA32_MCG_EXT_CTL:
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b3ecdb96789a..2ee7eb7dac26 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6405,7 +6405,7 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
handle_exception_nmi_irqoff(vmx);
 }
 
-static bool vmx_has_emulated_msr(u32 index)
+static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
 {
switch (index) {
case MSR_IA32_SMBASE:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2fb0d20c5788..2f4b226d5b89 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3726,7 +3726,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
ext)
 * fringe case that is not enabled except via specific settings
 * of the module parameters.
 */
-   r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE);
+   r = kvm_x86_ops.has_emulated_msr(kvm, MSR_IA32_SMBASE);
break;
case KVM_CAP_VAPIC:
r = !kvm_x86_ops.cpu_has_accelerated_tpr();
@@ -5783,7 +5783,7 @@ static void kvm_init_msr_list(void)
}
 
for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
-   if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i]))
+   if (!kvm_x86_ops.has_emulated_msr(NULL, emulated_msrs_all[i]))
continue;
 
emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
-- 
2.17.1



[RFC PATCH 12/67] KVM: x86/mmu: Mark VM as bugged if page fault returns RET_PF_INVALID

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3c7e43e12513..bebd2b6ebcad 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5059,7 +5059,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t 
cr2_or_gpa, u64 error_code,
if (r == RET_PF_INVALID) {
r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
  lower_32_bits(error_code), false);
-   if (WARN_ON_ONCE(r == RET_PF_INVALID))
+   if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
return -EIO;
}
 
-- 
2.17.1



[RFC PATCH 16/67] KVM: x86: Hoist kvm_dirty_regs check out of sync_regs()

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Move the kvm_dirty_regs vs. KVM_SYNC_X86_VALID_FIELDS check out of
sync_regs() and into its sole caller, kvm_arch_vcpu_ioctl_run().  This
allows a future patch to allow synchronizing select state for protected
VMs.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/x86.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 346394d83672..1fa6a042984b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9257,7 +9257,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
goto out;
}
 
-   if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
+   if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) ||
+   (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) {
r = -EINVAL;
goto out;
}
@@ -9778,9 +9779,6 @@ static void store_regs(struct kvm_vcpu *vcpu)
 
 static int sync_regs(struct kvm_vcpu *vcpu)
 {
-   if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
-   return -EINVAL;
-
if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
__set_regs(vcpu, &vcpu->run->s.regs.regs);
vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
-- 
2.17.1



[RFC PATCH 24/67] KVM: x86: Add per-VM flag to disable in-kernel I/O APIC and level routes

2020-11-16 Thread isaku . yamahata
From: Kai Huang 

Add a flag to let TDX disallow the in-kernel I/O APIC, level-triggered
routes for a userspace I/O APIC, and anything else that relies on being
able to intercept EOIs.  TDX-SEAM does not allow intercepting EOIs.

Note, technically KVM could partially emulate the I/O APIC by allowing
only edge triggered interrupts, but that adds a lot of complexity for
basically zero benefit.  Ideally KVM wouldn't even allow I/O APIC route
reservation, but disabling that is a train wreck for Qemu.

Signed-off-by: Kai Huang 
Co-developed-by: Sean Christopherson 
Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h | 1 +
 arch/x86/kvm/ioapic.c   | 4 
 arch/x86/kvm/irq_comm.c | 6 +-
 arch/x86/kvm/lapic.c| 3 ++-
 arch/x86/kvm/x86.c  | 6 ++
 5 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e5b706889d09..7537ba0bada2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -977,6 +977,7 @@ struct kvm_arch {
 
enum kvm_irqchip_mode irqchip_mode;
u8 nr_reserved_ioapic_pins;
+   bool eoi_intercept_unsupported;
 
bool disabled_lapic_found;
 
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 698969e18fe3..e2de6e552d25 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -311,6 +311,10 @@ void kvm_arch_post_irq_ack_notifier_list_update(struct kvm 
*kvm)
 {
if (!ioapic_in_kernel(kvm))
return;
+
+   if (WARN_ON_ONCE(kvm->arch.eoi_intercept_unsupported))
+   return;
+
kvm_make_scan_ioapic_request(kvm);
 }
 
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 4aa1c2e00e2a..1523e9d66867 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -307,6 +307,10 @@ int kvm_set_routing_entry(struct kvm *kvm,
e->msi.address_hi = ue->u.msi.address_hi;
e->msi.data = ue->u.msi.data;
 
+   if (kvm->arch.eoi_intercept_unsupported &&
+   e->msi.data & (1 << MSI_DATA_TRIGGER_SHIFT))
+   return -EINVAL;
+
if (kvm_msi_route_invalid(kvm, e))
return -EINVAL;
break;
@@ -390,7 +394,7 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm)
 
 void kvm_arch_post_irq_routing_update(struct kvm *kvm)
 {
-   if (!irqchip_split(kvm))
+   if (!irqchip_split(kvm) || kvm->arch.eoi_intercept_unsupported)
return;
kvm_make_scan_ioapic_request(kvm);
 }
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 105e7859d1f2..e6c0aaf4044e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -278,7 +278,8 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
if (old)
call_rcu(&old->rcu, kvm_apic_map_free);
 
-   kvm_make_scan_ioapic_request(kvm);
+   if (!kvm->arch.eoi_intercept_unsupported)
+   kvm_make_scan_ioapic_request(kvm);
 }
 
 static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4060f3d91f74..8d58141256c5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5454,6 +5454,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto create_irqchip_unlock;
 
r = -EINVAL;
+   if (kvm->arch.eoi_intercept_unsupported)
+   goto create_irqchip_unlock;
+
if (kvm->created_vcpus)
goto create_irqchip_unlock;
 
@@ -5484,6 +5487,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
goto create_pit;
case KVM_CREATE_PIT2:
+   r = -EINVAL;
+   if (kvm->arch.eoi_intercept_unsupported)
+   goto out;
r = -EFAULT;
if (copy_from_user(&u.pit_config, argp,
   sizeof(struct kvm_pit_config)))
-- 
2.17.1



[RFC PATCH 19/67] KVM: x86: Add flag to disallow #MC injection / KVM_X86_SETUP_MCE

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Add a flag to disallow MCE injection and reject KVM_X86_SETUP_MCE with
-EINVAL when set.  TDX doesn't support injecting exceptions, including
(virtual) #MCs.

Signed-off-by: Kai Huang 
Co-developed-by: Sean Christopherson 
Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/x86.c  | 14 +++---
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e8180a1fe610..70528102d865 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -996,6 +996,7 @@ struct kvm_arch {
 
bool guest_state_protected;
bool irq_injection_disallowed;
+   bool mce_injection_disallowed;
 
struct kvm_pmu_event_filter *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ec66d5d53a1a..2fb0d20c5788 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4095,15 +4095,16 @@ static int vcpu_ioctl_tpr_access_reporting(struct 
kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
u64 mcg_cap)
 {
-   int r;
unsigned bank_num = mcg_cap & 0xff, bank;
 
-   r = -EINVAL;
+   if (vcpu->kvm->arch.mce_injection_disallowed)
+   return -EINVAL;
+
if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
-   goto out;
+   return -EINVAL;
if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff))
-   goto out;
-   r = 0;
+   return -EINVAL;
+
vcpu->arch.mcg_cap = mcg_cap;
/* Init IA32_MCG_CTL to all 1s */
if (mcg_cap & MCG_CTL_P)
@@ -4113,8 +4114,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu 
*vcpu,
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
 
kvm_x86_ops.setup_mce(vcpu);
-out:
-   return r;
+   return 0;
 }
 
 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
-- 
2.17.1



[RFC PATCH 03/67] x86/cpu: Move get_builtin_firmware() common code (from microcode only)

2020-11-16 Thread isaku . yamahata
From: Zhang Chen 

Move get_builtin_firmware() to common.c so that it can be used to get
non-ucode firmware, e.g. Intel's SEAM modules, even if MICROCODE=n.

Require the consumers to select FW_LOADER, which is already true for
MICROCODE, instead of having dead code that returns false at runtime.

Signed-off-by: Zhang Chen 
Co-developed-by: Kai Huang 
Signed-off-by: Kai Huang 
Co-developed-by: Sean Christopherson 
Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/cpu.h|  5 +
 arch/x86/include/asm/microcode.h  |  3 ---
 arch/x86/kernel/cpu/common.c  | 20 
 arch/x86/kernel/cpu/microcode/core.c  | 18 --
 arch/x86/kernel/cpu/microcode/intel.c |  1 +
 5 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index da78ccbd493b..0096ac7cad0a 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_SMP
 
@@ -37,6 +38,10 @@ extern int _debug_hotplug_cpu(int cpu, int action);
 
 int mwait_usable(const struct cpuinfo_x86 *);
 
+#if defined(CONFIG_MICROCODE) || defined(CONFIG_KVM_INTEL_TDX)
+bool get_builtin_firmware(struct cpio_data *cd, const char *name);
+#endif
+
 unsigned int x86_family(unsigned int sig);
 unsigned int x86_model(unsigned int sig);
 unsigned int x86_stepping(unsigned int sig);
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 2b7cc5397f80..4f10089f30de 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -131,15 +131,12 @@ int __init microcode_init(void);
 extern void __init load_ucode_bsp(void);
 extern void load_ucode_ap(void);
 void reload_early_microcode(void);
-extern bool get_builtin_firmware(struct cpio_data *cd, const char *name);
 extern bool initrd_gone;
 #else
 static inline int __init microcode_init(void)  { return 0; };
 static inline void __init load_ucode_bsp(void) { }
 static inline void load_ucode_ap(void) { }
 static inline void reload_early_microcode(void){ }
-static inline bool
-get_builtin_firmware(struct cpio_data *cd, const char *name)   { return false; 
}
 #endif
 
 #endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..87512c5854bb 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -87,6 +88,25 @@ void __init setup_cpu_local_masks(void)
alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
 }
 
+#if defined(CONFIG_MICROCODE) || defined(CONFIG_KVM_INTEL_TDX)
+extern struct builtin_fw __start_builtin_fw[];
+extern struct builtin_fw __end_builtin_fw[];
+
+bool get_builtin_firmware(struct cpio_data *cd, const char *name)
+{
+   struct builtin_fw *b_fw;
+
+   for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) {
+   if (!strcmp(name, b_fw->name)) {
+   cd->size = b_fw->size;
+   cd->data = b_fw->data;
+   return true;
+   }
+   }
+   return false;
+}
+#endif
+
 static void default_init(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/cpu/microcode/core.c 
b/arch/x86/kernel/cpu/microcode/core.c
index ec6f0415bc6d..f877a9c19f42 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -22,7 +22,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -140,23 +139,6 @@ static bool __init check_loader_disabled_bsp(void)
return *res;
 }
 
-extern struct builtin_fw __start_builtin_fw[];
-extern struct builtin_fw __end_builtin_fw[];
-
-bool get_builtin_firmware(struct cpio_data *cd, const char *name)
-{
-   struct builtin_fw *b_fw;
-
-   for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) {
-   if (!strcmp(name, b_fw->name)) {
-   cd->size = b_fw->size;
-   cd->data = b_fw->data;
-   return true;
-   }
-   }
-   return false;
-}
-
 void __init load_ucode_bsp(void)
 {
unsigned int cpuid_1_eax;
diff --git a/arch/x86/kernel/cpu/microcode/intel.c 
b/arch/x86/kernel/cpu/microcode/intel.c
index 6a99535d7f37..50e69d6cb3d9 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
 
-- 
2.17.1



[RFC PATCH 11/67] KVM: x86: Use KVM_BUG/KVM_BUG_ON to handle bugs that are fatal to the VM

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/svm/svm.c |  2 +-
 arch/x86/kvm/vmx/vmx.c | 23 ++-
 arch/x86/kvm/x86.c |  4 
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 2f32fd09e259..e001e3c9e4bc 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1452,7 +1452,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum 
kvm_reg reg)
load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
break;
default:
-   WARN_ON_ONCE(1);
+   KVM_BUG_ON(1, vcpu->kvm);
}
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 47b8357b9751..1c9ad3103c87 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2245,7 +2245,7 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum 
kvm_reg reg)
vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
break;
default:
-   WARN_ON_ONCE(1);
+   KVM_BUG_ON(1, vcpu->kvm);
break;
}
 }
@@ -5006,6 +5006,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
return kvm_complete_insn_gp(vcpu, err);
case 3:
WARN_ON_ONCE(enable_unrestricted_guest);
+
err = kvm_set_cr3(vcpu, val);
return kvm_complete_insn_gp(vcpu, err);
case 4:
@@ -5031,14 +5032,13 @@ static int handle_cr(struct kvm_vcpu *vcpu)
}
break;
case 2: /* clts */
-   WARN_ONCE(1, "Guest should always own CR0.TS");
-   vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
-   trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
-   return kvm_skip_emulated_instruction(vcpu);
+   KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
+   return -EIO;
case 1: /*mov from cr*/
switch (cr) {
case 3:
WARN_ON_ONCE(enable_unrestricted_guest);
+
val = kvm_read_cr3(vcpu);
kvm_register_write(vcpu, reg, val);
trace_kvm_cr_read(cr, val);
@@ -5377,7 +5377,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 
 static int handle_nmi_window(struct kvm_vcpu *vcpu)
 {
-   WARN_ON_ONCE(!enable_vnmi);
+   if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
+   return -EIO;
+
exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
++vcpu->stat.nmi_window_exits;
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5950,7 +5952,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, 
fastpath_t exit_fastpath)
 * below) should never happen as that means we incorrectly allowed a
 * nested VM-Enter with an invalid vmcs12.
 */
-   WARN_ON_ONCE(vmx->nested.nested_run_pending);
+   if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
+   return -EIO;
 
/* If guest state is invalid, start emulating */
if (vmx->emulation_required)
@@ -6300,7 +6303,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
int max_irr;
bool max_irr_updated;
 
-   WARN_ON(!vcpu->arch.apicv_active);
+   if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm))
+   return -EIO;
+
if (pi_test_on(&vmx->pi_desc)) {
pi_clear_on(&vmx->pi_desc);
/*
@@ -6382,7 +6387,7 @@ static void handle_external_interrupt_irqoff(struct 
kvm_vcpu *vcpu)
 {
u32 intr_info = vmx_get_intr_info(vcpu);
 
-   if (WARN_ONCE(!is_external_intr(intr_info),
+   if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
"KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1d999b57f21a..19b53aedc6c8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8722,6 +8722,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
bool req_immediate_exit = false;
 
if (kvm_request_pending(vcpu)) {
+   if (kvm_check_request(KVM_REQ_VM_BUGGED, vcpu)) {
+   r = -EIO;
+   goto out;
+   }
if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
if 
(unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
r = 0;
-- 
2.17.1



[RFC PATCH 07/67] KVM: x86: Export kvm_mmio tracepoint for use by TDX for PV MMIO

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/x86.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0f67f762717a..1d999b57f21a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -11237,6 +11237,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned 
long type, gva_t gva)
 EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_mmio);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
-- 
2.17.1



[RFC PATCH 00/67] KVM: X86: TDX support

2020-11-16 Thread isaku . yamahata
From: Isaku Yamahata 

* What's TDX?
TDX stands for Trust Domain Extensions, which isolates VMs from
the virtual-machine manager (VMM)/hypervisor and any other software on
the platform. [1]
For details, see the specifications [2], [3], [4], [5], [6], [7].


* The goal of this RFC patch
The purpose of this post is to get early feedback on the high-level
design issues of the KVM enhancements for TDX. The detailed coding
(variable naming etc.) has not been polished. This patch series is
incomplete (not working). Although multiple software components, not
only KVM but also QEMU, guest Linux and the virtual BIOS, need to be
updated, this series includes only the KVM VMM part. For those who are
curious about the changes to the other components, there are public
repositories on GitHub. [8], [9]


* Terminology
Here are short explanations of key concepts.
For detailed explanations or other terminology, please refer to the
specifications. [2], [3], [4], [5], [6], [7].
- Trusted Domain (TD)
  A hardware-isolated virtual machine managed by the TDX-module.
- Secure-Arbitration Mode (SEAM)
  A new mode of the CPU. It consists of SEAM Root and SEAM Non-Root,
  which correspond to VMX Root and VMX Non-Root.
- TDX-module
  The TDX-module runs in SEAM Root and manages TD guest state.
  It provides the ABI for the VMM to manage TDs. Its operations are
  expensive.
- SEAM loader (SEAMLDR)
  An Authenticated Code Module (ACM) that loads the TDX-module.
- Secure EPT (S-EPT)
  An extended page table that is encrypted.
  The shared bit (bit 51 or 47) in the GPA selects shared vs. private:
  0: private to the TD, 1: shared with the host VMM.
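
As a rough illustration of the S-EPT shared bit, here is a minimal sketch of
how the shared/private split of a GPA might be tested, assuming the shared
bit is bit 51 when the guest physical address width (GPAW) is 52 and bit 47
otherwise (helper names are illustrative, not this series' final API):

    static inline u64 tdx_shared_bit(int gpaw)
    {
            /* Assumption for this sketch: GPAW 52 -> bit 51, else bit 47. */
            return gpaw == 52 ? BIT_ULL(51) : BIT_ULL(47);
    }

    static inline bool tdx_gpa_is_shared(gpa_t gpa, int gpaw)
    {
            /* 0: private to the TD, 1: shared with the host VMM. */
            return !!(gpa & tdx_shared_bit(gpaw));
    }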


* Major touch/discussion points
The following are the major touch points where feedback is wanted.

** the file location of the boot code
The BSP launches the SEAM Loader to load the TDX module, and the TDX
module is then initialized on all CPUs. The directory
arch/x86/kvm/boot/seam is chosen to locate the related files near each
other, so that during future maintenance/enhancement it is easy to
identify which files are related and need to be kept in sync.

- arch/x86/kvm/boot/seam: the current choice
  Pros:
  - The directory clearly indicates that the code is related to only
KVM.
  - Keep files near to the related code (KVM TDX code).
  Cons:
  - It doesn't follow the existing convention.

Alternative:
The alternative is to follow the existing convention.
- arch/x86/kernel/cpu/
  Pros:
  - It follows the existing convention.
  Cons:
  - It's unclear that it's related to only KVM TDX.

- drivers/firmware/
  As the TDX module can be considered firmware, this is yet another choice.
  Pros:
  - It follows the existing convention. It clarifies that the TDX module
    is a firmware.
  Cons:
  - It's hard to understand that the firmware is only for KVM TDX.
  - The files are far from the related code (KVM TDX).

** Coexistence of normal (VMX) VMs and TD VMs
It's required to allow both legacy (normal VMX) VMs and new TD VMs to
coexist. Otherwise the benefits of VM flexibility would be eliminated.
The main issue is that the logic of the kvm_x86_ops callbacks for TDX
is different from VMX. On the other hand, the variable kvm_x86_ops is
a single global variable, not per-VM or per-vCPU.

Several points to be considered:
  . No or minimal overhead when TDX is disabled (CONFIG_KVM_INTEL_TDX=n).
  . Avoid the overhead of indirect calls via function pointers.
  . Contain the changes under the arch/x86/kvm/vmx directory and share
    logic with VMX for maintenance.
    Even though the way of operating on a VM (VMX instructions vs. TDX
    SEAM calls) is different, the basic idea remains the same, so much
    of the logic can be shared.
  . Future maintenance
    No huge change to kvm_x86_ops is expected in the (near) future, so
    a centralized file is acceptable.

- Wrapping kvm_x86_ops: The current choice
  Introduce a dedicated file, arch/x86/kvm/vmx/main.c (the name main.c
  is chosen simply to indicate the main entry points for the callbacks),
  with wrapper functions around all the callbacks of the form
  "if (is-tdx) tdx-callback() else vmx-callback()"; a sketch follows
  the pros/cons below.

  Pros:
  - No major change in common x86 KVM code. The change is (mostly)
    contained under arch/x86/kvm/vmx/.
  - When TDX is disabled (CONFIG_KVM_INTEL_TDX=n), the overhead is
    optimized out.
  - Micro-optimization by avoiding function pointers.
  Cons:
  - A lot of boilerplate in arch/x86/kvm/vmx/main.c.
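
A minimal sketch of what one such wrapper might look like (the function
names below are illustrative and not necessarily the names used in this
series):

    /* Illustrative only: dispatch between the TDX and VMX callbacks. */
    static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu)
    {
            if (is_td_vcpu(vcpu))
                    return tdx_vcpu_run(vcpu);

            return vmx_vcpu_run(vcpu);
    }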

Alternative:
- Introduce another callback layer under arch/x86/kvm/vmx.
  Pros:
  - No major change in common x86 KVM code. The change is (mostly)
    contained under arch/x86/kvm/vmx/.
  - Clear separation of callbacks.
  Cons:
  - Overhead in VMX even when TDX is disabled (CONFIG_KVM_INTEL_TDX=n).

- Allow per-VM kvm_x86_ops callbacks instead of a global kvm_x86_ops
  Pros:
  - Clear separation of callbacks.
  Cons:
  - Big change in common x86 code.
  - Overhead in common code even when TDX is disabled
    (CONFIG_KVM_INTEL_TDX=n).

- Introduce new directory arch/x86/kvm/tdx
  Pros:
  - It clarifies that TDX is different from VMX.
  Cons:
  - Given the level of

[RFC PATCH 14/67] KVM: Add max_vcpus field in common 'struct kvm'

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Signed-off-by: Sean Christopherson 
---
 arch/arm64/include/asm/kvm_host.h | 3 ---
 arch/arm64/kvm/arm.c  | 7 ++-
 arch/arm64/kvm/vgic/vgic-init.c   | 6 +++---
 include/linux/kvm_host.h  | 1 +
 virt/kvm/kvm_main.c   | 3 ++-
 5 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 781d029b8aa8..259b05376807 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -95,9 +95,6 @@ struct kvm_arch {
/* VTCR_EL2 value for this VM */
u64vtcr;
 
-   /* The maximum number of vCPUs depends on the used GIC model */
-   int max_vcpus;
-
/* Interrupt controller */
struct vgic_distvgic;
 
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 5750ec34960e..b3ba6c66183d 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -125,7 +125,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_vgic_early_init(kvm);
 
/* The maximum number of VCPUs is limited by the host's GIC model */
-   kvm->arch.max_vcpus = kvm_arm_default_max_vcpus();
+   kvm->max_vcpus = kvm_arm_default_max_vcpus();
 
return ret;
 out_free_stage2_pgd:
@@ -193,7 +193,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_MAX_VCPUS:
case KVM_CAP_MAX_VCPU_ID:
if (kvm)
-   r = kvm->arch.max_vcpus;
+   r = kvm->max_vcpus;
else
r = kvm_arm_default_max_vcpus();
break;
@@ -247,9 +247,6 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int 
id)
if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
return -EBUSY;
 
-   if (id >= kvm->arch.max_vcpus)
-   return -EINVAL;
-
return 0;
 }
 
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 32e32d67a127..9af003b62509 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -97,11 +97,11 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
ret = 0;
 
if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
-   kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
+   kvm->max_vcpus = VGIC_V2_MAX_CPUS;
else
-   kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS;
+   kvm->max_vcpus = VGIC_V3_MAX_CPUS;
 
-   if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) {
+   if (atomic_read(&kvm->online_vcpus) > kvm->max_vcpus) {
ret = -E2BIG;
goto out_unlock;
}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ad9b6963d19d..95371750c23f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -467,6 +467,7 @@ struct kvm {
 * and is accessed atomically.
 */
atomic_t online_vcpus;
+   int max_vcpus;
int created_vcpus;
int last_boosted_vcpu;
struct list_head vm_list;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b29b6c3484dd..3dc41b6e12a0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -752,6 +752,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
mutex_init(&kvm->irq_lock);
mutex_init(&kvm->slots_lock);
INIT_LIST_HEAD(&kvm->devices);
+   kvm->max_vcpus = KVM_MAX_VCPUS;
 
BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
 
@@ -3098,7 +3099,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 
id)
return -EINVAL;
 
mutex_lock(&kvm->lock);
-   if (kvm->created_vcpus == KVM_MAX_VCPUS) {
+   if (kvm->created_vcpus >= kvm->max_vcpus) {
mutex_unlock(&kvm->lock);
return -EINVAL;
}
-- 
2.17.1



[RFC PATCH 44/67] KVM: VMX: Modify NMI and INTR handlers to take intr_info as param

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Pass intr_info to the NMI and INTR handlers instead of pulling it from
vcpu_vmx in preparation for sharing the bulk of the handlers with TDX.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/vmx/vmx.c | 16 ++--
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 997a391f0842..5d6c3a50230d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6358,25 +6358,21 @@ static void handle_interrupt_nmi_irqoff(struct kvm_vcpu 
*vcpu, u32 intr_info)
kvm_after_interrupt(vcpu);
 }
 
-static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
+static void handle_exception_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
 {
-   u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
-
/* if exit due to PF check for async PF */
if (is_page_fault(intr_info))
-   vmx->vcpu.arch.apf.host_apf_flags = 
kvm_read_and_reset_apf_flags();
+   vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
/* Handle machine checks before interrupts are enabled */
else if (is_machine_check(intr_info))
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
else if (is_nmi(intr_info))
-   handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
+   handle_interrupt_nmi_irqoff(vcpu, intr_info);
 }
 
-static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
+static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, u32 
intr_info)
 {
-   u32 intr_info = vmx_get_intr_info(vcpu);
-
if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
"KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
@@ -6389,9 +6385,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
 
if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
-   handle_external_interrupt_irqoff(vcpu);
+   handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
-   handle_exception_nmi_irqoff(vmx);
+   handle_exception_nmi_irqoff(vcpu, vmx_get_intr_info(vcpu));
 }
 
 static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
-- 
2.17.1



[RFC PATCH 42/67] KVM: x86/mmu: Move 'pfn' variable to caller of direct_page_fault()

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

When adding pages prior to boot, TDX will need the resulting host pfn so
that it can be passed to TDADDPAGE (TDX-SEAM always works with physical
addresses as it has its own page tables).  Start plumbing pfn back up
the page fault stack.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu/mmu.c | 23 +--
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e4e0c883b52d..474173bceb54 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3916,14 +3916,14 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool 
prefault, gfn_t gfn,
 }
 
 static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
-bool prefault, int max_level, bool is_tdp)
+bool prefault, int max_level, bool is_tdp,
+kvm_pfn_t *pfn)
 {
bool write = error_code & PFERR_WRITE_MASK;
bool map_writable;
 
gfn_t gfn = vcpu_gpa_to_gfn_unalias(vcpu, gpa);
unsigned long mmu_seq;
-   kvm_pfn_t pfn;
int r;
 
if (page_fault_handle_page_track(vcpu, error_code, gfn))
@@ -3942,10 +3942,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, 
gpa_t gpa, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
 
-   if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+   if (try_async_pf(vcpu, prefault, gfn, gpa, pfn, write, &map_writable))
return RET_PF_RETRY;
 
-   if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
+   if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, *pfn, ACC_ALL, &r))
return r;
 
r = RET_PF_RETRY;
@@ -3958,25 +3958,27 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, 
gpa_t gpa, u32 error_code,
 
if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, 
max_level,
-   pfn, prefault);
+   *pfn, prefault);
else
-   r = __direct_map(vcpu, gpa, error_code, map_writable, 
max_level, pfn,
-prefault, is_tdp);
+   r = __direct_map(vcpu, gpa, error_code, map_writable, max_level,
+*pfn, prefault, is_tdp);
 
 out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
-   kvm_release_pfn_clean(pfn);
+   kvm_release_pfn_clean(*pfn);
return r;
 }
 
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
u32 error_code, bool prefault)
 {
+   kvm_pfn_t pfn;
+
pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
 
/* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault,
-PG_LEVEL_2M, false);
+PG_LEVEL_2M, false, &pfn);
 }
 
 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
@@ -4015,6 +4017,7 @@ EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
   bool prefault)
 {
+   kvm_pfn_t pfn;
int max_level;
 
for (max_level = KVM_MAX_HUGEPAGE_LEVEL;
@@ -4028,7 +4031,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, 
u32 error_code,
}
 
return direct_page_fault(vcpu, gpa, error_code, prefault,
-max_level, true);
+max_level, true, &pfn);
 }
 
 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
-- 
2.17.1



[RFC PATCH 46/67] KVM: VMX: Split out guts of EPT violation to common/exposed function

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/vmx/common.h | 29 +
 arch/x86/kvm/vmx/vmx.c| 32 +---
 2 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
index 146f1da9c88d..58edf1296cbd 100644
--- a/arch/x86/kvm/vmx/common.h
+++ b/arch/x86/kvm/vmx/common.h
@@ -5,8 +5,11 @@
 #include 
 
 #include 
+#include 
 
+#include "mmu.h"
 #include "vmcs.h"
+#include "vmx.h"
 #include "x86.h"
 
 void vmx_handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info);
@@ -51,4 +54,30 @@ static inline void vmx_handle_exception_nmi_irqoff(struct 
kvm_vcpu *vcpu,
vmx_handle_interrupt_nmi_irqoff(vcpu, intr_info);
 }
 
+static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa,
+unsigned long exit_qualification)
+{
+   u64 error_code;
+
+   /* Is it a read fault? */
+   error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
+? PFERR_USER_MASK : 0;
+   /* Is it a write fault? */
+   error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
+ ? PFERR_WRITE_MASK : 0;
+   /* Is it a fetch fault? */
+   error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
+ ? PFERR_FETCH_MASK : 0;
+   /* ept page table entry is present? */
+   error_code |= (exit_qualification &
+  (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
+   EPT_VIOLATION_EXECUTABLE))
+ ? PFERR_PRESENT_MASK : 0;
+
+   error_code |= (exit_qualification & 0x100) != 0 ?
+  PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+
+   return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
+}
+
 #endif /* __KVM_X86_VMX_COMMON_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e8b60d447e27..0dad9d1816b0 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5277,11 +5277,10 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 
 static int handle_ept_violation(struct kvm_vcpu *vcpu)
 {
-   unsigned long exit_qualification;
-   gpa_t gpa;
-   u64 error_code;
+   unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+   gpa_t gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 
-   exit_qualification = vmx_get_exit_qual(vcpu);
+   trace_kvm_page_fault(gpa, exit_qualification);
 
/*
 * EPT violation happened while executing iret from NMI,
@@ -5290,30 +5289,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 * AAK134, BY25.
 */
if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
-   enable_vnmi &&
-   (exit_qualification & INTR_INFO_UNBLOCK_NMI))
+   enable_vnmi && (exit_qualification & INTR_INFO_UNBLOCK_NMI))
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 
GUEST_INTR_STATE_NMI);
 
-   gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
-   trace_kvm_page_fault(gpa, exit_qualification);
-
-   /* Is it a read fault? */
-   error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
-? PFERR_USER_MASK : 0;
-   /* Is it a write fault? */
-   error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
- ? PFERR_WRITE_MASK : 0;
-   /* Is it a fetch fault? */
-   error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
- ? PFERR_FETCH_MASK : 0;
-   /* ept page table entry is present? */
-   error_code |= (exit_qualification &
-  (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
-   EPT_VIOLATION_EXECUTABLE))
- ? PFERR_PRESENT_MASK : 0;
-
-   error_code |= (exit_qualification & 0x100) != 0 ?
-  PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
 
vcpu->arch.exit_qualification = exit_qualification;
 
@@ -5328,7 +5306,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, 
gpa)))
return kvm_emulate_instruction(vcpu, 0);
 
-   return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
+   return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification);
 }
 
 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
-- 
2.17.1



[RFC PATCH 56/67] KVM: TDX: Add macro framework to wrap TDX SEAMCALLs

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Co-developed-by: Kai Huang 
Signed-off-by: Kai Huang 
Co-developed-by: Xiaoyao Li 
Signed-off-by: Xiaoyao Li 
Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/vmx/tdx_ops.h | 531 +
 1 file changed, 531 insertions(+)
 create mode 100644 arch/x86/kvm/vmx/tdx_ops.h

diff --git a/arch/x86/kvm/vmx/tdx_ops.h b/arch/x86/kvm/vmx/tdx_ops.h
new file mode 100644
index ..a6f87cfe9bda
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx_ops.h
@@ -0,0 +1,531 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_TDX_OPS_H
+#define __KVM_X86_TDX_OPS_H
+
+#include 
+
+#include 
+#include 
+
+struct tdx_ex_ret {
+   union {
+   /* Used to retrieve values from hardware. */
+   struct {
+   u64 rcx;
+   u64 rdx;
+   u64 r8;
+   u64 r9;
+   u64 r10;
+   };
+   /* Functions that return SEPT and level that failed. */
+   struct {
+   u64 septep;
+   int level;
+   };
+   /* TDDBG{RD,WR} return the TDR, field code, and value. */
+   struct {
+   u64 tdr;
+   u64 field;
+   u64 field_val;
+   };
+   /* TDDBG{RD,WR}MEM return the address and its value. */
+   struct {
+   u64 addr;
+   u64 val;
+   };
+   /* TDRDPAGEMD and TDRECLAIMPAGE return page metadata. */
+   struct {
+   u64 page_type;
+   u64 owner;
+   u64 page_size;
+   };
+   /* TDRDSEPT returns the contents of the SEPT entry. */
+   struct {
+   u64 septe;
+   u64 ign;
+   };
+   /*
+* TDSYSINFO returns the buffer address and its size, and the
+* CMR_INFO address and its number of entries.
+*/
+   struct {
+   u64 buffer;
+   u64 nr_bytes;
+   u64 cmr_info;
+   u64 nr_cmr_entries;
+   };
+   /*
+* TDINIT and TDSYSINIT return CPUID info on error.  Note, only
+* the leaf and subleaf are valid on TDINIT error.
+*/
+   struct {
+   u32 leaf;
+   u32 subleaf;
+   u32 eax_mask;
+   u32 ebx_mask;
+   u32 ecx_mask;
+   u32 edx_mask;
+   u32 eax_val;
+   u32 ebx_val;
+   u32 ecx_val;
+   u32 edx_val;
+   };
+   /* TDSYSINITTDMR returns the input PA and next PA. */
+   struct {
+   u64 prev;
+   u64 next;
+   };
+   };
+};
+
+#define pr_seamcall_error(op, err)   \
+   pr_err_ratelimited("SEAMCALL[" #op "] failed: 0x%llx (cpu %d)\n", \
+  SEAMCALL_##op ? (err) : (err), smp_processor_id());
+
+#define TDX_ERR(err, op)   \
+({ \
+   int __ret_warn_on = WARN_ON_ONCE(err);  \
+   \
+   if (unlikely(__ret_warn_on))\
+   pr_seamcall_error(op, err); \
+   __ret_warn_on;  \
+})
+
+#define tdenter(args...)   ({ 0; })
+
+#define seamcall ".byte 0x66,0x0f,0x01,0xcf"
+
+#ifndefINTEL_TDX_BOOT_TIME_SEAMCALL
+#define __seamcall \
+   "1:" seamcall "\n\t"\
+   "jmp 3f\n\t"\
+   "2: call kvm_spurious_fault\n\t"\
+   "3:\n\t"\
+   _ASM_EXTABLE(1b, 2b)
+#else
+/*
+ * The default BUG()s on faults, which is undesirable during boot, and calls
+ * kvm_spurious_fault(), which isn't linkable if KVM is built as a module.
+ * RAX contains '0' on success, TDX-SEAM errno on failure, vector on fault.
+ */
+#define __seamcall \
+   "1:" seamcall "\n\t"\
+   "2: \n\t"   \
+   _ASM_EXTABLE_FAULT(1b, 2b)
+#endif
+
+#define seamcall_N(fn, inputs...)  \
+do {   \
+   u64 ret;\
+   \
+   asm volatile(__seamcall \
+: ASM_CALL_CONSTRAINT, "=a"

[RFC PATCH 33/67] KVM: Export kvm_is_reserved_pfn() for use by TDX

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

TDX will use kvm_is_reserved_pfn() to prevent installing a reserved PFN
into the S-EPT.  Or rather, to prevent such an attempt, as reserved PFNs
are not covered by TDMRs.

Signed-off-by: Sean Christopherson 
---
 virt/kvm/kvm_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index aa5f27753756..a60dcf682f33 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -185,6 +185,7 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
 
return true;
 }
+EXPORT_SYMBOL_GPL(kvm_is_reserved_pfn);
 
 bool kvm_is_transparent_hugepage(kvm_pfn_t pfn)
 {
-- 
2.17.1



[RFC PATCH 39/67] KVM: x86/mmu: Refactor shadow walk in __direct_map() to reduce indentation

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Employ a 'continue' to reduce the indentation for linking a new shadow
page during __direct_map() in preparation for linking private pages.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu/mmu.c | 19 +--
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 732510ecda36..25aafac9b5de 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2953,16 +2953,15 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t 
gpa, u32 error_code,
break;
 
drop_large_spte(vcpu, it.sptep);
-   if (!is_shadow_present_pte(*it.sptep)) {
-   sp = __kvm_mmu_get_page(vcpu, base_gfn,
-   gfn_stolen_bits, it.addr,
-   it.level - 1, true, ACC_ALL);
-
-   link_shadow_page(vcpu, it.sptep, sp);
-   if (is_tdp && huge_page_disallowed &&
-   req_level >= it.level)
-   account_huge_nx_page(vcpu->kvm, sp);
-   }
+   if (is_shadow_present_pte(*it.sptep))
+   continue;
+
+   sp = __kvm_mmu_get_page(vcpu, base_gfn, gfn_stolen_bits,
+   it.addr, it.level - 1, true, ACC_ALL);
+
+   link_shadow_page(vcpu, it.sptep, sp);
+   if (is_tdp && huge_page_disallowed && req_level >= it.level)
+   account_huge_nx_page(vcpu->kvm, sp);
}
 
ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
-- 
2.17.1



[RFC PATCH 63/67] cpu/hotplug: Document that TDX also depends on booting CPUs once

2020-11-16 Thread isaku . yamahata
From: Kai Huang 

Add a comment to explain that TDX also depends on booting logical CPUs
at least once.

TDSYSINITLP must be run on all CPUs, even software-disabled CPUs in the
-nosmt case.  Fortunately, the current SMT handling for #MC already supports
booting all CPUs once; the to-be-disabled sibling is booted once (and
later put into deep C-state to honor SMT=off) to allow the init code to
set CR4.MCE and avoid an unwanted shutdown on a broadcasted MCE.

Signed-off-by: Kai Huang 
---
 kernel/cpu.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ff2578ecf17..17a8d7db99b2 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -435,6 +435,10 @@ static inline bool cpu_smt_allowed(unsigned int cpu)
 * that the init code can get a chance to set CR4.MCE on each
 * CPU. Otherwise, a broadcasted MCE observing CR4.MCE=0b on any
 * core will shutdown the machine.
+*
+* Intel TDX also requires running TDSYSINITLP on all logical CPUs
+* during boot, booting all CPUs once allows TDX to play nice with
+* 'nosmt'.
 */
return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
 }
-- 
2.17.1



[RFC PATCH 29/67] KVM: x86: Add a switch_db_regs flag to handle TDX's auto-switched behavior

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Add a flag, KVM_DEBUGREG_AUTO_SWITCHED, to skip saving/restoring DRs
irrespective of any other flags.  TDX-SEAM unconditionally saves and
restores host DRs, ergo there is nothing to do.

Opportunistically convert the KVM_DEBUGREG_* definitions to use BIT().

Reported-by: Xiaoyao Li 
Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h | 7 ---
 arch/x86/kvm/x86.c  | 6 --
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a6c89666ec49..815469875445 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -464,9 +464,10 @@ struct kvm_pmu {
 struct kvm_pmu_ops;
 
 enum {
-   KVM_DEBUGREG_BP_ENABLED = 1,
-   KVM_DEBUGREG_WONT_EXIT = 2,
-   KVM_DEBUGREG_RELOAD = 4,
+   KVM_DEBUGREG_BP_ENABLED = BIT(0),
+   KVM_DEBUGREG_WONT_EXIT  = BIT(1),
+   KVM_DEBUGREG_RELOAD = BIT(2),
+   KVM_DEBUGREG_AUTO_SWITCHED  = BIT(3),
 };
 
 struct kvm_mtrr_range {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 42bd24ba7fdd..09edc3ad 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9009,7 +9009,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (test_thread_flag(TIF_NEED_FPU_LOAD))
switch_fpu_return();
 
-   if (unlikely(vcpu->arch.switch_db_regs)) {
+   if (unlikely(vcpu->arch.switch_db_regs & ~KVM_DEBUGREG_AUTO_SWITCHED)) {
set_debugreg(0, 7);
set_debugreg(vcpu->arch.eff_db[0], 0);
set_debugreg(vcpu->arch.eff_db[1], 1);
@@ -9029,6 +9029,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 */
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
+   WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCHED);
kvm_x86_ops.sync_dirty_debug_regs(vcpu);
kvm_update_dr0123(vcpu);
kvm_update_dr7(vcpu);
@@ -9042,7 +9043,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 * care about the messed up debug address registers. But if
 * we have some of them active, restore the old state.
 */
-   if (hw_breakpoint_active())
+   if (hw_breakpoint_active() &&
+   !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCHED))
hw_breakpoint_restore();
 
vcpu->arch.last_vmentry_cpu = vcpu->cpu;
-- 
2.17.1



[RFC PATCH 57/67] KVM: TDX: Stub in tdx.h with structs, accessors, and VMCS helpers

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Stub in kvm_tdx, vcpu_tdx, their various accessors, and VMCS helpers.
The VMCS helpers, which rely on the stubs, will be used by preparatory
patches to move VMX functions for accessing VMCS state to common code.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/vmx/tdx.h | 167 +
 1 file changed, 167 insertions(+)
 create mode 100644 arch/x86/kvm/vmx/tdx.h

diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
new file mode 100644
index ..b55108a8e484
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_TDX_H
+#define __KVM_X86_TDX_H
+
+#include 
+#include 
+
+#include "tdx_arch.h"
+#include "tdx_errno.h"
+#include "tdx_ops.h"
+
+#ifdef CONFIG_KVM_INTEL_TDX
+
+struct tdx_td_page {
+   unsigned long va;
+   hpa_t pa;
+   bool added;
+};
+
+struct kvm_tdx {
+   struct kvm kvm;
+
+   struct tdx_td_page tdr;
+   struct tdx_td_page tdcs[TDX1_NR_TDCX_PAGES];
+};
+
+struct vcpu_tdx {
+   struct kvm_vcpu vcpu;
+
+   struct tdx_td_page tdvpr;
+   struct tdx_td_page tdvpx[TDX1_NR_TDVPX_PAGES];
+};
+
+static inline bool is_td(struct kvm *kvm)
+{
+   return kvm->arch.vm_type == KVM_X86_TDX_VM;
+}
+
+static inline bool is_td_vcpu(struct kvm_vcpu *vcpu)
+{
+   return is_td(vcpu->kvm);
+}
+
+static inline bool is_debug_td(struct kvm_vcpu *vcpu)
+{
+   return !vcpu->kvm->arch.guest_state_protected;
+}
+
+static inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
+{
+   return container_of(kvm, struct kvm_tdx, kvm);
+}
+
+static inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
+{
+   return container_of(vcpu, struct vcpu_tdx, vcpu);
+}
+
+static __always_inline void tdvps_vmcs_check(u32 field, u8 bits)
+{
+   BUILD_BUG_ON_MSG(__builtin_constant_p(field) && (field) & 0x1,
+"Read/Write to TD VMCS *_HIGH fields not supported");
+
+   BUILD_BUG_ON(bits != 16 && bits != 32 && bits != 64);
+
+   BUILD_BUG_ON_MSG(bits != 64 && __builtin_constant_p(field) &&
+(((field) & 0x6000) == 0x2000 ||
+ ((field) & 0x6000) == 0x6000),
+"Invalid TD VMCS access for 64-bit field");
+   BUILD_BUG_ON_MSG(bits != 32 && __builtin_constant_p(field) &&
+((field) & 0x6000) == 0x4000,
+"Invalid TD VMCS access for 32-bit field");
+   BUILD_BUG_ON_MSG(bits != 16 && __builtin_constant_p(field) &&
+((field) & 0x6000) == 0x,
+"Invalid TD VMCS access for 16-bit field");
+}
+
+static __always_inline void tdvps_gpr_check(u64 field, u8 bits)
+{
+   BUILD_BUG_ON_MSG(__builtin_constant_p(field) && (field) >= NR_VCPU_REGS,
+"Invalid TD guest GPR index");
+}
+
+static __always_inline void tdvps_apic_check(u64 field, u8 bits) {}
+static __always_inline void tdvps_dr_check(u64 field, u8 bits) {}
+static __always_inline void tdvps_state_check(u64 field, u8 bits) {}
+static __always_inline void tdvps_msr_check(u64 field, u8 bits) {}
+static __always_inline void tdvps_management_check(u64 field, u8 bits) {}
+
+#define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass)
   \
+static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx,  
\
+   u32 field) \
+{ \
+   struct tdx_ex_ret ex_ret;  \
+   u64 err;   \
+  \
+   tdvps_##lclass##_check(field, bits);   \
+   err = tdrdvps(tdx->tdvpr.pa, TDVPS_##uclass(field), &ex_ret);  \
+   if (unlikely(err)) {   \
+   pr_err("TDRDVPS["#uclass".0x%x] failed: 0x%llx\n", field, err);\
+   return 0;  \
+   }  \
+   return (u##bits)ex_ret.r8; \
+} \
+static __always_inline void td_##lclass##_write##bits(struct vcpu_tdx *tdx,
\
+ u32 field, u##bits val)  \
+{ \
+   struct tdx_ex_ret ex_ret;  \
+   u64 err;   \
+  \
+   tdvps_##lclass##_ch

[RFC PATCH 34/67] KVM: x86: Add infrastructure for stolen GPA bits

2020-11-16 Thread isaku . yamahata
From: Rick Edgecombe 

Add support in KVM's MMU for aliasing multiple GPAs (from a hardware
perspective) to a single GPA (from a memslot perspective). GPA aliasing
will be used to repurpose GPA bits as attribute bits, e.g. to expose an
execute-only permission bit to the guest. To keep the implementation
simple (relatively speaking), GPA aliasing is only supported via TDP.

Today KVM assumes two things that are broken by GPA aliasing.
  1. GPAs coming from hardware can be simply shifted to get the GFNs.
  2. GPA bits 51:MAXPHYADDR are reserved to zero.

With GPA aliasing, translating a GPA to GFN requires masking off the
repurposed bit, and a repurposed bit may reside in 51:MAXPHYADDR.

To support GPA aliasing, introduce the concept of per-VM GPA stolen bits,
that is, bits stolen from the GPA to act as new virtualized attribute
bits. A bit in the mask will cause the MMU code to create aliases of the
GPA. It can also be used to find the GFN out of a GPA coming from a tdp
fault.

To handle case (1) from above, retain any stolen bits when passing a GPA
in KVM's MMU code, but strip them when converting to a GFN so that the
GFN contains only the "real" GFN, i.e. never has repurposed bits set.

GFNs (without stolen bits) continue to be used to:
-Specify physical memory by userspace via memslots
-Map GPAs to TDP PTEs via RMAP
-Specify dirty tracking and write protection
-Look up MTRR types
-Inject async page faults

Since there are now multiple GPA aliases for the same underlying memory,
when the userspace memory backing the memslots is paged out, both aliases
need to be modified. Fortunately this happens automatically: because the
rmap already supports multiple mappings for the same GFN (for PTE-shadowing
based paging), adding/removing each alias's PTE under its GFN means that
kvm_handle_hva() based operations are applied to both aliases.

In the case of the rmap being removed in the future, the needed
information could be recovered by iterating over the stolen bits and
walking the TDP page tables.

For TLB flushes that are address based, make sure to flush both aliases
in the stolen bits case.

Only support stolen bits in 64 bit guest paging modes (long, PAE).
Features that use this infrastructure should restrict the stolen bits to
exclude the other paging modes. Don't support stolen bits for shadow EPT.
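
As a rough illustration (not part of the patch), assuming the VM "steals"
GPA bit 51 (the mask is still 0 in this patch), both aliases resolve to the
same GFN once the stolen bits are masked off, which is exactly what
vcpu_gpa_to_gfn_unalias() does:

        gpa_t plain_gpa = 0x123456000ull;
        gpa_t alias_gpa = plain_gpa | BIT_ULL(51);

        /* Both GPAs map to the same memslot GFN. */
        WARN_ON(vcpu_gpa_to_gfn_unalias(vcpu, plain_gpa) !=
                vcpu_gpa_to_gfn_unalias(vcpu, alias_gpa));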

Signed-off-by: Rick Edgecombe 
---
 arch/x86/kvm/mmu.h  | 26 ++
 arch/x86/kvm/mmu/mmu.c  | 86 ++---
 arch/x86/kvm/mmu/mmu_internal.h |  1 +
 arch/x86/kvm/mmu/paging_tmpl.h  | 25 ++
 4 files changed, 101 insertions(+), 37 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 9c4a9c8e43d9..7ce8f0256d6d 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -220,4 +220,30 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
 int kvm_mmu_post_init_vm(struct kvm *kvm);
 void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
 
+static inline gfn_t kvm_gfn_stolen_mask(struct kvm *kvm)
+{
+   /* Currently there are no stolen bits in KVM */
+   return 0;
+}
+
+static inline gfn_t vcpu_gfn_stolen_mask(struct kvm_vcpu *vcpu)
+{
+   return kvm_gfn_stolen_mask(vcpu->kvm);
+}
+
+static inline gpa_t kvm_gpa_stolen_mask(struct kvm *kvm)
+{
+   return kvm_gfn_stolen_mask(kvm) << PAGE_SHIFT;
+}
+
+static inline gpa_t vcpu_gpa_stolen_mask(struct kvm_vcpu *vcpu)
+{
+   return kvm_gpa_stolen_mask(vcpu->kvm);
+}
+
+static inline gfn_t vcpu_gpa_to_gfn_unalias(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+   return (gpa >> PAGE_SHIFT) & ~vcpu_gfn_stolen_mask(vcpu);
+}
+
 #endif
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index bebd2b6ebcad..76de8d48165d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -187,27 +187,37 @@ static inline bool 
kvm_available_flush_tlb_with_range(void)
return kvm_x86_ops.tlb_remote_flush_with_range;
 }
 
-static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
-   struct kvm_tlb_range *range)
-{
-   int ret = -ENOTSUPP;
-
-   if (range && kvm_x86_ops.tlb_remote_flush_with_range)
-   ret = kvm_x86_ops.tlb_remote_flush_with_range(kvm, range);
-
-   if (ret)
-   kvm_flush_remote_tlbs(kvm);
-}
-
 void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
u64 start_gfn, u64 pages)
 {
struct kvm_tlb_range range;
+   u64 gfn_stolen_mask;
+
+   if (!kvm_x86_ops.tlb_remote_flush_with_range)
+   goto generic_flush;
+
+   /*
+* Fall back to the big hammer flush if there is more than one
+* GPA alias that needs to be flushed.
+*/
+   gfn_stolen_mask = kvm_gfn_stolen_mask(kvm);
+   if (hweight64(gfn_stolen_mask) > 1)
+   goto generic_flush;
 
range.start_gfn = start_gfn;
range.pages = pages;
+   if (kvm_x86_ops.tlb_remote_flush_with_range(kvm, &range))
+   goto generic_flush;
+
+   if (!gfn_stolen_mas

[RFC PATCH 65/67] KVM: x86: Mark the VM (TD) as bugged if non-coherent DMA is detected

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

TDX is not supported on platforms with non-coherent IOMMUs because SEPT
doesn't allow the memtype control that's needed to support non-coherent
DMA.  Freak out, i.e. mark the VM as bugged, if such a device is
encountered.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/x86.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5566e7f25ce6..05dbdfdd7a8b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -11144,6 +11144,7 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
 
 void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
 {
+   KVM_BUG_ON(kvm->arch.vm_type == KVM_X86_TDX_VM, kvm);
atomic_inc(&kvm->arch.noncoherent_dma_count);
 }
 EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
-- 
2.17.1



[RFC PATCH 55/67] KVM: TDX: Add SEAMRR related MSRs macro definition

2020-11-16 Thread isaku . yamahata
From: Kai Huang 

Two new MSRs IA32_SEAMRR_PHYS_BASE and IA32_SEAMRR_PHYS_MASK are added
in SPR for TDX. Add macro definition for both of them.
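
As a hedged sketch (not part of this patch) of how a later patch might use
these definitions to probe whether SEAMRR is usable:

        u64 base, mask;

        rdmsrl(MSR_IA32_SEAMRR_PHYS_BASE, base);
        rdmsrl(MSR_IA32_SEAMRR_PHYS_MASK, mask);

        /* SEAMRR is usable only if BIOS configured, enabled and locked it. */
        if (!(base & MSR_IA32_SEAMRR_PHYS_BASE_CONFIGURED) ||
            !(mask & MSR_IA32_SEAMRR_PHYS_MASK_ENABLED) ||
            !(mask & MSR_IA32_SEAMRR_PHYS_MASK_LOCKED))
                return false;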

Signed-off-by: Kai Huang 
---
 arch/x86/include/asm/msr-index.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index aad12236b33c..f42da6b11b42 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -924,4 +924,12 @@
 #define MSR_VM_IGNNE0xc0010115
 #define MSR_VM_HSAVE_PA 0xc0010117
 
+/* Intel SEAMRR */
+#define MSR_IA32_SEAMRR_PHYS_BASE  0x1400
+#define MSR_IA32_SEAMRR_PHYS_MASK  0x1401
+
+#define MSR_IA32_SEAMRR_PHYS_BASE_CONFIGURED   (1ULL << 3)
+#define MSR_IA32_SEAMRR_PHYS_MASK_ENABLED  (1ULL << 11)
+#define MSR_IA32_SEAMRR_PHYS_MASK_LOCKED   (1ULL << 10)
+
 #endif /* _ASM_X86_MSR_INDEX_H */
-- 
2.17.1



[RFC PATCH 60/67] KVM: VMX: MOVE GDT and IDT accessors to common code

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/vmx/main.c |  6 --
 arch/x86/kvm/vmx/vmx.c  | 12 
 2 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 30b1815fd5a7..53e1ea8df861 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -317,7 +317,8 @@ static int vt_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
 static void vt_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
-   vmx_get_idt(vcpu, dt);
+   dt->size = vmread32(vcpu, GUEST_IDTR_LIMIT);
+   dt->address = vmreadl(vcpu, GUEST_IDTR_BASE);
 }
 
 static void vt_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
@@ -327,7 +328,8 @@ static void vt_set_idt(struct kvm_vcpu *vcpu, struct 
desc_ptr *dt)
 
 static void vt_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
-   vmx_get_gdt(vcpu, dt);
+   dt->size = vmread32(vcpu, GUEST_GDTR_LIMIT);
+   dt->address = vmreadl(vcpu, GUEST_GDTR_BASE);
 }
 
 static void vt_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 8bd71b91c6f0..93b319eacdfa 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3217,24 +3217,12 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, 
int *db, int *l)
*l = (ar >> 13) & 1;
 }
 
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
-   dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
-   dt->address = vmcs_readl(GUEST_IDTR_BASE);
-}
-
 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
vmcs_writel(GUEST_IDTR_BASE, dt->address);
 }
 
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
-   dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
-   dt->address = vmcs_readl(GUEST_GDTR_BASE);
-}
-
 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
-- 
2.17.1



[RFC PATCH 36/67] KVM: x86/mmu: Track shadow MMIO value on a per-VM basis

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/mmu.h  |  4 +++-
 arch/x86/kvm/mmu/mmu.c  | 24 +++-
 arch/x86/kvm/mmu/spte.c | 26 ++
 arch/x86/kvm/mmu/spte.h |  2 +-
 arch/x86/kvm/svm/svm.c  |  2 +-
 arch/x86/kvm/vmx/vmx.c  | 18 +++---
 7 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6dfc09092bc9..d4fd9859fcd5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -915,6 +915,8 @@ struct kvm_arch {
struct kvm_page_track_notifier_node mmu_sp_tracker;
struct kvm_page_track_notifier_head track_notifier_head;
 
+   u64 shadow_mmio_value;
+
struct list_head assigned_dev_head;
struct iommu_domain *iommu_domain;
bool iommu_noncoherent;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 7ce8f0256d6d..05c2898cb2a2 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -52,7 +52,9 @@ static inline u64 rsvd_bits(int s, int e)
return ((1ULL << (e - s + 1)) - 1) << s;
 }
 
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask);
+void kvm_mmu_set_mmio_spte_mask(struct kvm *kvm, u64 mmio_value,
+   u64 access_mask);
+void kvm_mmu_set_default_mmio_spte_mask(u64 mask);
 
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index c4d657b26066..da2a58fa86a8 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5507,6 +5507,9 @@ void kvm_mmu_init_vm(struct kvm *kvm)
node->track_write = kvm_mmu_pte_write;
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
kvm_page_track_register_notifier(kvm, node);
+
+   kvm_mmu_set_mmio_spte_mask(kvm, shadow_default_mmio_mask,
+  ACC_WRITE_MASK | ACC_USER_MASK);
 }
 
 void kvm_mmu_uninit_vm(struct kvm *kvm)
@@ -5835,25 +5838,6 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
 }
 
-static void kvm_set_mmio_spte_mask(void)
-{
-   u64 mask;
-
-   /*
-* Set a reserved PA bit in MMIO SPTEs to generate page faults with
-* PFEC.RSVD=1 on MMIO accesses.  64-bit PTEs (PAE, x86-64, and EPT
-* paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
-* 52-bit physical addresses then there are no reserved PA bits in the
-* PTEs and so the reserved PA approach must be disabled.
-*/
-   if (shadow_phys_bits < 52)
-   mask = BIT_ULL(51) | PT_PRESENT_MASK;
-   else
-   mask = 0;
-
-   kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK);
-}
-
 static bool get_nx_auto_mode(void)
 {
/* Return true when CPU has the bug, and mitigations are ON */
@@ -5919,8 +5903,6 @@ int kvm_mmu_module_init(void)
 
kvm_mmu_reset_all_pte_masks();
 
-   kvm_set_mmio_spte_mask();
-
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
0, SLAB_ACCOUNT, NULL);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index fcac2cac78fe..574c8ccac0bf 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -22,7 +22,7 @@ u64 __read_mostly shadow_x_mask; /* mutual exclusive with 
nx_mask */
 u64 __read_mostly shadow_user_mask;
 u64 __read_mostly shadow_accessed_mask;
 u64 __read_mostly shadow_dirty_mask;
-u64 __read_mostly shadow_mmio_value;
+u64 __read_mostly shadow_default_mmio_mask;
 u64 __read_mostly shadow_mmio_access_mask;
 u64 __read_mostly shadow_present_mask;
 u64 __read_mostly shadow_me_mask;
@@ -52,7 +52,7 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned 
int access)
u64 gpa = gfn << PAGE_SHIFT;
 
access &= shadow_mmio_access_mask;
-   mask |= shadow_mmio_value | access;
+   mask |= vcpu->kvm->arch.shadow_mmio_value | SPTE_MMIO_MASK | access;
mask |= gpa | shadow_nonpresent_or_rsvd_mask;
mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
@@ -242,12 +242,13 @@ u64 mark_spte_for_access_track(u64 spte)
return spte;
 }
 
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
+void kvm_mmu_set_mmio_spte_mask(struct kvm *kvm, u64 mmio_value,
+   u64 access_mask)
 {
BUG_ON((u64)(unsigned)access_mask != access_mask);
WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << 
SHADOW_NONPRESENT_OR_RSVD_MASK_LEN));
WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
-   shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
+   kvm->arch.shadow_mmio_val

[RFC PATCH 32/67] KVM: x86: Add guest_supported_xss placeholder

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Add a per-vCPU placeholder for the supported XSS of the guest so that the
TDX configuration code doesn't need to hack in manual computation of the
supported XSS.  KVM XSS enabling is currently being upstreamed, i.e.
guest_supported_xss will no longer be a placeholder by the time TDX is
ready for upstreaming (hopefully).
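
As a hedged sketch (not in this patch) of how the field would eventually be
populated from CPUID.(EAX=0xD,ECX=1).{EDX:ECX}, mirroring how
guest_supported_xcr0 is computed:

        struct kvm_cpuid_entry2 *best;

        /* Sketch only: clamp the guest's XSS by what KVM supports. */
        best = kvm_find_cpuid_entry(vcpu, 0xd, 1);
        if (best)
                vcpu->arch.guest_supported_xss =
                        (((u64)best->edx << 32) | best->ecx) & supported_xss;
        else
                vcpu->arch.guest_supported_xss = 0;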

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 815469875445..6dfc09092bc9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -612,6 +612,7 @@ struct kvm_vcpu_arch {
 
u64 xcr0;
u64 guest_supported_xcr0;
+   u64 guest_supported_xss;
 
struct kvm_pio_request pio;
void *pio_data;
-- 
2.17.1



[RFC PATCH 53/67] KVM: TDX: Add architectural definitions for structures and values

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Co-developed-by: Kai Huang 
Signed-off-by: Kai Huang 
Co-developed-by: Xiaoyao Li 
Signed-off-by: Xiaoyao Li 
Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/vmx/tdx_arch.h | 230 
 1 file changed, 230 insertions(+)
 create mode 100644 arch/x86/kvm/vmx/tdx_arch.h

diff --git a/arch/x86/kvm/vmx/tdx_arch.h b/arch/x86/kvm/vmx/tdx_arch.h
new file mode 100644
index ..d13db55e5086
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx_arch.h
@@ -0,0 +1,230 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_TDX_ARCH_H
+#define __KVM_X86_TDX_ARCH_H
+
+#include 
+
+/*
+ * SEAMCALL API function leaf
+ */
+#define SEAMCALL_TDENTER   0
+#define SEAMCALL_TDADDCX   1
+#define SEAMCALL_TDADDPAGE 2
+#define SEAMCALL_TDADDSEPT 3
+#define SEAMCALL_TDADDVPX  4
+#define SEAMCALL_TDASSIGNHKID  5
+#define SEAMCALL_TDAUGPAGE 6
+#define SEAMCALL_TDBLOCK   7
+#define SEAMCALL_TDCONFIGKEY   8
+#define SEAMCALL_TDCREATE  9
+#define SEAMCALL_TDCREATEVP10
+#define SEAMCALL_TDDBGRD   11
+#define SEAMCALL_TDDBGRDMEM12
+#define SEAMCALL_TDDBGWR   13
+#define SEAMCALL_TDDBGWRMEM14
+#define SEAMCALL_TDDEMOTEPAGE  15
+#define SEAMCALL_TDEXTENDMR16
+#define SEAMCALL_TDFINALIZEMR  17
+#define SEAMCALL_TDFLUSHVP 18
+#define SEAMCALL_TDFLUSHVPDONE 19
+#define SEAMCALL_TDFREEHKIDS   20
+#define SEAMCALL_TDINIT21
+#define SEAMCALL_TDINITVP  22
+#define SEAMCALL_TDPROMOTEPAGE 23
+#define SEAMCALL_TDRDPAGEMD24
+#define SEAMCALL_TDRDSEPT  25
+#define SEAMCALL_TDRDVPS   26
+#define SEAMCALL_TDRECLAIMHKIDS27
+#define SEAMCALL_TDRECLAIMPAGE 28
+#define SEAMCALL_TDREMOVEPAGE  29
+#define SEAMCALL_TDREMOVESEPT  30
+#define SEAMCALL_TDSYSCONFIGKEY31
+#define SEAMCALL_TDSYSINFO 32
+#define SEAMCALL_TDSYSINIT 33
+
+#define SEAMCALL_TDSYSINITLP   35
+#define SEAMCALL_TDSYSINITTDMR 36
+#define SEAMCALL_TDTEARDOWN37
+#define SEAMCALL_TDTRACK   38
+#define SEAMCALL_TDUNBLOCK 39
+#define SEAMCALL_TDWBCACHE 40
+#define SEAMCALL_TDWBINVDPAGE  41
+#define SEAMCALL_TDWRSEPT  42
+#define SEAMCALL_TDWRVPS   43
+#define SEAMCALL_TDSYSSHUTDOWNLP   44
+#define SEAMCALL_TDSYSCONFIG   45
+
+#define TDVMCALL_MAP_GPA   0x10001
+#define TDVMCALL_REPORT_FATAL_ERROR0x10003
+
+/* TDX control structure (TDR/TDCS/TDVPS) field access codes */
+#define TDX_CLASS_SHIFT56
+#define TDX_FIELD_MASK GENMASK_ULL(31, 0)
+
+#define BUILD_TDX_FIELD(class, field)  \
+   (((u64)(class) << TDX_CLASS_SHIFT) | ((u64)(field) & TDX_FIELD_MASK))
+
+/* @field is the VMCS field encoding */
+#define TDVPS_VMCS(field)  BUILD_TDX_FIELD(0, (field))
+
+/*
+ * @offset is the offset (in bytes) from the beginning of the architectural
+ * virtual APIC page.
+ */
+#define TDVPS_APIC(offset) BUILD_TDX_FIELD(1, (offset))
+
+/* @gpr is the index of a general purpose register, e.g. eax=0 */
+#define TDVPS_GPR(gpr) BUILD_TDX_FIELD(16, (gpr))
+
+#define TDVPS_DR(dr)   BUILD_TDX_FIELD(17, (0 + (dr)))
+
+enum tdx_guest_other_state {
+   TD_VCPU_XCR0 = 32,
+   TD_VCPU_IWK_ENCKEY0 = 64,
+   TD_VCPU_IWK_ENCKEY1,
+   TD_VCPU_IWK_ENCKEY2,
+   TD_VCPU_IWK_ENCKEY3,
+   TD_VCPU_IWK_INTKEY0 = 68,
+   TD_VCPU_IWK_INTKEY1,
+   TD_VCPU_IWK_FLAGS = 70,
+};
+
+/* @field is any of enum tdx_guest_other_state */
+#define TDVPS_STATE(field) BUILD_TDX_FIELD(17, (field))
+
+/* @msr is the MSR index */
+#define TDVPS_MSR(msr) BUILD_TDX_FIELD(19, (msr))
+
+/* Management class fields */
+enum tdx_guest_management {
+   TD_VCPU_PEND_NMI = 11,
+};
+
+/* @field is any of enum tdx_guest_management */
+#define TDVPS_MANAGEMENT(field)BUILD_TDX_FIELD(32, (field))
+
+#define TDX1_NR_TDCX_PAGES 4
+#define TDX1_NR_TDVPX_PAGES5
+
+#define TDX1_MAX_NR_CPUID_CONFIGS  6
+#define TDX1_MAX_NR_CMRS   32
+#define TDX1_MAX_NR_TDMRS  64
+#define TDX1_EXTENDMR_CHUNKSIZE256
+
+struct tdx_cpuid_config {
+   u32 leaf;
+   u32 sub_leaf;
+   u32 eax;
+   u32 ebx;
+   u32 ecx;
+   u32 edx;
+} __packed;
+
+struct tdx_cpuid_value {
+   u32 eax;
+   u32 ebx;
+   u32 ecx;
+   u32 edx;
+} __packed;
+
+#define TDX1_TD_ATTRIBUTE_DEBUGBIT_ULL(0)
+#define TDX1_TD_ATTRIBUTE_SYSPROF  BIT_ULL(1)
+#define TDX1_TD_ATTRIBUTE_PKS  BIT_ULL(30)
+#define TDX1_TD_ATTRIBUTE_KL   BIT_ULL(31)
+#define TDX1_TD_ATTRIBUTE_PERFMON  BIT_ULL(63)
+
+/*
+ * TD_PARAMS is

[RFC PATCH 40/67] KVM: x86/mmu: Return old SPTE from mmu_spte_clear_track_bits()

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Return the old SPTE when clearing a SPTE and push the "old SPTE present"
check to the caller.  Private shadow page support will use the old SPTE
in rmap_remove() to determine whether or not there is a linked private
shadow page.
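
A hedged sketch of the intended future use in drop_spte()-like paths;
is_private_spte() and drop_private_sp() are hypothetical names, not defined
anywhere in this series yet:

        u64 old_spte = mmu_spte_clear_track_bits(sptep);

        if (!is_shadow_present_pte(old_spte))
                return;

        /* Hypothetical: tear down a linked private (SEPT) page, if any. */
        if (is_private_spte(old_spte))
                drop_private_sp(kvm, sptep, old_spte);

        rmap_remove(kvm, sptep);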

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu/mmu.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 25aafac9b5de..8d847c3abf1d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -552,9 +552,9 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
  * Rules for using mmu_spte_clear_track_bits:
  * It sets the sptep from present to nonpresent, and track the
  * state bits, it is used to clear the last level sptep.
- * Returns non-zero if the PTE was previously valid.
+ * Returns the old PTE.
  */
-static int mmu_spte_clear_track_bits(u64 *sptep)
+static u64 mmu_spte_clear_track_bits(u64 *sptep)
 {
kvm_pfn_t pfn;
u64 old_spte = *sptep;
@@ -565,7 +565,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
old_spte = __update_clear_spte_slow(sptep, shadow_init_value);
 
if (!is_shadow_present_pte(old_spte))
-   return 0;
+   return old_spte;
 
pfn = spte_to_pfn(old_spte);
 
@@ -582,7 +582,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
if (is_dirty_spte(old_spte))
kvm_set_pfn_dirty(pfn);
 
-   return 1;
+   return old_spte;
 }
 
 /*
@@ -1113,7 +1113,9 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
 
 static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
-   if (mmu_spte_clear_track_bits(sptep))
+   u64 old_spte = mmu_spte_clear_track_bits(sptep);
+
+   if (is_shadow_present_pte(old_spte))
rmap_remove(kvm, sptep);
 }
 
-- 
2.17.1



[RFC PATCH 49/67] KVM: VMX: Add 'main.c' to wrap VMX and TDX

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Wrap the VMX kvm_x86_ops hooks in preparation for adding TDX, which can
coexist with VMX, i.e. KVM can run both VMs and TDs.  Use 'vt' for the
naming scheme as a nod to VT-x and as a concatenation of VmxTdx.
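
Once TDX is in place, these wrappers are the natural fork point between VMX
and TDX; a hedged sketch of the eventual shape (tdx_vcpu_run() is
hypothetical at this point in the series):

        static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu)
        {
                if (is_td_vcpu(vcpu))
                        return tdx_vcpu_run(vcpu);

                return vmx_vcpu_run(vcpu);
        }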

Co-developed-by: Xiaoyao Li 
Signed-off-by: Xiaoyao Li 
Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/Makefile   |   2 +-
 arch/x86/kvm/vmx/main.c | 720 
 arch/x86/kvm/vmx/vmx.c  | 304 -
 3 files changed, 784 insertions(+), 242 deletions(-)
 create mode 100644 arch/x86/kvm/vmx/main.c

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b80e16d4..4192b252eba0 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -18,7 +18,7 @@ kvm-y += x86.o emulate.o i8259.o irq.o 
lapic.o \
   hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
   mmu/spte.o mmu/tdp_iter.o mmu/tdp_mmu.o
 
-kvm-intel-y+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o 
\
+kvm-intel-y+= vmx/main.o vmx/vmenter.o vmx/pmu_intel.o 
vmx/vmcs12.o \
   vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
 kvm-amd-y  += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o 
svm/avic.o svm/sev.o
 
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
new file mode 100644
index ..85bc238c0852
--- /dev/null
+++ b/arch/x86/kvm/vmx/main.c
@@ -0,0 +1,720 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+
+#include "vmx.c"
+
+static struct kvm_x86_ops vt_x86_ops __initdata;
+
+static int __init vt_cpu_has_kvm_support(void)
+{
+   return cpu_has_vmx();
+}
+
+static int __init vt_disabled_by_bios(void)
+{
+   return vmx_disabled_by_bios();
+}
+
+static int __init vt_check_processor_compatibility(void)
+{
+   int ret;
+
+   ret = vmx_check_processor_compat();
+   if (ret)
+   return ret;
+
+   return 0;
+}
+
+static __init int vt_hardware_setup(void)
+{
+   int ret;
+
+   ret = hardware_setup(&vt_x86_ops);
+   if (ret)
+   return ret;
+
+   return 0;
+}
+
+static int vt_hardware_enable(void)
+{
+   return hardware_enable();
+}
+
+static void vt_hardware_disable(void)
+{
+   hardware_disable();
+}
+
+static bool vt_cpu_has_accelerated_tpr(void)
+{
+   return report_flexpriority();
+}
+
+static bool vt_is_vm_type_supported(unsigned long type)
+{
+   return type == KVM_X86_LEGACY_VM;
+}
+
+static int vt_vm_init(struct kvm *kvm)
+{
+   return vmx_vm_init(kvm);
+}
+
+static void vt_vm_teardown(struct kvm *kvm)
+{
+
+}
+
+static void vt_vm_destroy(struct kvm *kvm)
+{
+
+}
+
+static int vt_vcpu_create(struct kvm_vcpu *vcpu)
+{
+   return vmx_create_vcpu(vcpu);
+}
+
+static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu)
+{
+   return vmx_vcpu_run(vcpu);
+}
+
+static void vt_vcpu_free(struct kvm_vcpu *vcpu)
+{
+   return vmx_free_vcpu(vcpu);
+}
+
+static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+   return vmx_vcpu_reset(vcpu, init_event);
+}
+
+static void vt_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+   return vmx_vcpu_load(vcpu, cpu);
+}
+
+static void vt_vcpu_put(struct kvm_vcpu *vcpu)
+{
+   return vmx_vcpu_put(vcpu);
+}
+
+static int vt_handle_exit(struct kvm_vcpu *vcpu,
+enum exit_fastpath_completion fastpath)
+{
+   return vmx_handle_exit(vcpu, fastpath);
+}
+
+static void vt_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+{
+   vmx_handle_exit_irqoff(vcpu);
+}
+
+static int vt_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+   return vmx_skip_emulated_instruction(vcpu);
+}
+
+static void vt_update_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+   vmx_update_emulated_instruction(vcpu);
+}
+
+static int vt_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+   return vmx_set_msr(vcpu, msr_info);
+}
+
+static int vt_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+   return vmx_smi_allowed(vcpu, for_injection);
+}
+
+static int vt_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+{
+   return vmx_pre_enter_smm(vcpu, smstate);
+}
+
+static int vt_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+{
+   return vmx_pre_leave_smm(vcpu, smstate);
+}
+
+static void vt_enable_smi_window(struct kvm_vcpu *vcpu)
+{
+   /* RSM will cause a vmexit anyway.  */
+}
+
+static bool vt_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn,
+  int insn_len)
+{
+   return vmx_can_emulate_instruction(vcpu, insn, insn_len);
+}
+
+static int vt_check_intercept(struct kvm_vcpu *vcpu,
+struct x86_instruction_info *info,
+enum x86_intercept_stage stage,
+struct x86_exception *exception)
+{
+   return vmx_check_intercept(vcpu, info, stage, exception);
+}
+
+static bool vt_a

[RFC PATCH 41/67] KVM: x86/mmu: Frame in support for private/inaccessible shadow pages

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Add kvm_x86_ops hooks to set/clear private SPTEs, i.e. SEPT entries, and
to link/free private shadow pages, i.e. non-leaf SEPT pages.

Because SEAMCALLs are bloody expensive, and because KVM's MMU is already
complex enough, TDX's SEPT will mirror KVM's shadow pages instead of
replacing them outright.  This costs extra memory, but is simpler and
far more performant.

Add a separate list for tracking active private shadow pages.  Zapping
and freeing SEPT entries is subject to very different rules than normal
pages/memory, and SEPT entries need to be preserved (along with their
shadow page counterparts) when KVM gets trigger happy, e.g. zaps
everything during a memslot update.

Zap any aliases of a GPA when mapping in a guest that supports guest
private GPAs.  This is necessary to avoid integrity failures with TDX
due to pointing shared and private GPAs at the same HPA.

Do not prefetch private pages (this should probably be a property of the
VM).
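
As a rough, hypothetical sketch of how the MMU could invoke the new hooks
when installing a leaf SPTE for a private GPA (kvm_is_private_gfn() is a
stand-in name, not something this patch adds):

        /*
         * Hypothetical dispatch: mirror leaf SPTEs for private GPAs into
         * the SEPT via the new hook; shared GPAs take the normal path.
         */
        if (kvm_is_private_gfn(vcpu->kvm, gfn) && kvm_x86_ops.set_private_spte)
                kvm_x86_ops.set_private_spte(vcpu, gfn, level, pfn);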

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h |  23 ++-
 arch/x86/kvm/mmu.h  |   3 +-
 arch/x86/kvm/mmu/mmu.c  | 270 +++-
 arch/x86/kvm/mmu/mmu_internal.h |   4 +
 arch/x86/kvm/mmu/spte.h |  11 +-
 arch/x86/kvm/x86.c  |   4 +-
 6 files changed, 269 insertions(+), 46 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d4fd9859fcd5..9f7349aa3c77 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -361,6 +361,7 @@ struct kvm_mmu {
void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
   u64 *spte, const void *pte);
hpa_t root_hpa;
+   hpa_t private_root_hpa;
gpa_t root_pgd;
union kvm_mmu_role mmu_role;
u8 root_level;
@@ -595,6 +596,7 @@ struct kvm_vcpu_arch {
struct kvm_mmu_memory_cache mmu_shadow_page_cache;
struct kvm_mmu_memory_cache mmu_gfn_array_cache;
struct kvm_mmu_memory_cache mmu_page_header_cache;
+   struct kvm_mmu_memory_cache mmu_private_sp_cache;
 
/*
 * QEMU userspace and the guest each have their own FPU state.
@@ -910,6 +912,7 @@ struct kvm_arch {
 * Hash table of struct kvm_mmu_page.
 */
struct list_head active_mmu_pages;
+   struct list_head private_mmu_pages;
struct list_head zapped_obsolete_pages;
struct list_head lpage_disallowed_mmu_pages;
struct kvm_page_track_notifier_node mmu_sp_tracker;
@@ -1020,6 +1023,8 @@ struct kvm_arch {
struct list_head tdp_mmu_roots;
/* List of struct tdp_mmu_pages not being used as roots */
struct list_head tdp_mmu_pages;
+
+   gfn_t gfn_shared_mask;
 };
 
 struct kvm_vm_stat {
@@ -1199,6 +1204,17 @@ struct kvm_x86_ops {
void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd,
 int pgd_level);
 
+   void (*set_private_spte)(struct kvm_vcpu *vcpu, gfn_t gfn, int level,
+kvm_pfn_t pfn);
+   void (*drop_private_spte)(struct kvm *kvm, gfn_t gfn, int level,
+ kvm_pfn_t pfn);
+   void (*zap_private_spte)(struct kvm *kvm, gfn_t gfn, int level);
+   void (*unzap_private_spte)(struct kvm *kvm, gfn_t gfn, int level);
+   int (*link_private_sp)(struct kvm_vcpu *vcpu, gfn_t gfn, int level,
+  void *private_sp);
+   int (*free_private_sp)(struct kvm *kvm, gfn_t gfn, int level,
+  void *private_sp);
+
bool (*has_wbinvd_exit)(void);
 
/* Returns actual tsc_offset set in active VMCS */
@@ -1378,7 +1394,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
   struct kvm_memory_slot *slot,
   gfn_t gfn_offset, unsigned long mask);
-void kvm_mmu_zap_all(struct kvm *kvm);
+void kvm_mmu_zap_all_active(struct kvm *kvm);
+void kvm_mmu_zap_all_private(struct kvm *kvm);
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
 unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
@@ -1532,7 +1549,9 @@ static inline int __kvm_irq_line_state(unsigned long 
*irq_state,
 
 #define KVM_MMU_ROOT_CURRENT   BIT(0)
 #define KVM_MMU_ROOT_PREVIOUS(i)   BIT(1+i)
-#define KVM_MMU_ROOTS_ALL  (~0UL)
+#define KVM_MMU_ROOT_PRIVATE   BIT(1+KVM_MMU_NUM_PREV_ROOTS)
+#define KVM_MMU_ROOTS_ALL  ((u32)(~KVM_MMU_ROOT_PRIVATE))
+#define KVM_MMU_ROOTS_ALL_INC_PRIVATE  (KVM_MMU_ROOTS_ALL | 
KVM_MMU_ROOT_PRIVATE)
 
 int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int 
level);
 void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e9598a51090

[RFC PATCH 59/67] KVM: VMX: Move AR_BYTES encoder/decoder helpers to common.h

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Move the AR_BYTES helpers to common.h so that future patches can reuse
them to decode/encode AR for TDX.
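
A quick usage sketch for reference (not part of the patch): decoding the
cached AR bytes into a kvm_segment and encoding them back round-trips the
value:

        struct kvm_segment var;
        u32 ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_CS);

        vmx_decode_ar_bytes(ar, &var);  /* VMCS AR bytes -> kvm_segment */
        ar = vmx_encode_ar_bytes(&var); /* kvm_segment -> VMCS AR bytes */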

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/vmx/common.h | 41 ++
 arch/x86/kvm/vmx/vmx.c| 46 ---
 2 files changed, 45 insertions(+), 42 deletions(-)

diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
index baee96abdd7e..ad106364c51f 100644
--- a/arch/x86/kvm/vmx/common.h
+++ b/arch/x86/kvm/vmx/common.h
@@ -4,6 +4,7 @@
 
 #include 
 
+#include 
 #include 
 #include 
 
@@ -121,4 +122,44 @@ static inline int __vmx_handle_ept_violation(struct 
kvm_vcpu *vcpu, gpa_t gpa,
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
 
+static inline u32 vmx_encode_ar_bytes(struct kvm_segment *var)
+{
+   u32 ar;
+
+   if (var->unusable || !var->present)
+   ar = 1 << 16;
+   else {
+   ar = var->type & 15;
+   ar |= (var->s & 1) << 4;
+   ar |= (var->dpl & 3) << 5;
+   ar |= (var->present & 1) << 7;
+   ar |= (var->avl & 1) << 12;
+   ar |= (var->l & 1) << 13;
+   ar |= (var->db & 1) << 14;
+   ar |= (var->g & 1) << 15;
+   }
+
+   return ar;
+}
+
+static inline void vmx_decode_ar_bytes(u32 ar, struct kvm_segment *var)
+{
+   var->unusable = (ar >> 16) & 1;
+   var->type = ar & 15;
+   var->s = (ar >> 4) & 1;
+   var->dpl = (ar >> 5) & 3;
+   /*
+* Some userspaces do not preserve unusable property. Since usable
+* segment has to be present according to VMX spec we can use present
+* property to amend userspace bug by making unusable segment always
+* nonpresent. vmx_encode_ar_bytes() already marks nonpresent
+* segment as unusable.
+*/
+   var->present = !var->unusable;
+   var->avl = (ar >> 12) & 1;
+   var->l = (ar >> 13) & 1;
+   var->db = (ar >> 14) & 1;
+   var->g = (ar >> 15) & 1;
+}
+
 #endif /* __KVM_X86_VMX_COMMON_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 85401a7eef9a..8bd71b91c6f0 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -361,7 +361,6 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops 
= {
 };
 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
 
-static u32 vmx_segment_access_rights(struct kvm_segment *var);
 static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu 
*vcpu,
  u32 msr, int type);
 
@@ -2736,7 +2735,7 @@ static void fix_rmode_seg(int seg, struct kvm_segment 
*save)
vmcs_write16(sf->selector, var.selector);
vmcs_writel(sf->base, var.base);
vmcs_write32(sf->limit, var.limit);
-   vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
+   vmcs_write32(sf->ar_bytes, vmx_encode_ar_bytes(&var));
 }
 
 static void enter_rmode(struct kvm_vcpu *vcpu)
@@ -3131,7 +3130,6 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
-   u32 ar;
 
if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
*var = vmx->rmode.segs[seg];
@@ -3145,23 +3143,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct 
kvm_segment *var, int seg)
var->base = vmx_read_guest_seg_base(vmx, seg);
var->limit = vmx_read_guest_seg_limit(vmx, seg);
var->selector = vmx_read_guest_seg_selector(vmx, seg);
-   ar = vmx_read_guest_seg_ar(vmx, seg);
-   var->unusable = (ar >> 16) & 1;
-   var->type = ar & 15;
-   var->s = (ar >> 4) & 1;
-   var->dpl = (ar >> 5) & 3;
-   /*
-* Some userspaces do not preserve unusable property. Since usable
-* segment has to be present according to VMX spec we can use present
-* property to amend userspace bug by making unusable segment always
-* nonpresent. vmx_segment_access_rights() already marks nonpresent
-* segment as unusable.
-*/
-   var->present = !var->unusable;
-   var->avl = (ar >> 12) & 1;
-   var->l = (ar >> 13) & 1;
-   var->db = (ar >> 14) & 1;
-   var->g = (ar >> 15) & 1;
+   vmx_decode_ar_bytes(vmx_read_guest_seg_ar(vmx, seg), var);
 }
 
 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
@@ -3187,26 +3169,6 @@ int vmx_get_cpl(struct kvm_vcpu *vcpu)
}
 }
 
-static u32 vmx_segment_access_rights(struct kvm_segment *var)
-{
-   u32 ar;
-
-   if (var->unusable || !var->present)
-   ar = 1 << 16;
-   else {
-   ar = var->type & 15;
-   ar |= (var->s & 1) << 4;
-   ar |= (var->dpl & 3) << 5;
-   ar |= (var->present & 1) << 7;
-   ar |= (var->avl & 1) << 12;
- 

[RFC PATCH 35/67] KVM: x86/mmu: Explicitly check for MMIO spte in fast page fault

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Explicitly check for an MMIO spte in the fast page fault flow.  TDX will
use a not-present entry for MMIO sptes, which can be mistaken for an
access-tracked spte since both have SPTE_SPECIAL_MASK set.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 76de8d48165d..c4d657b26066 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3090,7 +3090,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t 
cr2_or_gpa,
break;
 
sp = sptep_to_sp(iterator.sptep);
-   if (!is_last_spte(spte, sp->role.level))
+   if (!is_last_spte(spte, sp->role.level) || is_mmio_spte(spte))
break;
 
/*
-- 
2.17.1



[RFC PATCH 28/67] KVM: x86: Introduce vm_teardown() hook in kvm_arch_vm_destroy()

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Add a second kvm_x86_ops hook in kvm_arch_vm_destroy() to support TDX's
destruction path, which needs to first put the VM into a teardown state,
then free per-vCPU resources, and finally free per-VM resources.

Note, this knowingly creates a discrepancy in nomenclature for SVM as
svm_vm_teardown() invokes avic_vm_destroy() and sev_vm_destroy().
Moving the now-misnamed functions or renaming them is left to a future
patch so as not to introduce a functional change for SVM.

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/svm/svm.c  |  8 +++-
 arch/x86/kvm/vmx/vmx.c  | 12 
 arch/x86/kvm/x86.c  |  4 ++--
 4 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 32e995327944..a6c89666ec49 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1100,6 +1100,7 @@ struct kvm_x86_ops {
bool (*is_vm_type_supported)(unsigned long vm_type);
unsigned int vm_size;
int (*vm_init)(struct kvm *kvm);
+   void (*vm_teardown)(struct kvm *kvm);
void (*vm_destroy)(struct kvm *kvm);
 
/* Create, but do not attach this VCPU */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 241a26e1fa71..15836446a9b8 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4155,12 +4155,17 @@ static bool svm_apic_init_signal_blocked(struct 
kvm_vcpu *vcpu)
   (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT));
 }
 
-static void svm_vm_destroy(struct kvm *kvm)
+static void svm_vm_teardown(struct kvm *kvm)
 {
avic_vm_destroy(kvm);
sev_vm_destroy(kvm);
 }
 
+static void svm_vm_destroy(struct kvm *kvm)
+{
+
+}
+
 static bool svm_is_vm_type_supported(unsigned long type)
 {
return type == KVM_X86_LEGACY_VM;
@@ -4195,6 +4200,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.is_vm_type_supported = svm_is_vm_type_supported,
.vm_size = sizeof(struct kvm_svm),
.vm_init = svm_vm_init,
+   .vm_teardown = svm_vm_teardown,
.vm_destroy = svm_vm_destroy,
 
.prepare_guest_switch = svm_prepare_guest_switch,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 2ee7eb7dac26..3559b51f566d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7010,6 +7010,16 @@ static int vmx_vm_init(struct kvm *kvm)
return 0;
 }
 
+static void vmx_vm_teardown(struct kvm *kvm)
+{
+
+}
+
+static void vmx_vm_destroy(struct kvm *kvm)
+{
+
+}
+
 static int __init vmx_check_processor_compat(void)
 {
struct vmcs_config vmcs_conf;
@@ -7611,6 +7621,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.is_vm_type_supported = vmx_is_vm_type_supported,
.vm_size = sizeof(struct kvm_vmx),
.vm_init = vmx_vm_init,
+   .vm_teardown = vmx_vm_teardown,
+   .vm_destroy = vmx_vm_destroy,
 
.vcpu_create = vmx_create_vcpu,
.vcpu_free = vmx_free_vcpu,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7b8bbdc98492..42bd24ba7fdd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10533,10 +10533,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
mutex_unlock(&kvm->slots_lock);
}
-   if (kvm_x86_ops.vm_destroy)
-   kvm_x86_ops.vm_destroy(kvm);
for (i = 0; i < kvm->arch.msr_filter.count; i++)
kfree(kvm->arch.msr_filter.ranges[i].bitmap);
+   kvm_x86_ops.vm_teardown(kvm);
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
kvm_free_vcpus(kvm);
@@ -10545,6 +10544,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_mmu_uninit_vm(kvm);
kvm_page_track_cleanup(kvm);
kvm_hv_destroy_vm(kvm);
+   kvm_x86_ops.vm_destroy(kvm);
 }
 
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
-- 
2.17.1



[RFC PATCH 45/67] KVM: VMX: Move NMI/exception handler to common helper

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/vmx/common.h | 54 +++
 arch/x86/kvm/vmx/vmx.c| 42 +-
 2 files changed, 60 insertions(+), 36 deletions(-)
 create mode 100644 arch/x86/kvm/vmx/common.h

diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
new file mode 100644
index ..146f1da9c88d
--- /dev/null
+++ b/arch/x86/kvm/vmx/common.h
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef __KVM_X86_VMX_COMMON_H
+#define __KVM_X86_VMX_COMMON_H
+
+#include 
+
+#include 
+
+#include "vmcs.h"
+#include "x86.h"
+
+void vmx_handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info);
+
+/*
+ * Trigger machine check on the host. We assume all the MSRs are already set up
+ * by the CPU and that we still run on the same CPU as the MCE occurred on.
+ * We pass a fake environment to the machine check handler because we want
+ * the guest to be always treated like user space, no matter what context
+ * it used internally.
+ */
+static inline void kvm_machine_check(void)
+{
+#if defined(CONFIG_X86_MCE)
+   struct pt_regs regs = {
+   .cs = 3, /* Fake ring 3 no matter what the guest ran on */
+   .flags = X86_EFLAGS_IF,
+   };
+
+   do_machine_check(&regs);
+#endif
+}
+
+static inline void vmx_handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
+   u32 intr_info)
+{
+   if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
+   "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
+   return;
+
+   vmx_handle_interrupt_nmi_irqoff(vcpu, intr_info);
+}
+
+static inline void vmx_handle_exception_nmi_irqoff(struct kvm_vcpu *vcpu,
+ u32 intr_info)
+{
+   /* Handle machine checks before interrupts are enabled */
+   if (is_machine_check(intr_info))
+   kvm_machine_check();
+   /* We need to handle NMIs before interrupts are enabled */
+   else if (is_nmi(intr_info))
+   vmx_handle_interrupt_nmi_irqoff(vcpu, intr_info);
+}
+
+#endif /* __KVM_X86_VMX_COMMON_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 5d6c3a50230d..e8b60d447e27 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -49,6 +49,7 @@
 #include 
 
 #include "capabilities.h"
+#include "common.h"
 #include "cpuid.h"
 #include "evmcs.h"
 #include "irq.h"
@@ -4708,25 +4709,6 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
return 1;
 }
 
-/*
- * Trigger machine check on the host. We assume all the MSRs are already set up
- * by the CPU and that we still run on the same CPU as the MCE occurred on.
- * We pass a fake environment to the machine check handler because we want
- * the guest to be always treated like user space, no matter what context
- * it used internally.
- */
-static void kvm_machine_check(void)
-{
-#if defined(CONFIG_X86_MCE)
-   struct pt_regs regs = {
-   .cs = 3, /* Fake ring 3 no matter what the guest ran on */
-   .flags = X86_EFLAGS_IF,
-   };
-
-   do_machine_check(&regs);
-#endif
-}
-
 static int handle_machine_check(struct kvm_vcpu *vcpu)
 {
/* handled by vmx_vcpu_run() */
@@ -6348,7 +6330,7 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu 
*vcpu)
 
 void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
 
-static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
+void vmx_handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
 {
unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
gate_desc *desc = (gate_desc *)host_idt_base + vector;
@@ -6363,21 +6345,8 @@ static void handle_exception_nmi_irqoff(struct kvm_vcpu 
*vcpu, u32 intr_info)
/* if exit due to PF check for async PF */
if (is_page_fault(intr_info))
vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
-   /* Handle machine checks before interrupts are enabled */
-   else if (is_machine_check(intr_info))
-   kvm_machine_check();
-   /* We need to handle NMIs before interrupts are enabled */
-   else if (is_nmi(intr_info))
-   handle_interrupt_nmi_irqoff(vcpu, intr_info);
-}
-
-static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, u32 
intr_info)
-{
-   if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
-   "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
-   return;
-
-   handle_interrupt_nmi_irqoff(vcpu, intr_info);
+   else
+   vmx_handle_exception_nmi_irqoff(vcpu, intr_info);
 }
 
 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
@@ -6385,7 +6354,8 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
 
if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRU

[RFC PATCH 61/67] KVM: VMX: Move .get_interrupt_shadow() implementation to common VMX code

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/vmx/common.h | 14 ++
 arch/x86/kvm/vmx/vmx.c| 10 +-
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
index ad106364c51f..8519423bfd88 100644
--- a/arch/x86/kvm/vmx/common.h
+++ b/arch/x86/kvm/vmx/common.h
@@ -122,6 +122,20 @@ static inline int __vmx_handle_ept_violation(struct 
kvm_vcpu *vcpu, gpa_t gpa,
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
 
+static inline u32 __vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+   u32 interruptibility;
+   int ret = 0;
+
+   interruptibility = vmread32(vcpu, GUEST_INTERRUPTIBILITY_INFO);
+   if (interruptibility & GUEST_INTR_STATE_STI)
+   ret |= KVM_X86_SHADOW_INT_STI;
+   if (interruptibility & GUEST_INTR_STATE_MOV_SS)
+   ret |= KVM_X86_SHADOW_INT_MOV_SS;
+
+   return ret;
+}
+
 static inline u32 vmx_encode_ar_bytes(struct kvm_segment *var)
 {
u32 ar;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 93b319eacdfa..9c15df71700d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1461,15 +1461,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long 
rflags)
 
 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 {
-   u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-   int ret = 0;
-
-   if (interruptibility & GUEST_INTR_STATE_STI)
-   ret |= KVM_X86_SHADOW_INT_STI;
-   if (interruptibility & GUEST_INTR_STATE_MOV_SS)
-   ret |= KVM_X86_SHADOW_INT_MOV_SS;
-
-   return ret;
+   return __vmx_get_interrupt_shadow(vcpu);
 }
 
 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
-- 
2.17.1



[RFC PATCH 66/67] fixup! KVM: TDX: Add "basic" support for building and running Trust Domains

2020-11-16 Thread isaku . yamahata
From: Isaku Yamahata 

---
 arch/x86/kvm/vmx/tdx.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index adcb866861b7..d2c1766416f2 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -331,9 +331,6 @@ static int tdx_vm_init(struct kvm *kvm)
kvm->arch.mce_injection_disallowed = true;
kvm_mmu_set_mmio_spte_mask(kvm, 0, 0);
 
-   /* TODO: Enable 2mb and 1gb large page support. */
-   kvm->arch.tdp_max_page_level = PG_LEVEL_4K;
-
kvm_apicv_init(kvm, true);
 
/* vCPUs can't be created until after KVM_TDX_INIT_VM. */
-- 
2.17.1



[RFC PATCH 31/67] KVM: x86: Add option to force LAPIC expiration wait

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Add an option to skip the IRR check in kvm_wait_lapic_expire().  This
will be used by TDX to wait if there is an outstanding notification for
a TD, i.e. a virtual interrupt is being triggered via posted interrupt
processing.  KVM TDX doesn't emulate PI processing, i.e. there will
never be a bit set in IRR/ISR, so the default behavior for APICv of
querying the IRR doesn't work as intended.
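
A hedged sketch of the eventual TDX call site (tdx_vcpu_run() does not
exist yet): it passes force_wait=true because the IRR can't be consulted
for a TD:

        /* In a hypothetical tdx_vcpu_run(), just before entering the TD: */
        kvm_wait_lapic_expire(vcpu, true);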

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/lapic.c   | 6 +++---
 arch/x86/kvm/lapic.h   | 2 +-
 arch/x86/kvm/svm/svm.c | 2 +-
 arch/x86/kvm/vmx/vmx.c | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index e6c0aaf4044e..41dce91f5df0 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1601,12 +1601,12 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu 
*vcpu)
adjust_lapic_timer_advance(vcpu, 
apic->lapic_timer.advance_expire_delta);
 }
 
-void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
+void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu, bool force_wait)
 {
if (lapic_in_kernel(vcpu) &&
vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
vcpu->arch.apic->lapic_timer.timer_advance_ns &&
-   lapic_timer_int_injected(vcpu))
+   (force_wait || lapic_timer_int_injected(vcpu)))
__kvm_wait_lapic_expire(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire);
@@ -1642,7 +1642,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, 
bool from_timer_fn)
}
 
if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
-   kvm_wait_lapic_expire(vcpu);
+   kvm_wait_lapic_expire(vcpu, false);
kvm_apic_inject_pending_timer_irqs(apic);
return;
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 4fb86e3a9dd3..30f036678f5c 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -237,7 +237,7 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu 
*vcpu)
 
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
-void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu);
+void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu, bool force_wait);
 
 void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
  unsigned long *vcpu_bitmap);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 15836446a9b8..8be23240c74f 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3580,7 +3580,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu 
*vcpu)
clgi();
kvm_load_guest_xsave_state(vcpu);
 
-   kvm_wait_lapic_expire(vcpu);
+   kvm_wait_lapic_expire(vcpu, false);
 
/*
 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 3559b51f566d..deeec105e963 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6720,7 +6720,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (enable_preemption_timer)
vmx_update_hv_timer(vcpu);
 
-   kvm_wait_lapic_expire(vcpu);
+   kvm_wait_lapic_expire(vcpu, false);
 
/*
 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
-- 
2.17.1



[RFC PATCH 27/67] KVM: x86: Add support for vCPU and device-scoped KVM_MEMORY_ENCRYPT_OP

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/x86.c  | 12 
 2 files changed, 14 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 01c78eeefef4..32e995327944 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1276,7 +1276,9 @@ struct kvm_x86_ops {
int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
void (*enable_smi_window)(struct kvm_vcpu *vcpu);
 
+   int (*mem_enc_op_dev)(void __user *argp);
int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
+   int (*mem_enc_op_vcpu)(struct kvm_vcpu *vcpu, void __user *argp);
int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region 
*argp);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 22e956f01ddc..7b8bbdc98492 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3882,6 +3882,12 @@ long kvm_arch_dev_ioctl(struct file *filp,
case KVM_GET_MSRS:
r = msr_io(NULL, argp, do_get_msr_feature, 1);
break;
+   case KVM_MEMORY_ENCRYPT_OP:
+   r = -EINVAL;
+   if (!kvm_x86_ops.mem_enc_op_dev)
+   goto out;
+   r = kvm_x86_ops.mem_enc_op_dev(argp);
+   break;
default:
r = -EINVAL;
break;
@@ -5020,6 +5026,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = 0;
break;
}
+   case KVM_MEMORY_ENCRYPT_OP:
+   r = -EINVAL;
+   if (!kvm_x86_ops.mem_enc_op_vcpu)
+   goto out;
+   r = kvm_x86_ops.mem_enc_op_vcpu(vcpu, argp);
+   break;
default:
r = -EINVAL;
}
-- 
2.17.1



[RFC PATCH 37/67] KVM: x86/mmu: Ignore bits 63 and 62 when checking for "present" SPTEs

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Ignore bits 63 and 62 when checking for present SPTEs to allow setting
said bits in not-present SPTEs.  TDX will set bit 63 in "zero" SPTEs to
suppress #VEs (TDX-SEAM unconditionally enables EPT Violation #VE), and
will use bit 62 to track zapped private SPTEs.
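
To make the bit trickery concrete (illustration only): shifting the SPTE
left by two discards bits 63:62, so a "zero" SPTE carrying only the
suppress-#VE bit is still treated as not-present:

        u64 zero_spte = BIT_ULL(63);    /* suppress #VE, otherwise "zero" */

        WARN_ON(__is_shadow_present_pte(zero_spte));    /* (pte << 2) == 0 */
        WARN_ON(!__is_shadow_present_pte(zero_spte | PT_PRESENT_MASK));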

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu/paging_tmpl.h |  2 +-
 arch/x86/kvm/mmu/spte.h| 17 +++--
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 5d4e9f404018..06659d5c8ba0 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -1039,7 +1039,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct 
kvm_mmu_page *sp)
gpa_t pte_gpa;
gfn_t gfn;
 
-   if (!sp->spt[i])
+   if (!__is_shadow_present_pte(sp->spt[i]))
continue;
 
pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index e5c94848ade1..22256cc8cce6 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -174,9 +174,22 @@ static inline bool is_access_track_spte(u64 spte)
return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
 }
 
-static inline int is_shadow_present_pte(u64 pte)
+static inline bool __is_shadow_present_pte(u64 pte)
 {
-   return (pte != 0) && !is_mmio_spte(pte);
+   /*
+* Ignore bits 63 and 62 so that they can be set in SPTEs that are well
+* and truly not present.  We can't use the sane/obvious approach of
+* querying bits 2:0 (RWX or P) because EPT without A/D bits will clear
+* RWX of a "present" SPTE to do access tracking.  Tracking updates can
+* be done out of mmu_lock, so even the flushing logic needs to treat
+* such SPTEs as present.
+*/
+   return !!(pte << 2);
+}
+
+static inline bool is_shadow_present_pte(u64 pte)
+{
+   return __is_shadow_present_pte(pte) && !is_mmio_spte(pte);
 }
 
 static inline int is_large_pte(u64 pte)
-- 
2.17.1



[RFC PATCH 58/67] KVM: VMX: Add macro framework to read/write VMCS for VMs and TDs

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Add a macro framework to hide VMX vs. TDX details of VMREAD and VMWRITE
so that VMX and TDX can share common flows, e.g. accessing DTs.

Note, the TDX paths are dead code at this time.  There is no great way
to deal with the chicken-and-egg scenario of having things in place for
TDX without first having TDX.
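
A short usage sketch (mirroring the later DT accessor patch): common code
calls the new helpers and transparently hits either the VMCS or, for a
debuggable TD, the TD-VMCS accessors:

        /* Works for both VMX guests and debug TDs: */
        dt->size    = vmread32(vcpu, GUEST_IDTR_LIMIT);
        dt->address = vmreadl(vcpu, GUEST_IDTR_BASE);
        vmwrite32(vcpu, GUEST_IDTR_LIMIT, dt->size);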

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/vmx/common.h | 41 +++
 1 file changed, 41 insertions(+)

diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
index 58edf1296cbd..baee96abdd7e 100644
--- a/arch/x86/kvm/vmx/common.h
+++ b/arch/x86/kvm/vmx/common.h
@@ -11,6 +11,47 @@
 #include "vmcs.h"
 #include "vmx.h"
 #include "x86.h"
+#include "tdx.h"
+
+#ifdef CONFIG_KVM_INTEL_TDX
+#define VT_BUILD_VMCS_HELPERS(type, bits, tdbits) \
+static __always_inline type vmread##bits(struct kvm_vcpu *vcpu,
   \
+unsigned long field)  \
+{ \
+   if (unlikely(is_td_vcpu(vcpu))) {  \
+   if (KVM_BUG_ON(!is_debug_td(vcpu), vcpu->kvm)) \
+   return 0;  \
+   return td_vmcs_read##tdbits(to_tdx(vcpu), field);  \
+   }  \
+   return vmcs_read##bits(field); \
+} \
+static __always_inline void vmwrite##bits(struct kvm_vcpu *vcpu,  \
+ unsigned long field, type value) \
+{ \
+   if (unlikely(is_td_vcpu(vcpu))) {  \
+   if (KVM_BUG_ON(!is_debug_td(vcpu), vcpu->kvm)) \
+   return;\
+   return td_vmcs_write##tdbits(to_tdx(vcpu), field, value);  \
+   }  \
+   vmcs_write##bits(field, value);\
+}
+#else
+#define VT_BUILD_VMCS_HELPERS(type, bits, tdbits) \
+static __always_inline type vmread##bits(struct kvm_vcpu *vcpu,
   \
+unsigned long field)  \
+{ \
+   return vmcs_read##bits(field); \
+} \
+static __always_inline void vmwrite##bits(struct kvm_vcpu *vcpu,  \
+ unsigned long field, type value) \
+{ \
+   vmcs_write##bits(field, value);\
+}
+#endif /* CONFIG_KVM_INTEL_TDX */
+VT_BUILD_VMCS_HELPERS(u16, 16, 16);
+VT_BUILD_VMCS_HELPERS(u32, 32, 32);
+VT_BUILD_VMCS_HELPERS(u64, 64, 64);
+VT_BUILD_VMCS_HELPERS(unsigned long, l, 64);
 
 void vmx_handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info);
 
-- 
2.17.1



[RFC PATCH 51/67] KVM: VMX: Move register caching logic to common code

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Move the guts of vmx_cache_reg() to vt_cache_reg() in preparation for
reusing the bulk of the code for TDX, which can access guest state for
debug TDs.

Use kvm_x86_ops.cache_reg() in ept_update_paging_mode_cr0() rather than
trying to expose vt_cache_reg() to vmx.c, even though it means taking a
retpoline.  The code runs if and only if EPT is enabled but unrestricted
guest is not.  Only one generation of CPU, Nehalem, supports EPT but not
unrestricted guest, and disabling unrestricted guest without also
disabling EPT is, to put it bluntly, dumb.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/vmx/main.c | 37 +++-
 arch/x86/kvm/vmx/vmx.c  | 42 +
 2 files changed, 37 insertions(+), 42 deletions(-)

diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 52e7a9d25e9c..30b1815fd5a7 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -347,7 +347,42 @@ static void vt_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
 
 static void vt_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 {
-   vmx_cache_reg(vcpu, reg);
+   unsigned long guest_owned_bits;
+
+   kvm_register_mark_available(vcpu, reg);
+
+   switch (reg) {
+   case VCPU_REGS_RSP:
+   vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+   break;
+   case VCPU_REGS_RIP:
+   vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
+   break;
+   case VCPU_EXREG_PDPTR:
+   if (enable_ept)
+   ept_save_pdptrs(vcpu);
+   break;
+   case VCPU_EXREG_CR0:
+   guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
+
+   vcpu->arch.cr0 &= ~guest_owned_bits;
+   vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
+   break;
+   case VCPU_EXREG_CR3:
+   if (is_unrestricted_guest(vcpu) ||
+   (enable_ept && is_paging(vcpu)))
+   vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+   break;
+   case VCPU_EXREG_CR4:
+   guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
+
+   vcpu->arch.cr4 &= ~guest_owned_bits;
+   vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
+   break;
+   default:
+   KVM_BUG_ON(1, vcpu->kvm);
+   break;
+   }
 }
 
 static unsigned long vt_get_rflags(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f6b2ddff58e1..85401a7eef9a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2211,46 +2211,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
return ret;
 }
 
-static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
-{
-   unsigned long guest_owned_bits;
-
-   kvm_register_mark_available(vcpu, reg);
-
-   switch (reg) {
-   case VCPU_REGS_RSP:
-   vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
-   break;
-   case VCPU_REGS_RIP:
-   vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
-   break;
-   case VCPU_EXREG_PDPTR:
-   if (enable_ept)
-   ept_save_pdptrs(vcpu);
-   break;
-   case VCPU_EXREG_CR0:
-   guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
-
-   vcpu->arch.cr0 &= ~guest_owned_bits;
-   vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
-   break;
-   case VCPU_EXREG_CR3:
-   if (is_unrestricted_guest(vcpu) ||
-   (enable_ept && is_paging(vcpu)))
-   vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
-   break;
-   case VCPU_EXREG_CR4:
-   guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
-
-   vcpu->arch.cr4 &= ~guest_owned_bits;
-   vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
-   break;
-   default:
-   KVM_BUG_ON(1, vcpu->kvm);
-   break;
-   }
-}
-
 static __init int vmx_disabled_by_bios(void)
 {
return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
@@ -2976,7 +2936,7 @@ static void ept_update_paging_mode_cr0(unsigned long 
*hw_cr0,
struct vcpu_vmx *vmx = to_vmx(vcpu);
 
if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
-   vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
+   kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR3);
if (!(cr0 & X86_CR0_PG)) {
/* From paging/starting to nonpaging */
exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
-- 
2.17.1



[RFC PATCH 18/67] KVM: x86: Add per-VM flag to disable direct IRQ injection

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Add a flag to disable IRQ injection, which is not supported by TDX.

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h | 1 +
 arch/x86/kvm/x86.c  | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e687a8bd46ad..e8180a1fe610 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -995,6 +995,7 @@ struct kvm_arch {
} msr_filter;
 
bool guest_state_protected;
+   bool irq_injection_disallowed;
 
struct kvm_pmu_event_filter *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6154abecd546..ec66d5d53a1a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4041,7 +4041,8 @@ static int kvm_vcpu_ready_for_interrupt_injection(struct 
kvm_vcpu *vcpu)
 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq)
 {
-   if (irq->irq >= KVM_NR_INTERRUPTS)
+   if (irq->irq >= KVM_NR_INTERRUPTS ||
+   vcpu->kvm->arch.irq_injection_disallowed)
return -EINVAL;
 
if (!irqchip_in_kernel(vcpu->kvm)) {
@@ -8170,6 +8171,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt 
*ctxt)
 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
 {
return vcpu->run->request_interrupt_window &&
+  !vcpu->kvm->arch.irq_injection_disallowed &&
likely(!pic_in_kernel(vcpu->kvm));
 }
 
-- 
2.17.1



[RFC PATCH 52/67] KVM: TDX: Add TDX "architectural" error codes

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

TDX-SEAM uses bits 31:0 to return additional information, so these error
codes will only match RAX[63:32] exactly.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/vmx/tdx_errno.h | 91 
 1 file changed, 91 insertions(+)
 create mode 100644 arch/x86/kvm/vmx/tdx_errno.h

diff --git a/arch/x86/kvm/vmx/tdx_errno.h b/arch/x86/kvm/vmx/tdx_errno.h
new file mode 100644
index ..802ddc169d58
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx_errno.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_TDX_ERRNO_H
+#define __KVM_X86_TDX_ERRNO_H
+
+/*
+ * TDX SEAMCALL Status Codes (returned in RAX)
+ */
+#define TDX_SUCCESS0x
+#define TDX_NON_RECOVERABLE_VCPU   0x4001
+#define TDX_NON_RECOVERABLE_TD 0x4002
+#define TDX_INTERRUPTED_RESUMABLE  0x8003
+#define TDX_INTERRUPTED_RESTARTABLE0x8004
+#define TDX_OPERAND_INVALID0xC100
+#define TDX_OPERAND_ADDR_RANGE_ERROR   0xC101
+#define TDX_OPERAND_BUSY   0x8200
+#define TDX_PREVIOUS_TLB_EPOCH_BUSY0x8201
+#define TDX_SYS_BUSY   0x8202
+#define TDX_OPERAND_PAGE_METADATA_INCORRECT0xC300
+#define TDX_PAGE_ALREADY_FREE  0x0301
+#define TDX_TD_ASSOCIATED_PAGES_EXIST  0xC400
+#define TDX_SYSINIT_NOT_PENDING0xC500
+#define TDX_SYSINIT_NOT_DONE   0xC501
+#define TDX_SYSINITLP_NOT_DONE 0xC502
+#define TDX_SYSINITLP_DONE 0xC503
+#define TDX_SYSCONFIGKEY_NOT_DONE  0xC504
+#define TDX_SYS_NOT_READY  0xC505
+#define TDX_SYS_SHUTDOWN   0xC506
+#define TDX_SYSCONFIG_NOT_DONE 0xC507
+#define TDX_TD_NOT_INITIALIZED 0xC600
+#define TDX_TD_INITIALIZED 0xC601
+#define TDX_TD_NOT_FINALIZED   0xC602
+#define TDX_TD_FINALIZED   0xC603
+#define TDX_TD_FATAL   0xC604
+#define TDX_TD_NON_DEBUG   0xC605
+#define TDX_TDCX_NUM_INCORRECT 0xC610
+#define TDX_VCPU_STATE_INCORRECT   0xC700
+#define TDX_VCPU_ASSOCIATED0x8701
+#define TDX_VCPU_NOT_ASSOCIATED0x8702
+#define TDX_TDVPX_NUM_INCORRECT0xC703
+#define TDX_NO_VALID_VE_INFO   0xC704
+#define TDX_MAX_VCPUS_EXCEEDED 0xC705
+#define TDX_TDVPS_FIELD_NOT_WRITABLE   0xC720
+#define TDX_TDVPS_FIELD_NOT_READABLE   0xC721
+#define TDX_TD_VMCS_FIELD_NOT_INITIALIZED  0xC730
+#define TDX_KEY_GENERATION_FAILED  0x8800
+#define TDX_TD_KEYS_NOT_CONFIGURED 0x8810
+#define TDX_KEY_STATE_INCORRECT0xC811
+#define TDX_KEY_CONFIGURED 0x0815
+#define TDX_WBCACHE_NOT_COMPLETE   0x8817
+#define TDX_HKID_NOT_FREE  0xC820
+#define TDX_NO_HKID_READY_TO_WBCACHE   0x0821
+#define TDX_WBCACHE_RESUME_ERROR   0xC823
+#define TDX_FLUSHVP_NOT_DONE   0x8824
+#define TDX_NUM_ACTIVATED_HKIDS_NOT_SUPPORRTED 0xC825
+#define TDX_INCORRECT_CPUID_VALUE  0xC900
+#define TDX_BOOT_NT4_SET   0xC901
+#define TDX_INCONSISTENT_CPUID_FIELD   0xC902
+#define TDX_CPUID_LEAF_1F_NOT_SUPPORTED0xC903
+#define TDX_CPUID_LEAF_1F_FORMAT_UNRECOGNIZED  0xC904
+#define TDX_INVALID_WBINVD_SCOPE   0xC905
+#define TDX_INVALID_PKG_ID 0xC906
+#define TDX_SMRR_NOT_LOCKED0xC910
+#define TDX_INVALID_SMRR_CONFIGURATION 0xC911
+#define TDX_SMRR_OVERLAPS_CMR  0xC912
+#define TDX_SMRR_LOCK_NOT_SUPPORTED0xC913
+#define TDX_SMRR_NOT_SUPPORTED 0xC914
+#define TDX_INCONSISTENT_MSR   0xC920
+#define TDX_INCORRECT_MSR_VALUE0xC921
+#define TDX_SEAMREPORT_NOT_AVAILABLE   0xC930
+#define TDX_INVALID_TDMR   0xCA00
+#define TDX_NON_ORDERED_TDMR 
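
Since TDX-SEAM returns extra detail in RAX[31:0], callers are expected to
compare only the upper half of RAX against the codes above.  A minimal
sketch (the helper name and mask are illustrative, not part of the patch):

        #define TDX_SEAMCALL_STATUS_MASK       0xFFFFFFFF00000000ULL

        /* True if the status class in RAX matches the given TDX_* code. */
        static inline bool tdx_status_is(u64 rax, u64 code)
        {
                return (rax & TDX_SEAMCALL_STATUS_MASK) ==
                       (code & TDX_SEAMCALL_STATUS_MASK);
        }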

[RFC PATCH 38/67] KVM: x86/mmu: Allow non-zero init value for shadow PTE

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

TDX will run with EPT violation #VEs enabled, which means KVM needs to
set the "suppress #VE" bit in unused PTEs to avoid unintentionally
reflecting not-present EPT violations into the guest.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu.h  |  1 +
 arch/x86/kvm/mmu/mmu.c  | 50 +++--
 arch/x86/kvm/mmu/spte.c | 10 +
 arch/x86/kvm/mmu/spte.h |  2 ++
 4 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 05c2898cb2a2..e9598a51090b 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -55,6 +55,7 @@ static inline u64 rsvd_bits(int s, int e)
 void kvm_mmu_set_mmio_spte_mask(struct kvm *kvm, u64 mmio_value,
u64 access_mask);
 void kvm_mmu_set_default_mmio_spte_mask(u64 mask);
+void kvm_mmu_set_spte_init_value(u64 init_value);
 
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index da2a58fa86a8..732510ecda36 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -560,9 +560,9 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
u64 old_spte = *sptep;
 
if (!spte_has_volatile_bits(old_spte))
-   __update_clear_spte_fast(sptep, 0ull);
+   __update_clear_spte_fast(sptep, shadow_init_value);
else
-   old_spte = __update_clear_spte_slow(sptep, 0ull);
+   old_spte = __update_clear_spte_slow(sptep, shadow_init_value);
 
if (!is_shadow_present_pte(old_spte))
return 0;
@@ -592,7 +592,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
  */
 static void mmu_spte_clear_no_track(u64 *sptep)
 {
-   __update_clear_spte_fast(sptep, 0ull);
+   __update_clear_spte_fast(sptep, shadow_init_value);
 }
 
 static u64 mmu_spte_get_lockless(u64 *sptep)
@@ -670,6 +670,42 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu 
*vcpu)
local_irq_enable();
 }
 
+static inline void kvm_init_shadow_page(void *page)
+{
+#ifdef CONFIG_X86_64
+   int ign;
+
+   asm volatile (
+   "rep stosq\n\t"
+   : "=c"(ign), "=D"(page)
+   : "a"(shadow_init_value), "c"(4096/8), "D"(page)
+   : "memory"
+   );
+#else
+   BUG();
+#endif
+}
+
+static int mmu_topup_shadow_page_cache(struct kvm_vcpu *vcpu)
+{
+   struct kvm_mmu_memory_cache *mc = &vcpu->arch.mmu_shadow_page_cache;
+   int start, end, i, r;
+
+   if (shadow_init_value)
+   start = kvm_mmu_memory_cache_nr_free_objects(mc);
+
+   r = kvm_mmu_topup_memory_cache(mc, PT64_ROOT_MAX_LEVEL);
+   if (r)
+   return r;
+
+   if (shadow_init_value) {
+   end = kvm_mmu_memory_cache_nr_free_objects(mc);
+   for (i = start; i < end; i++)
+   kvm_init_shadow_page(mc->objects[i]);
+   }
+   return 0;
+}
+
 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 {
int r;
@@ -679,8 +715,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, 
bool maybe_indirect)
   1 + PT64_ROOT_MAX_LEVEL + 
PTE_PREFETCH_NUM);
if (r)
return r;
-   r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
-  PT64_ROOT_MAX_LEVEL);
+   r = mmu_topup_shadow_page_cache(vcpu);
if (r)
return r;
if (maybe_indirect) {
@@ -3074,7 +3109,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t 
cr2_or_gpa,
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
int ret = RET_PF_INVALID;
-   u64 spte = 0ull;
+   u64 spte = shadow_init_value;
uint retry_count = 0;
 
if (!page_fault_can_be_fast(error_code))
@@ -5356,7 +5391,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
 
-   vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+   if (!shadow_init_value)
+   vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
 
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 574c8ccac0bf..079bbef7b8aa 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -27,6 +27,7 @@ u64 __read_mostly shadow_mmio_access_mask;
 u64 __read_mostly shadow_present_mask;
 u64 __read_mostly shadow_me_mask;
 u64 __read_mostly shadow_acc_track_mask;
+u64 __read_mostly shadow_init_value;
 
 u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
 u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
@@ -195,6 +196,14 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, 
kvm_pfn
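
A plain-C sketch equivalent to the "rep stosq" fill in kvm_init_shadow_page()
above, writing shadow_init_value (e.g. the "suppress #VE" bit) into every
64-bit slot of a 4KiB shadow page (illustrative only):

        static inline void kvm_init_shadow_page_sketch(void *page)
        {
                u64 *sptep = page;
                int i;

                for (i = 0; i < 4096 / sizeof(u64); i++)
                        sptep[i] = shadow_init_value;
        }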

[RFC PATCH 54/67] KVM: TDX: Define TDCALL exit reason

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Define the TDCALL exit reason, which is carved out from the VMX exit
reason namespace as the TDCALL exit from TDX guest to TDX-SEAM is really
just a VM-Exit.

Co-developed-by: Xiaoyao Li 
Signed-off-by: Xiaoyao Li 
Signed-off-by: Sean Christopherson 
---
 arch/x86/include/uapi/asm/vmx.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index b8ff9e8ac0d5..95fd84bd909a 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -88,6 +88,7 @@
 #define EXIT_REASON_XRSTORS 64
 #define EXIT_REASON_UMWAIT  67
 #define EXIT_REASON_TPAUSE  68
+#define EXIT_REASON_TDCALL  77
 
 #define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -148,7 +149,8 @@
{ EXIT_REASON_XSAVES,"XSAVES" }, \
{ EXIT_REASON_XRSTORS,   "XRSTORS" }, \
{ EXIT_REASON_UMWAIT,"UMWAIT" }, \
-   { EXIT_REASON_TPAUSE,"TPAUSE" }
+   { EXIT_REASON_TPAUSE,"TPAUSE" }, \
+   { EXIT_REASON_TDCALL,"TDCALL" }
 
 #define VMX_EXIT_REASON_FLAGS \
{ VMX_EXIT_REASONS_FAILED_VMENTRY,  "FAILED_VMENTRY" }
-- 
2.17.1



[RFC PATCH 64/67] KVM: TDX: Add "basic" support for building and running Trust Domains

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Add what is effectively a TDX-specific ioctl for initializing the guest
Trust Domain.  Implement the functionality as a subcommand of
KVM_MEMORY_ENCRYPT_OP, analogous to how the ioctl is used by SVM to
manage SEV guests.

For easy compatibility with future versions of TDX-SEAM, add a
KVM-defined struct, tdx_capabilities, to track requirements/capabilities
for the overall system, and define a global instance to serve as the
canonical reference.

Co-developed-by: Xiaoyao Li 
Signed-off-by: Xiaoyao Li 
Co-developed-by: Kai Huang 
Signed-off-by: Kai Huang 
Co-developed-by: Isaku Yamahata 
Signed-off-by: Isaku Yamahata 
Signed-off-by: Sean Christopherson 
---
 arch/x86/include/uapi/asm/kvm.h   |   51 +
 arch/x86/kvm/trace.h  |   57 +
 arch/x86/kvm/vmx/common.h |1 +
 arch/x86/kvm/vmx/main.c   |  384 -
 arch/x86/kvm/vmx/posted_intr.c|6 +
 arch/x86/kvm/vmx/tdx.c| 1850 +
 arch/x86/kvm/vmx/tdx.h|   78 ++
 arch/x86/kvm/vmx/tdx_ops.h|   13 +
 arch/x86/kvm/vmx/tdx_stubs.c  |   45 +
 arch/x86/kvm/vmx/vmenter.S|  140 ++
 arch/x86/kvm/x86.c|5 +-
 tools/arch/x86/include/uapi/asm/kvm.h |   51 +
 12 files changed, 2666 insertions(+), 15 deletions(-)
 create mode 100644 arch/x86/kvm/vmx/tdx.c
 create mode 100644 arch/x86/kvm/vmx/tdx_stubs.c

diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 29cdf262e516..03f7bcc3fb85 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -490,4 +490,55 @@ struct kvm_pmu_event_filter {
 #define KVM_X86_SEV_ES_VM  1
 #define KVM_X86_TDX_VM 2
 
+/* Trust Domain eXtension sub-ioctl() commands. */
+enum tdx_cmd_id {
+   KVM_TDX_CAPABILITIES = 0,
+   KVM_TDX_INIT_VM,
+   KVM_TDX_INIT_VCPU,
+   KVM_TDX_INIT_MEM_REGION,
+   KVM_TDX_FINALIZE_VM,
+
+   KVM_TDX_CMD_NR_MAX,
+};
+
+struct kvm_tdx_cmd {
+   __u32 id;
+   __u32 metadata;
+   __u64 data;
+};
+
+struct kvm_tdx_cpuid_config {
+   __u32 leaf;
+   __u32 sub_leaf;
+   __u32 eax;
+   __u32 ebx;
+   __u32 ecx;
+   __u32 edx;
+};
+
+struct kvm_tdx_capabilities {
+   __u64 attrs_fixed0;
+   __u64 attrs_fixed1;
+   __u64 xfam_fixed0;
+   __u64 xfam_fixed1;
+
+   __u32 nr_cpuid_configs;
+   struct kvm_tdx_cpuid_config cpuid_configs[0];
+};
+
+struct kvm_tdx_init_vm {
+   __u32 max_vcpus;
+   __u32 reserved;
+   __u64 attributes;
+   __u64 cpuid;
+};
+
+#define KVM_TDX_MEASURE_MEMORY_REGION  (1UL << 0)
+
+struct kvm_tdx_init_mem_region {
+   __u64 source_addr;
+   __u64 gpa;
+   __u64 nr_pages;
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index aef960f90f26..e2d9e5caecc8 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -623,6 +623,63 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
  __entry->exit_int_info, __entry->exit_int_info_err)
 );
 
+/*
+ * Tracepoint for TDVMCALL from a TDX guest
+ */
+TRACE_EVENT(kvm_tdvmcall,
+   TP_PROTO(struct kvm_vcpu *vcpu, __u32 exit_reason,
+__u64 p1, __u64 p2, __u64 p3, __u64 p4),
+   TP_ARGS(vcpu, exit_reason, p1, p2, p3, p4),
+
+   TP_STRUCT__entry(
+   __field(__u64,  rip )
+   __field(__u32,  exit_reason )
+   __field(__u64,  p1  )
+   __field(__u64,  p2  )
+   __field(__u64,  p3  )
+   __field(__u64,  p4  )
+   ),
+
+   TP_fast_assign(
+   __entry->rip= kvm_rip_read(vcpu);
+   __entry->exit_reason= exit_reason;
+   __entry->p1 = p1;
+   __entry->p2 = p2;
+   __entry->p3 = p3;
+   __entry->p4 = p4;
+   ),
+
+   TP_printk("rip: %llx reason: %s p1: %llx p2: %llx p3: %llx p4: %llx",
+ __entry->rip,
+ __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS),
+ __entry->p1, __entry->p2, __entry->p3, __entry->p4)
+);
+
+/*
+ * Tracepoint for SEPT related SEAMCALLs.
+ */
+TRACE_EVENT(kvm_sept_seamcall,
+   TP_PROTO(__u64 op, __u64 gpa, __u64 hpa, int level),
+   TP_ARGS(op, gpa, hpa, level),
+
+   TP_STRUCT__entry(
+   __field(__u64,  op  )
+   __field(__u64,  gpa )
+   __field(__u64,  hpa )
+   __field(int,level   )
+   ),
+
+   TP_fast_assign(
+   __e
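
A hedged userspace sketch of driving the new sub-ioctl; the struct layouts
come from the uapi header above, but the exact payload expected for each
command id is an assumption here:

        struct kvm_tdx_capabilities *caps;
        struct kvm_tdx_cmd cmd = {};

        caps = calloc(1, sizeof(*caps) +
                         16 * sizeof(struct kvm_tdx_cpuid_config));
        caps->nr_cpuid_configs = 16;

        cmd.id   = KVM_TDX_CAPABILITIES;
        cmd.data = (__u64)(unsigned long)caps;

        if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd) < 0)
                err(1, "KVM_TDX_CAPABILITIES");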

[RFC PATCH 02/67] x86/msr-index: Define MSR_IA32_MKTME_KEYID_PART used by TDX

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Define MSR_IA32_MKTME_KEYID_PART, used by TDX to enumerate the TDX KeyID
space, which is carved out from the regular MKTME KeyIDs.

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/msr-index.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 972a34d93505..aad12236b33c 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -628,6 +628,8 @@
 #define MSR_IA32_UCODE_WRITE   0x0079
 #define MSR_IA32_UCODE_REV 0x008b
 
+#define MSR_IA32_MKTME_KEYID_PART  0x0087
+
 #define MSR_IA32_SMM_MONITOR_CTL   0x009b
 #define MSR_IA32_SMBASE0x009e
 
-- 
2.17.1
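
A hedged sketch of consuming the partitioning MSR; the exact field layout is
not spelled out in this patch and the split below is an assumption (the TDX
boot code added later in the series is what actually defines it):

        u64 keyid_part;

        rdmsrl(MSR_IA32_MKTME_KEYID_PART, keyid_part);
        /* Assumed split: low 32 bits = regular MKTME KeyIDs, high 32 bits = TDX-only KeyIDs. */
        nr_mktme_keyids = (u32)keyid_part;
        nr_tdx_keyids   = keyid_part >> 32;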



[RFC PATCH 50/67] KVM: VMX: Move setting of EPT MMU masks to common VT-x code

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/vmx/main.c | 17 +
 arch/x86/kvm/vmx/vmx.c  | 13 -
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 85bc238c0852..52e7a9d25e9c 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -26,6 +26,20 @@ static int __init vt_check_processor_compatibility(void)
return 0;
 }
 
+static __init void vt_set_ept_masks(void)
+{
+   const u64 u_mask = VMX_EPT_READABLE_MASK;
+   const u64 a_mask = enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull;
+   const u64 d_mask = enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
+   const u64 p_mask = cpu_has_vmx_ept_execute_only() ? 0ull :
+   VMX_EPT_READABLE_MASK;
+   const u64 x_mask = VMX_EPT_EXECUTABLE_MASK;
+   const u64 nx_mask = 0ull;
+
+   kvm_mmu_set_mask_ptes(u_mask, a_mask, d_mask, nx_mask, x_mask, p_mask,
+ VMX_EPT_RWX_MASK, 0ull);
+}
+
 static __init int vt_hardware_setup(void)
 {
int ret;
@@ -34,6 +48,9 @@ static __init int vt_hardware_setup(void)
if (ret)
return ret;
 
+   if (enable_ept)
+   vt_set_ept_masks();
+
return 0;
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 966d48eada40..f6b2ddff58e1 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5411,16 +5411,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
 }
 
-static void vmx_enable_tdp(void)
-{
-   kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
-   enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
-   enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
-   0ull, VMX_EPT_EXECUTABLE_MASK,
-   cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
-   VMX_EPT_RWX_MASK, 0ull);
-}
-
 /*
  * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
  * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@ -7602,9 +7592,6 @@ static __init int hardware_setup(struct kvm_x86_ops 
*x86_ops)
 
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
-   if (enable_ept)
-   vmx_enable_tdp();
-
if (!enable_ept)
ept_lpage_level = 0;
else if (cpu_has_vmx_ept_1g_page())
-- 
2.17.1



[RFC PATCH 05/67] KVM: Enable hardware before doing arch VM initialization

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Swap the order of hardware_enable_all() and kvm_arch_init_vm() to
accommodate Intel's TDX, which needs VMX to be enabled during VM init in
order to make SEAMCALLs.

This also provides consistent ordering between kvm_create_vm() and
kvm_destroy_vm() with respect to calling kvm_arch_destroy_vm() and
hardware_disable_all().

Signed-off-by: Sean Christopherson 
---
 virt/kvm/kvm_main.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 65e1737c4354..11166e901582 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -764,7 +764,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
struct kvm_memslots *slots = kvm_alloc_memslots();
 
if (!slots)
-   goto out_err_no_arch_destroy_vm;
+   goto out_err_no_disable;
/* Generations must be different for each address space. */
slots->generation = i;
rcu_assign_pointer(kvm->memslots[i], slots);
@@ -774,19 +774,19 @@ static struct kvm *kvm_create_vm(unsigned long type)
rcu_assign_pointer(kvm->buses[i],
kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
if (!kvm->buses[i])
-   goto out_err_no_arch_destroy_vm;
+   goto out_err_no_disable;
}
 
kvm->max_halt_poll_ns = halt_poll_ns;
 
-   r = kvm_arch_init_vm(kvm, type);
-   if (r)
-   goto out_err_no_arch_destroy_vm;
-
r = hardware_enable_all();
if (r)
goto out_err_no_disable;
 
+   r = kvm_arch_init_vm(kvm, type);
+   if (r)
+   goto out_err_no_arch_destroy_vm;
+
 #ifdef CONFIG_HAVE_KVM_IRQFD
INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
 #endif
@@ -813,10 +813,10 @@ static struct kvm *kvm_create_vm(unsigned long type)
mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
 #endif
 out_err_no_mmu_notifier:
-   hardware_disable_all();
-out_err_no_disable:
kvm_arch_destroy_vm(kvm);
 out_err_no_arch_destroy_vm:
+   hardware_disable_all();
+out_err_no_disable:
WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
for (i = 0; i < KVM_NR_BUSES; i++)
kfree(kvm_get_bus(kvm, i));
-- 
2.17.1



[RFC PATCH 01/67] x86/cpufeatures: Add synthetic feature flag for TDX (in host)

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index dad350d42ecf..1bd2a414dcc0 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -230,6 +230,7 @@
 #define X86_FEATURE_FLEXPRIORITY   ( 8*32+ 2) /* Intel FlexPriority */
 #define X86_FEATURE_EPT            ( 8*32+ 3) /* Intel Extended Page Table */
 #define X86_FEATURE_VPID   ( 8*32+ 4) /* Intel Virtual Processor ID */
+#define X86_FEATURE_TDX            ( 8*32+ 5) /* Intel Trusted Domain Extensions */
 
 #define X86_FEATURE_VMMCALL        ( 8*32+15) /* Prefer VMMCALL to VMCALL */
 #define X86_FEATURE_XENPV  ( 8*32+16) /* "" Xen paravirtual guest */
-- 
2.17.1



[RFC PATCH 23/67] KVM: Add per-VM flag to disable dirty logging of memslots for TDs

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Add a flag for TDX to mark dirty logging as unsupported.

Suggested-by: Kai Huang 
Signed-off-by: Sean Christopherson 
---
 include/linux/kvm_host.h | 1 +
 virt/kvm/kvm_main.c  | 5 -
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1a0df7b83fd0..9682282cb258 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -517,6 +517,7 @@ struct kvm {
pid_t userspace_pid;
unsigned int max_halt_poll_ns;
 
+   bool dirty_log_unsupported;
 #ifdef __KVM_HAVE_READONLY_MEM
bool readonly_mem_unsupported;
 #endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 572a66a61c29..aa5f27753756 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1103,7 +1103,10 @@ static void update_memslots(struct kvm_memslots *slots,
 static int check_memory_region_flags(struct kvm *kvm,
 const struct kvm_userspace_memory_region 
*mem)
 {
-   u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
+   u32 valid_flags = 0;
+
+   if (!kvm->dirty_log_unsupported)
+   valid_flags |= KVM_MEM_LOG_DIRTY_PAGES;
 
 #ifdef __KVM_HAVE_READONLY_MEM
if (!kvm->readonly_mem_unsupported)
-- 
2.17.1



[RFC PATCH 43/67] KVM: x86/mmu: Introduce kvm_mmu_map_tdp_page() for use by TDX

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Introduce a helper to directly (pun intended) fault in a TDP page
without having to go through the full page fault path.  This allows
TDX to get the resulting pfn and also allows the RET_PF_* enums to
stay in mmu.c where they belong.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu.h |  3 +++
 arch/x86/kvm/mmu/mmu.c | 25 +
 2 files changed, 28 insertions(+)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 3b1243cfc280..a6bb930d1549 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -115,6 +115,9 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu 
*vcpu, gpa_t cr2_or_gpa,
return vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, err, prefault);
 }
 
+kvm_pfn_t kvm_mmu_map_tdp_page(struct kvm_vcpu *vcpu, gpa_t gpa,
+  u32 error_code, int max_level);
+
 /*
  * Currently, we have two sorts of write-protection, a) the first one
  * write-protects guest page to sync the guest modification, b) another one is
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 474173bceb54..bb59e80ade81 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4034,6 +4034,31 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, 
u32 error_code,
 max_level, true, &pfn);
 }
 
+kvm_pfn_t kvm_mmu_map_tdp_page(struct kvm_vcpu *vcpu, gpa_t gpa,
+  u32 error_code, int max_level)
+{
+   kvm_pfn_t pfn;
+   int r;
+
+   if (mmu_topup_memory_caches(vcpu, false))
+   return KVM_PFN_ERR_FAULT;
+
+   /*
+* Loop on the page fault path to handle the case where an mmu_notifier
+* invalidation triggers RET_PF_RETRY.  In the normal page fault path,
+* KVM needs to resume the guest in case the invalidation changed any
+* of the page fault properties, i.e. the gpa or error code.  For this
+* path, the gpa and error code are fixed by the caller, and the caller
+* expects failure if and only if the page fault can't be fixed.
+*/
+   do {
+   r = direct_page_fault(vcpu, gpa, error_code, false, max_level,
+ true, &pfn);
+   } while (r == RET_PF_RETRY && !is_error_noslot_pfn(pfn));
+   return pfn;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_map_tdp_page);
+
 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
   struct kvm_mmu *context)
 {
-- 
2.17.1
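
A usage sketch of the new helper (hypothetical caller; the error code and
level arguments are only illustrative):

        /* Pre-fault a guest page and get its pfn, e.g. when populating TD memory. */
        kvm_pfn_t pfn = kvm_mmu_map_tdp_page(vcpu, gpa, PFERR_WRITE_MASK,
                                             PG_LEVEL_4K);
        if (is_error_noslot_pfn(pfn))
                return -EFAULT;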



[RFC PATCH 21/67] KVM: x86: Add flag to mark TSC as immutable (for TDX)

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

The TSC for TDX1 guests is fixed at TD creation time.  Add tsc_immutable
to reflect that the TSC of the guest cannot be changed in any way, and
use it to short circuit all paths that lead to one of the myriad TSC
adjustment flows.

Suggested-by: Kai Huang 
Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/x86.c  | 35 +
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 00b34d8f038b..e5b706889d09 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -968,6 +968,7 @@ struct kvm_arch {
int audit_point;
#endif
 
+   bool tsc_immutable;
bool backwards_tsc_observed;
bool boot_vcpu_runs_old_kvmclock;
u32 bsp_vcpu_id;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2f4b226d5b89..01380f057d9f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2040,7 +2040,9 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 
user_tsc_khz, bool scale)
u64 ratio;
 
/* Guest TSC same frequency as host TSC? */
-   if (!scale) {
+   if (!scale || vcpu->kvm->arch.tsc_immutable) {
+   if (scale)
+   pr_warn_ratelimited("Guest TSC immutable, scaling not supported\n");
vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
return 0;
}
@@ -2216,6 +2218,9 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, 
u64 data)
bool already_matched;
bool synchronizing = false;
 
+   if (WARN_ON_ONCE(vcpu->kvm->arch.tsc_immutable))
+   return;
+
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_compute_tsc_offset(vcpu, data);
ns = get_kvmclock_base_ns();
@@ -2641,6 +2646,10 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
u8 pvclock_flags;
bool use_master_clock;
 
+   /* Unable to update guest time if the TSC is immutable. */
+   if (ka->tsc_immutable)
+   return 0;
+
kernel_ns = 0;
host_tsc = 0;
 
@@ -3915,7 +3924,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
 
-   if (kvm_check_tsc_unstable()) {
+   if (kvm_check_tsc_unstable() &&
+   !vcpu->kvm->arch.tsc_immutable) {
u64 offset = kvm_compute_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
kvm_vcpu_write_tsc_offset(vcpu, offset);
@@ -3929,7 +3939,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 * On a host with synchronized TSC, there is no need to update
 * kvmclock on vcpu->cpu migration
 */
-   if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
+   if ((!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) &&
+   !vcpu->kvm->arch.tsc_immutable)
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
if (vcpu->cpu != cpu)
kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
@@ -4888,10 +4899,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_SET_TSC_KHZ: {
-   u32 user_tsc_khz;
+   u32 user_tsc_khz = (u32)arg;
 
r = -EINVAL;
-   user_tsc_khz = (u32)arg;
+   if (vcpu->kvm->arch.tsc_immutable)
+   goto out;
 
if (kvm_has_tsc_control &&
user_tsc_khz >= kvm_max_guest_tsc_khz)
@@ -10013,9 +10025,12 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
if (mutex_lock_killable(&vcpu->mutex))
return;
-   vcpu_load(vcpu);
-   kvm_synchronize_tsc(vcpu, 0);
-   vcpu_put(vcpu);
+
+   if (!kvm->arch.tsc_immutable) {
+   vcpu_load(vcpu);
+   kvm_synchronize_tsc(vcpu, 0);
+   vcpu_put(vcpu);
+   }
 
/* poll control enabled by default */
vcpu->arch.msr_kvm_poll_control = 1;
@@ -10209,6 +10224,10 @@ int kvm_arch_hardware_enable(void)
if (backwards_tsc) {
u64 delta_cyc = max_tsc - local_tsc;
list_for_each_entry(kvm, &vm_list, vm_list) {
+   if (vcpu->kvm->arch.tsc_immutable) {
+   pr_warn_ratelimited("Backwards TSC observed and guest with immutable TSC active\n");
+   continue;
+   }
kvm->arch.backwards_tsc_observed = true;
kvm_for_each_vcpu(i, vcpu, kvm) {
vcpu->arch.tsc_offset_adjustment += 

[RFC PATCH 48/67] KVM: VMX: Define VMCS encodings for shared EPT pointer

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Add the VMCS field encoding for the shared EPTP, which will be used by
TDX to have separate EPT walks for private GPAs (existing EPTP) versus
shared GPAs (new shared EPTP).

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/vmx.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 8a3a2e2dc208..7c968f66d926 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -222,6 +222,8 @@ enum vmcs_field {
ENCLS_EXITING_BITMAP_HIGH   = 0x202F,
TSC_MULTIPLIER  = 0x2032,
TSC_MULTIPLIER_HIGH = 0x2033,
+   SHARED_EPT_POINTER  = 0x203C,
+   SHARED_EPT_POINTER_HIGH = 0x203D,
GUEST_PHYSICAL_ADDRESS  = 0x2400,
GUEST_PHYSICAL_ADDRESS_HIGH = 0x2401,
VMCS_LINK_POINTER   = 0x2800,
-- 
2.17.1



[RFC PATCH 26/67] KVM: x86: Add kvm_x86_ops .cache_gprs() and .flush_gprs()

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Add hooks to cache and flush GPRs and invoke them from KVM_GET_REGS and
KVM_SET_REGS respectively.  TDX will use the hooks to read/write GPRs
from TDX-SEAM on-demand (for debug TDs).

Cc: Tom Lendacky 
Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h | 2 ++
 arch/x86/kvm/x86.c  | 6 ++
 2 files changed, 8 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7537ba0bada2..01c78eeefef4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1130,6 +1130,8 @@ struct kvm_x86_ops {
void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
+   void (*cache_gprs)(struct kvm_vcpu *vcpu);
+   void (*flush_gprs)(struct kvm_vcpu *vcpu);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a1c57d1eb460..22e956f01ddc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9385,6 +9385,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 
 static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
+   if (kvm_x86_ops.cache_gprs)
+   kvm_x86_ops.cache_gprs(vcpu);
+
if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
/*
 * We are here if userspace calls get_regs() in the middle of
@@ -9459,6 +9462,9 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct 
kvm_regs *regs)
 
vcpu->arch.exception.pending = false;
 
+   if (kvm_x86_ops.flush_gprs)
+   kvm_x86_ops.flush_gprs(vcpu);
+
kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
-- 
2.17.1



[RFC PATCH 67/67] KVM: X86: not for review: add dummy file for TDX-SEAM module

2020-11-16 Thread isaku . yamahata
From: Isaku Yamahata 

This patch is not for review, but to make the build succeed.
Add a dummy empty file for the TDX-SEAM module as
linux/lib/firmware/intel-seam/libtdx.so.

TDX-SEAM module isn't published. Its specification is at [1].

[1] Intel TDX Module 1.0 EAS
https://software.intel.com/content/dam/develop/external/us/en/documents/intel-tdx-module-1eas.pdf

Signed-off-by: Isaku Yamahata 
---
 lib/firmware/intel-seam/libtdx.so | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 lib/firmware/intel-seam/libtdx.so

diff --git a/lib/firmware/intel-seam/libtdx.so 
b/lib/firmware/intel-seam/libtdx.so
new file mode 100644
index ..e69de29bb2d1
-- 
2.17.1



[RFC PATCH 08/67] KVM: x86/mmu: Zap only leaf SPTEs for deleted/moved memslot by default

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Zap only leaf SPTEs when deleting/moving a memslot by default, and add a
module param to allow reverting to the old behavior of zapping all SPTEs
at all levels and memslots when any memslot is updated.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu/mmu.c | 20 +++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 1f96adff8dc4..3c7e43e12513 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -84,6 +84,9 @@ __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
 static bool __read_mostly force_flush_and_sync_on_reuse;
 module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
 
+static bool __read_mostly memslot_update_zap_all;
+module_param(memslot_update_zap_all, bool, 0444);
+
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
  * where the hardware walks 2 page tables:
@@ -5441,11 +5444,26 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm 
*kvm)
return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
 }
 
+static void kvm_mmu_zap_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+   /*
+* Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst
+* case scenario we'll have unused shadow pages lying around until they
+* are recycled due to age or when the VM is destroyed.
+*/
+   spin_lock(&kvm->mmu_lock);
+   slot_handle_all_level(kvm, slot, kvm_zap_rmapp, true);
+   spin_unlock(&kvm->mmu_lock);
+}
+
 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot,
struct kvm_page_track_notifier_node *node)
 {
-   kvm_mmu_zap_all_fast(kvm);
+   if (memslot_update_zap_all)
+   kvm_mmu_zap_all_fast(kvm);
+   else
+   kvm_mmu_zap_memslot(kvm, slot);
 }
 
 void kvm_mmu_init_vm(struct kvm *kvm)
-- 
2.17.1
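
Since the parameter lives in mmu.c, which is built into kvm.ko, the old
zap-everything behavior can be restored with "kvm.memslot_update_zap_all=1"
on the kernel command line (or the equivalent modprobe option); the 0444
permissions make it read-only once the module is loaded.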



[RFC PATCH 47/67] KVM: VMX: Define EPT Violation architectural bits

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Define the EPT Violation #VE control bit, #VE info VMCS fields, and the
suppress #VE bit for EPT entries.

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/vmx.h | 4 
 arch/x86/include/asm/vmxfeatures.h | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f8ba5289ecb0..8a3a2e2dc208 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -67,6 +67,7 @@
 #define SECONDARY_EXEC_ENCLS_EXITING   VMCS_CONTROL_BIT(ENCLS_EXITING)
 #define SECONDARY_EXEC_RDSEED_EXITING  VMCS_CONTROL_BIT(RDSEED_EXITING)
 #define SECONDARY_EXEC_ENABLE_PML          VMCS_CONTROL_BIT(PAGE_MOD_LOGGING)
+#define SECONDARY_EXEC_EPT_VIOLATION_VE    VMCS_CONTROL_BIT(EPT_VIOLATION_VE)
 #define SECONDARY_EXEC_PT_CONCEAL_VMX  VMCS_CONTROL_BIT(PT_CONCEAL_VMX)
 #define SECONDARY_EXEC_XSAVES  VMCS_CONTROL_BIT(XSAVES)
 #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC)
@@ -213,6 +214,8 @@ enum vmcs_field {
VMREAD_BITMAP_HIGH  = 0x2027,
VMWRITE_BITMAP  = 0x2028,
VMWRITE_BITMAP_HIGH = 0x2029,
+   VE_INFO_ADDRESS = 0x202A,
+   VE_INFO_ADDRESS_HIGH= 0x202B,
XSS_EXIT_BITMAP = 0x202C,
XSS_EXIT_BITMAP_HIGH= 0x202D,
ENCLS_EXITING_BITMAP= 0x202E,
@@ -495,6 +498,7 @@ enum vmcs_field {
 #define VMX_EPT_IPAT_BIT   (1ull << 6)
 #define VMX_EPT_ACCESS_BIT (1ull << 8)
 #define VMX_EPT_DIRTY_BIT  (1ull << 9)
+#define VMX_EPT_SUPPRESS_VE_BIT(1ull << 63)
 #define VMX_EPT_RWX_MASK   (VMX_EPT_READABLE_MASK |  \
 VMX_EPT_WRITABLE_MASK |   \
 VMX_EPT_EXECUTABLE_MASK)
diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h
index 9915990fd8cf..9013e383fee6 100644
--- a/arch/x86/include/asm/vmxfeatures.h
+++ b/arch/x86/include/asm/vmxfeatures.h
@@ -75,7 +75,7 @@
 #define VMX_FEATURE_ENCLS_EXITING  ( 2*32+ 15) /* "" VM-Exit on ENCLS (leaf dependent) */
 #define VMX_FEATURE_RDSEED_EXITING ( 2*32+ 16) /* "" VM-Exit on RDSEED */
 #define VMX_FEATURE_PAGE_MOD_LOGGING   ( 2*32+ 17) /* "pml" Log dirty pages into buffer */
-#define VMX_FEATURE_EPT_VIOLATION_VE   ( 2*32+ 18) /* "" Conditionally reflect EPT violations as #VE exceptions */
+#define VMX_FEATURE_EPT_VIOLATION_VE   ( 2*32+ 18) /* Conditionally reflect EPT violations as #VE exceptions */
 #define VMX_FEATURE_PT_CONCEAL_VMX ( 2*32+ 19) /* "" Suppress VMX indicators in Processor Trace */
 #define VMX_FEATURE_XSAVES ( 2*32+ 20) /* "" Enable XSAVES and XRSTORS in guest */
 #define VMX_FEATURE_MODE_BASED_EPT_EXEC ( 2*32+ 22) /* "ept_mode_based_exec" Enable separate EPT EXEC bits for supervisor vs. user */
-- 
2.17.1



[RFC PATCH 62/67] KVM: TDX: Load and init TDX-SEAM module during boot

2020-11-16 Thread isaku . yamahata
From: Sean Christopherson 

Add a hook into the early boot flow to load TDX-SEAM and do BSP-only
init of TDX-SEAM.

Perform the TDSYSINIT and TDSYSINITLP sequence to initialize TDX during
kernel boot.  Call TDSYSINIT on the BSP for platform-level initialization,
and call TDSYSINITLP on all CPUs for per-CPU initialization.

On the BSP, also call TDSYSINFO to get the TDX info right after TDSYSINITLP.
While TDX initialization on APs is done in identify_cpu() when each AP is
brought up, on the BSP it is done right after the SEAM module is loaded,
not in identify_cpu().  The reason is that constructing TDMRs must be done
before the kernel's normal page allocator is up, since it requires
reserving a large amount of memory for PAMTs (>4MB), which the page
allocator cannot provide.  And how much memory to reserve for PAMTs depends
on the TDX info returned by TDSYSINFO, so that too must be done on the BSP
right after TDSYSINITLP.

Check kernel parameters and other variables that prevent/indicate that
not all logical CPUs can be onlined.  TDSYSINITLP must be called on all
logical CPUs as part of TDX-SEAM configuration, e.g. TDSYSCONFIG is
guaranteed to fail if not all CPUs are onlined.

Query the 'nr_cpus', 'possible_cpus' and 'maxcpus' kernel parameters, as
well as the 'disabled_cpus' counter that can be incremented during ACPI
parsing (CPUs marked as disabled cannot be brought up later).

Note, the kernel ignores the "Online Capable" bit defined in the ACPI
specification v6.3, section 5.2.12.2 Processor Local APIC Structure:

  CPUs marked as disabled ("Enabled" bit cleared) can be brought up
  later by the OS if the "Online Capable" bit is set.

and simply treats ACPI hot-added CPUs as enabled, i.e. with ACPI CPU
hotplug, the aforementioned variables can change dynamically post-boot.
But CPU hotplug is unsupported on TDX-enabled systems, therefore the
variables are effectively constant once a TDX system has booted.

In the post-SMP boot phase (tdx_init()), verify that all present CPUs
were successfully booted.  Note that this also covers the SMT=off case,
i.e. verifies that to-be-disabled sibling threads are booted and run
through TDSYSINITLP.

Detect the TDX private keyID range by reading MSR_IA32_MKTME_KEYID_PART,
which is configured by BIOS and partitions the MKTME KeyID space into
regular KeyIDs and TDX-only KeyIDs.  Disable TDX if the partitioning is
not consistent across all CPUs, i.e. if BIOS screwed up.

Construct Trust Domain Memory Regions (TDMRs) based on info reported by
TDSYSINFO.  For simplicity, all system memory is configured as TDMRs,
otherwise the page allocator would need to be modified to distinguish
normal and TD memory allocations.  The overhead of marking all memory as TDMRs
consists of the memory needed for TDX-SEAM's Physical Address Metadata
Tables (PAMTs) used to track TDMRs.

TDMRs are constructed (and PAMTs associated with TDMRs are reserved)
on a per-NUMA-node basis for better performance: when accessing TD
memory in a TDMR, the CPU doesn't have to access a PAMT on a remote node.

Sanity check that the CMRs reported by TDSYSINFO have covered all memory
reported in e820, and disable TDX if there is a discrepancy.  If there
is memory available to the kernel (reported in e820) that is not covered
by a TDMR then it's possible the page allocator will allocate a page
that's not usable for a TD's memory, i.e. would break KVM.

Once all enumeration and sanity checking is done, call TDSYSCONFIG,
TDSYSCONFIGKEY and TDSYSINITTDMR to configure and initialize TDMRs.

Signed-off-by: Kai Huang 
Co-developed-by: Xiaoyao Li 
Signed-off-by: Xiaoyao Li 
Co-developed-by: Sean Christopherson 
Signed-off-by: Sean Christopherson 
---
 arch/x86/Kbuild |1 +
 arch/x86/include/asm/kvm_boot.h |   43 +
 arch/x86/kernel/cpu/intel.c |4 +
 arch/x86/kernel/setup.c |3 +
 arch/x86/kvm/Kconfig|8 +
 arch/x86/kvm/boot/Makefile  |5 +
 arch/x86/kvm/boot/seam/seamldr.S|  188 +
 arch/x86/kvm/boot/seam/seamloader.c |  162 
 arch/x86/kvm/boot/seam/tdx.c| 1131 +++
 9 files changed, 1545 insertions(+)
 create mode 100644 arch/x86/include/asm/kvm_boot.h
 create mode 100644 arch/x86/kvm/boot/Makefile
 create mode 100644 arch/x86/kvm/boot/seam/seamldr.S
 create mode 100644 arch/x86/kvm/boot/seam/seamloader.c
 create mode 100644 arch/x86/kvm/boot/seam/tdx.c

diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 30dec019756b..4f35eaad7468 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -4,6 +4,7 @@ obj-y += entry/
 obj-$(CONFIG_PERF_EVENTS) += events/
 
 obj-$(CONFIG_KVM) += kvm/
+obj-$(subst m,y,$(CONFIG_KVM)) += kvm/boot/
 
 # Xen paravirtualization support
 obj-$(CONFIG_XEN) += xen/
diff --git a/arch/x86/include/asm/kvm_boot.h b/arch/x86/include/asm/kvm_boot.h
new file mode 100644
index ..5054fb324283
--- /dev/null
+++ b/arch/x86/include/asm/kvm_boot.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_X86_KVM_BOOT_H
+#define _ASM_X86_KVM_BOOT_H
+

Re: [PATCH 0/3] x86/mtrr, pat: make PAT independent from MTRR

2019-08-13 Thread Isaku Yamahata
On Fri, Aug 09, 2019 at 07:51:17PM +,
"Kani, Toshi"  wrote:

> On Fri, 2019-08-09 at 09:06 +0200, Borislav Petkov wrote:
> > On Thu, Aug 08, 2019 at 08:54:17PM -0700, Isaku Yamahata wrote:
> > > Make PAT(Page Attribute Table) independent from
> > > MTRR(Memory Type Range Register).
> > > Some environments (mainly virtual ones) support only PAT, but not MTRR
> > > because PAT replaces MTRR.
> > > It's tricky and no gain to support both MTRR and PAT except compatibility.
> > > So some VM technologies don't support MTRR, but only PAT.
> 
> I do not think it is technically correct on bare metal.  AFAIK, MTRR is
> still the only way to setup cache attribute in real-mode, which BIOS SMI
> handler relies on in SMM.

Then are you claiming that on bare metal, MTRR and PAT should be
enabled/disabled at the same time?


> > > This patch series makes PAT available on such environments without MTRR.
> > 
> > And this "justification" is not even trying. Which "VM technologies" are
> > those? Why do we care? What's the impact? Why do we want this?
> > 
> > You need to sell this properly.
> 
> Agreed.  If the situation is still the same, Xen does not support MTRR,
> and the kernel sets the PAT table to the BIOS hand-off state when MTRR
> is disabled.  The change below accommodated the fact that Xen hypervisor
> enables WC before hand-off, which is different from the default BIOS
> hand-off state.  The kernel does not support setting PAT when MTRR is
> disabled due to the dependency Isaku mentioned.
> 
> 
> https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1107094.html

Thanks for the supplement.
In addition to Xen, KVM+QEMU can enable/disable MTRR and PAT independently,
so a user may want to disable MTRR to reduce the attack surface.
ACRN doesn't support MTRR.

Let me include those descriptions in the next respin.
-- 
Isaku Yamahata 


[PATCH 0/3] x86/mtrr, pat: make PAT independent from MTRR

2019-08-08 Thread Isaku Yamahata
Make PAT (Page Attribute Table) independent from
MTRR (Memory Type Range Register).
Some environments (mainly virtual ones) support only PAT, not MTRR,
because PAT supersedes MTRR.
Supporting both MTRR and PAT is tricky and gains nothing except
compatibility, so some VM technologies support only PAT.
This patch series makes PAT available in such environments without MTRR.

Patches 1 and 2 are only preparation: no logic change, just a function
rename (mtrr_ => mtrr_pat_ for functions commonly used by both MTRR and
PAT) and moving those functions out of MTRR-specific files to a common file.
Patch 3 is the essential patch that makes PAT independent from MTRR.

Isaku Yamahata (3):
  x86/mtrr: split common funcs from mtrr.c
  x86/mtrr: split common funcs from generic.c
  x86/mtrr, pat: make PAT independent from MTRR

 arch/x86/Kconfig  |   1 -
 arch/x86/include/asm/mtrr.h   |  37 ++-
 arch/x86/include/asm/pat.h|   2 +
 arch/x86/kernel/cpu/common.c  |   2 +-
 arch/x86/kernel/cpu/mtrr/Makefile |   2 +-
 arch/x86/kernel/cpu/mtrr/generic.c| 116 +
 arch/x86/kernel/cpu/mtrr/mtrr.c   | 211 +
 arch/x86/kernel/cpu/mtrr/mtrr.h   |   8 +-
 arch/x86/kernel/cpu/mtrr/rendezvous.c | 324 ++
 arch/x86/kernel/setup.c   |   4 +-
 arch/x86/kernel/smpboot.c |   8 +-
 arch/x86/mm/Makefile  |   3 +
 arch/x86/mm/pat.c |  99 +++-
 arch/x86/power/cpu.c  |   2 +-
 14 files changed, 479 insertions(+), 340 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/mtrr/rendezvous.c

-- 
2.17.1



[PATCH 2/3] x86/mtrr: split common funcs from generic.c

2019-08-08 Thread Isaku Yamahata
This is preparation for making PAT (Page Attribute Table) independent
from MTRR (Memory Type Range Register).
It renames the prefix of functions in mtrr/generic.c that are commonly
used by both MTRR and PAT from mtrr_ to mtrr_pat_ and moves them out of
mtrr/generic.c to rendezvous.c.
Only the prefix rename and the move, no logic change.

Signed-off-by: Isaku Yamahata 
---
 arch/x86/include/asm/mtrr.h   |   4 +
 arch/x86/kernel/cpu/mtrr/generic.c| 111 ++
 arch/x86/kernel/cpu/mtrr/mtrr.c   |   2 +-
 arch/x86/kernel/cpu/mtrr/mtrr.h   |   3 +-
 arch/x86/kernel/cpu/mtrr/rendezvous.c |  91 +
 5 files changed, 106 insertions(+), 105 deletions(-)

diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index d90e87c55302..5b056374f5a6 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -33,6 +33,8 @@
  */
 # ifdef CONFIG_MTRR
 extern bool mtrr_enabled(void);
+extern void mtrr_pat_prepare_set(void) __acquires(set_atomicity_lock);
+extern void mtrr_pat_post_set(void) __releases(set_atomicity_lock);
 extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform);
 extern void mtrr_save_fixed_ranges(void *);
 extern void mtrr_save_state(void);
@@ -55,6 +57,8 @@ static inline bool mtrr_enabled(void)
 {
return false;
 }
+static inline void mtrr_pat_prepare_set(void) { };
+static inline void mtrr_pat_post_set(void) { };
 static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform)
 {
/*
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c 
b/arch/x86/kernel/cpu/mtrr/generic.c
index aa5c064a6a22..a44f05f64846 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -397,9 +397,6 @@ print_fixed(unsigned base, unsigned step, const mtrr_type 
*types)
}
 }
 
-static void prepare_set(void);
-static void post_set(void);
-
 static void __init print_mtrr_state(void)
 {
unsigned int i;
@@ -445,20 +442,6 @@ static void __init print_mtrr_state(void)
pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
 }
 
-/* PAT setup for BP. We need to go through sync steps here */
-void __init mtrr_bp_pat_init(void)
-{
-   unsigned long flags;
-
-   local_irq_save(flags);
-   prepare_set();
-
-   pat_init();
-
-   post_set();
-   local_irq_restore(flags);
-}
-
 /* Grab all of the MTRR state for this CPU into *state */
 bool __init get_mtrr_state(void)
 {
@@ -680,8 +663,6 @@ static bool set_mtrr_var_ranges(unsigned int index, struct 
mtrr_var_range *vr)
return changed;
 }
 
-static u32 deftype_lo, deftype_hi;
-
 /**
  * set_mtrr_state - Set the MTRR state for this CPU.
  *
@@ -705,100 +686,24 @@ static unsigned long set_mtrr_state(void)
 * Set_mtrr_restore restores the old value of MTRRdefType,
 * so to set it we fiddle with the saved value:
 */
-   if ((deftype_lo & 0xff) != mtrr_state.def_type
-   || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
+   if ((mtrr_deftype_lo & 0xff) != mtrr_state.def_type
+   || ((mtrr_deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
 
-   deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type |
-(mtrr_state.enabled << 10);
+   mtrr_deftype_lo = (mtrr_deftype_lo & ~0xcff) |
+   mtrr_state.def_type | (mtrr_state.enabled << 10);
change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
}
 
return change_mask;
 }
 
-
-static unsigned long cr4;
-static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
-
-/*
- * Since we are disabling the cache don't allow any interrupts,
- * they would run extremely slow and would only increase the pain.
- *
- * The caller must ensure that local interrupts are disabled and
- * are reenabled after post_set() has been called.
- */
-static void prepare_set(void) __acquires(set_atomicity_lock)
-{
-   unsigned long cr0;
-
-   /*
-* Note that this is not ideal
-* since the cache is only flushed/disabled for this CPU while the
-* MTRRs are changed, but changing this requires more invasive
-* changes to the way the kernel boots
-*/
-
-   raw_spin_lock(&set_atomicity_lock);
-
-   /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
-   cr0 = read_cr0() | X86_CR0_CD;
-   write_cr0(cr0);
-
-   /*
-* Cache flushing is the most time-consuming step when programming
-* the MTRRs. Fortunately, as per the Intel Software Development
-* Manual, we can skip it if the processor supports cache self-
-* snooping.
-*/
-   if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
-   wbinvd();
-
-   /* Save value of CR4 and clear Page Global Enable (bit 7) */
-   if (boot_cpu_has(X86_FEATURE_PGE)) {
-   cr4 = __read_cr4();
-

[PATCH 3/3] x86/mtrr, pat: make PAT independent from MTRR

2019-08-08 Thread Isaku Yamahata
This patch makes PAT (Page Attribute Table) independent from
MTRR (Memory Type Range Register).
Some environments (mainly virtual ones) support only PAT, not MTRR.
Supporting both MTRR and PAT at the same time is tricky and gains
nothing except compatibility, because PAT supersedes MTRR, so some VM
technologies support only PAT.
This patch makes PAT available in such environments without MTRR.

Signed-off-by: Isaku Yamahata 
---
 arch/x86/Kconfig  |  1 -
 arch/x86/include/asm/mtrr.h   | 32 +
 arch/x86/include/asm/pat.h|  2 +
 arch/x86/kernel/cpu/mtrr/generic.c|  5 --
 arch/x86/kernel/cpu/mtrr/mtrr.c   |  8 +--
 arch/x86/kernel/cpu/mtrr/mtrr.h   |  1 -
 arch/x86/kernel/cpu/mtrr/rendezvous.c | 76 +++-
 arch/x86/mm/Makefile  |  3 +
 arch/x86/mm/pat.c | 99 ---
 9 files changed, 158 insertions(+), 69 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 222855cc0158..5654283e010f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1838,7 +1838,6 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
 config X86_PAT
def_bool y
prompt "x86 PAT support" if EXPERT
-   depends on MTRR
---help---
  Use PAT attributes to setup page level cache control.
 
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index 5b056374f5a6..a401ad106c28 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -31,10 +31,25 @@
  * The following functions are for use by other drivers that cannot use
  * arch_phys_wc_add and arch_phys_wc_del.
  */
-# ifdef CONFIG_MTRR
-extern bool mtrr_enabled(void);
+#if defined(CONFIG_MTRR) || defined(CONFIG_X86_PAT)
+/* common method for MTRR and PAT */
 extern void mtrr_pat_prepare_set(void) __acquires(set_atomicity_lock);
 extern void mtrr_pat_post_set(void) __releases(set_atomicity_lock);
+extern void mtrr_pat_ap_init(void);
+extern void set_mtrr_pat_aps_delayed_init(void);
+extern void mtrr_pat_aps_init(void);
+extern void mtrr_pat_bp_restore(void);
+#else
+static inline void mtrr_pat_prepare_set(void) { }
+static inline void mtrr_pat_post_set(void) { }
+static inline void mtrr_pat_ap_init(void) { };
+static inline void set_mtrr_pat_aps_delayed_init(void) { };
+static inline void mtrr_pat_aps_init(void) { };
+static inline void mtrr_pat_bp_restore(void) { };
+#endif
+
+# ifdef CONFIG_MTRR
+extern bool mtrr_enabled(void);
 extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform);
 extern void mtrr_save_fixed_ranges(void *);
 extern void mtrr_save_state(void);
@@ -45,11 +60,7 @@ extern int mtrr_add_page(unsigned long base, unsigned long 
size,
 extern int mtrr_del(int reg, unsigned long base, unsigned long size);
 extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
 extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
-extern void mtrr_pat_ap_init(void);
 extern void mtrr_pat_bp_init(void);
-extern void set_mtrr_pat_aps_delayed_init(void);
-extern void mtrr_pat_aps_init(void);
-extern void mtrr_pat_bp_restore(void);
 extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
 extern int amd_special_default_mtrr(void);
 #  else
@@ -57,8 +68,6 @@ static inline bool mtrr_enabled(void)
 {
return false;
 }
-static inline void mtrr_pat_prepare_set(void) { };
-static inline void mtrr_pat_post_set(void) { };
 static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform)
 {
/*
@@ -95,13 +104,8 @@ static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, 
u32 hi)
 }
 static inline void mtrr_pat_bp_init(void)
 {
-   pat_disable("MTRRs disabled, skipping PAT initialization too.");
+   pat_bp_init();
 }
-
-static inline void mtrr_pat_ap_init(void) { };
-static inline void set_mtrr_pat_aps_delayed_init(void) { };
-static inline void mtrr_pat_aps_init(void) { };
-static inline void mtrr_pat_bp_restore(void) { };
 #  endif
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index 92015c65fa2a..2a355ce94ebf 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -7,7 +7,9 @@
 
 bool pat_enabled(void);
 void pat_disable(const char *reason);
+extern void pat_set(void);
 extern void pat_init(void);
+extern void pat_bp_init(void);
 extern void init_cache_modes(void);
 
 extern int reserve_memtype(u64 start, u64 end,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c 
b/arch/x86/kernel/cpu/mtrr/generic.c
index a44f05f64846..f9a7ca79e2c2 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -6,13 +6,8 @@
 #define DEBUG
 
 #include 
-#include 
-#include 
 #include 
 
-#include 
-#include 
-#include 
 #include 
 #include 
 #include 
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 475627ca2c1b..2d28c9b37ae7 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/ker

[PATCH 1/3] x86/mtrr: split common funcs from mtrr.c

2019-08-08 Thread Isaku Yamahata
This is a preparation for making PAT (Page Attribute Table) independent
of MTRR (Memory Type Range Register).
It renames the prefix of the functions in mtrr.c that are shared by both
MTRR and PAT from mtrr_ to mtrr_pat_, and moves them out of mtrr.c into
rendezvous.c.
Prefix rename and code movement only; no logic change.
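
For orientation, the code that ends up in rendezvous.c has roughly the
following shape: a stop_machine() based rendezvous that every CPU runs
in lock-step. This is a sketch only; the struct layout and function
names here are illustrative, the real file keeps the existing mtrr.c
logic unchanged.

#include <linux/cpumask.h>
#include <linux/stop_machine.h>
#include <asm/mtrr.h>
/* mtrr_if comes from the local mtrr.h */

struct set_mtrr_data {			/* argument block handed to each CPU */
	unsigned long	smp_base;
	unsigned long	smp_size;
	unsigned int	smp_reg;
	mtrr_type	smp_type;
};

/* Runs on every online CPU at the same time under stop_machine(). */
static int mtrr_pat_rendezvous_handler(void *info)
{
	struct set_mtrr_data *data = info;

	/* enter no-fill cache mode, update the range registers, restore */
	mtrr_if->set(data->smp_reg, data->smp_base,
		     data->smp_size, data->smp_type);
	return 0;
}

static void set_mtrr(unsigned int reg, unsigned long base,
		     unsigned long size, mtrr_type type)
{
	struct set_mtrr_data data = {
		.smp_reg  = reg,  .smp_base = base,
		.smp_size = size, .smp_type = type,
	};

	stop_machine(mtrr_pat_rendezvous_handler, &data, cpu_online_mask);
}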

Signed-off-by: Isaku Yamahata 
---
 arch/x86/include/asm/mtrr.h   |  25 +--
 arch/x86/kernel/cpu/common.c  |   2 +-
 arch/x86/kernel/cpu/mtrr/Makefile |   2 +-
 arch/x86/kernel/cpu/mtrr/mtrr.c   | 201 ++-
 arch/x86/kernel/cpu/mtrr/mtrr.h   |   6 +
 arch/x86/kernel/cpu/mtrr/rendezvous.c | 221 ++
 arch/x86/kernel/setup.c   |   4 +-
 arch/x86/kernel/smpboot.c |   8 +-
 arch/x86/power/cpu.c  |   2 +-
 9 files changed, 260 insertions(+), 211 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/mtrr/rendezvous.c

diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index dbff1456d215..d90e87c55302 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -32,6 +32,7 @@
  * arch_phys_wc_add and arch_phys_wc_del.
  */
 # ifdef CONFIG_MTRR
+extern bool mtrr_enabled(void);
 extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform);
 extern void mtrr_save_fixed_ranges(void *);
 extern void mtrr_save_state(void);
@@ -42,14 +43,18 @@ extern int mtrr_add_page(unsigned long base, unsigned long 
size,
 extern int mtrr_del(int reg, unsigned long base, unsigned long size);
 extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
 extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
-extern void mtrr_ap_init(void);
-extern void mtrr_bp_init(void);
-extern void set_mtrr_aps_delayed_init(void);
-extern void mtrr_aps_init(void);
-extern void mtrr_bp_restore(void);
+extern void mtrr_pat_ap_init(void);
+extern void mtrr_pat_bp_init(void);
+extern void set_mtrr_pat_aps_delayed_init(void);
+extern void mtrr_pat_aps_init(void);
+extern void mtrr_pat_bp_restore(void);
 extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
 extern int amd_special_default_mtrr(void);
 #  else
+static inline bool mtrr_enabled(void)
+{
+   return false;
+}
 static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform)
 {
/*
@@ -84,15 +89,15 @@ static inline int mtrr_trim_uncached_memory(unsigned long 
end_pfn)
 static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
 {
 }
-static inline void mtrr_bp_init(void)
+static inline void mtrr_pat_bp_init(void)
 {
pat_disable("MTRRs disabled, skipping PAT initialization too.");
 }
 
-#define mtrr_ap_init() do {} while (0)
-#define set_mtrr_aps_delayed_init() do {} while (0)
-#define mtrr_aps_init() do {} while (0)
-#define mtrr_bp_restore() do {} while (0)
+static inline void mtrr_pat_ap_init(void) { };
+static inline void set_mtrr_pat_aps_delayed_init(void) { };
+static inline void mtrr_pat_aps_init(void) { };
+static inline void mtrr_pat_bp_restore(void) { };
 #  endif
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 11472178e17f..39b7942cb6fc 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1550,7 +1550,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
 #ifdef CONFIG_X86_32
enable_sep_cpu();
 #endif
-   mtrr_ap_init();
+   mtrr_pat_ap_init();
validate_apic_and_package_id(c);
x86_spec_ctrl_setup_ap();
 }
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile 
b/arch/x86/kernel/cpu/mtrr/Makefile
index cc4f9f1cb94c..e339d729f349 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
-obj-y  := mtrr.o if.o generic.o cleanup.o
+obj-y  := mtrr.o if.o generic.o cleanup.o rendezvous.o
 obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
 
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 507039c20128..3d35edb1aa42 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -35,7 +35,6 @@
 
 #include  /* FIXME: kvm_para.h needs this */
 
-#include 
 #include 
 #include 
 #include 
@@ -46,10 +45,7 @@
 #include 
 #include 
 #include 
-#include 
 
-#include 
-#include 
 #include 
 #include 
 #include 
@@ -62,7 +58,7 @@
 u32 num_var_ranges;
 static bool __mtrr_enabled;
 
-static bool mtrr_enabled(void)
+bool mtrr_enabled(void)
 {
return __mtrr_enabled;
 }
@@ -71,15 +67,11 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
 static DEFINE_MUTEX(mtrr_mutex);
 
 u64 size_or_mask, size_and_mask;
-static bool mtrr_aps_delayed_init;
 
 static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init;
 
 const struct mtrr_ops *mtrr_if;
 
-static void set_mtrr(unsigned int reg, unsigned long base,
-unsigned long size, mtrr_type type);
-
 void __init set_mtrr_ops(const struc

Re: [PATCH 0/4] madvise(MADV_USERFAULT) & sys_remap_anon_pages()

2013-05-07 Thread Isaku Yamahata
On Mon, May 06, 2013 at 09:56:57PM +0200, Andrea Arcangeli wrote:
> Hello everyone,
> 
> this is a patchset to implement two new kernel features:
> MADV_USERFAULT and remap_anon_pages.
> 
> The combination of the two features are what I would propose to
> implement postcopy live migration, and in general demand paging of
> remote memory, hosted in different cloud nodes with KSM. It might also
> be used without virt to offload parts of memory to different nodes
> using some userland library and a network memory manager.

Interesting. The API you are proposing handles only user-space faults.
What do you think about the kernel case, i.e. when the KVM kernel module
issues get_user_pages()?
Exit to qemu with a dedicated reason?


> Postcopy live migration is currently implemented using a chardevice,
> which remains open for the whole VM lifetime and all virtual memory
> then becomes owned by the chardevice and it's not anonymous anymore.
> 
> http://lists.gnu.org/archive/html/qemu-devel/2012-10/msg05274.html
> 
> The main cons of the chardevice design is that all nice Linux MM
> features (like swapping/THP/KSM/automatic-NUMA-balancing) are disabled
> if the guest physical memory doesn't remain in anonymous memory. This
> is entirely solved by this alternative kernel solution. In fact
> remap_anon_pages will move THP pages natively by just updating two pmd
> pointers if alignment and length permits without any THP split.
> 
> The other bonus is that MADV_USERFAULT and remap_anon_pages are
> implemented in the MM core and remap_anon_pages furthermore provides a
> functionality similar to what is already available for filebacked
> pages with remap_file_pages. That is usually more maintainable than
> having MM parts in a chardevice.
> 
> In addition to asking review of the internals, this also need review
> the user APIs, as both those features are userland visible changes.
> 
> MADV_USERFAULT is only enabled for anonymous mappings so far but it
> could be extended. To be strict, -EINVAL is returned if run on non
> anonymous mappings (where it would currently be a noop).
> 
> The remap_anon_pages syscall API is not vectored, as I expect it used
> for demand paging only (where there can be just one faulting range per
> fault) or for large ranges where vectoring isn't going to provide
> performance advantages.

In the precopy + postcopy optimization, the dirty bitmap is sent after
the precopy phase and then the clean pages are populated. In this
population phase a vectored API could be utilized. I'm not sure how much
a vectored API would contribute to shortening the VM-switch time, though.


> The current behavior of remap_anon_pages is very strict to avoid any
> chance of memory corruption going unnoticed, and it will return
> -EFAULT at the first sign of something unexpected (like a page already
> mapped in the destination pmd/pte, potentially signaling an userland
> thread race condition with two threads userfaulting on the same
> destination address). mremap is not strict like that: it would drop
> the destination range silently and it would succeed in such a
> condition. So on the API side, I wonder if I should add a flag to
> remap_anon_pages to provide non-strict behavior more similar to
> mremap. OTOH not providing the permissive mremap behavior may actually
> be better to force userland to be strict and be sure it knows what it
> is doing (otherwise it should use mremap in the first place?).

It would be desirable to avoid doing complex things in the signal
handler, like sending page requests to the remote side or receiving
pages from it. So the signal handler would just queue requests to
dedicated threads and wait, and the requests would be serialized.
Such strictness is not very critical, I guess. But others might find
other use cases...
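
Something like the following is what I have in mind (a user-space
sketch only: it assumes MADV_USERFAULT delivers a signal with si_addr
set, and remap_anon_pages() appears only in a comment since it is the
proposed, not yet merged, syscall):

#define _GNU_SOURCE
#include <pthread.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int fault_pipe[2];		/* signal handler -> worker queue */

static void fault_handler(int sig, siginfo_t *si, void *ctx)
{
	uintptr_t addr = (uintptr_t)si->si_addr;

	/* write(2) is async-signal-safe; queue the faulting address and
	 * nothing more.  A real handler would then wait (e.g. on a
	 * per-address futex) until the worker reports the page mapped. */
	if (write(fault_pipe[1], &addr, sizeof(addr)) != sizeof(addr))
		_exit(1);
}

static void *fault_worker(void *arg)
{
	uintptr_t addr;

	while (read(fault_pipe[0], &addr, sizeof(addr)) == sizeof(addr)) {
		/* 1. ask the remote node for the page
		 * 2. receive it into a local scratch page
		 * 3. map it at 'addr', e.g. with the proposed
		 *    remap_anon_pages(addr, scratch, page_size) */
		fprintf(stderr, "would serve page at %#lx\n",
			(unsigned long)addr);
	}
	return NULL;
}

int main(void)
{
	struct sigaction sa;
	pthread_t tid;

	if (pipe(fault_pipe))
		return 1;
	memset(&sa, 0, sizeof(sa));
	sa.sa_flags = SA_SIGINFO;
	sa.sa_sigaction = fault_handler;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGSEGV, &sa, NULL);

	pthread_create(&tid, NULL, fault_worker, NULL);
	/* ... madvise(area, size, MADV_USERFAULT) and touch the area ... */
	pause();
	return 0;
}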

thanks,

> Comments welcome, thanks!
> Andrea
> 
> Andrea Arcangeli (4):
>   mm: madvise MADV_USERFAULT
>   mm: rmap preparation for remap_anon_pages
>   mm: swp_entry_swapcount
>   mm: sys_remap_anon_pages
> 
>  arch/alpha/include/uapi/asm/mman.h |   3 +
>  arch/mips/include/uapi/asm/mman.h  |   3 +
>  arch/parisc/include/uapi/asm/mman.h|   3 +
>  arch/x86/syscalls/syscall_32.tbl   |   1 +
>  arch/x86/syscalls/syscall_64.tbl   |   1 +
>  arch/xtensa/include/uapi/asm/mman.h|   3 +
>  include/linux/huge_mm.h|   6 +
>  include/linux/mm.h |   1 +
>  include/linux/mm_types.h   |   2 +-
>  include/linux/swap.h   |   6 +
>  include/linux/syscalls.h   |   3 +
>  include/uapi/asm-generic/mman-common.h |   3 +
>  kernel/sys_ni.c|   1 +
>  mm/fremap.c| 440 
> +
>  mm/huge_memory.c   | 158 ++--
>  mm/madvise.c   |  16 ++
>  mm/memory.c|  10 +
>  mm/rmap.c  |   9 +
>  mm/swapfile.c  |  13 +
>  19 files changed, 667 insertions(+),

[PATCH v4 2/2] umem: chardevice for kvm postcopy

2012-10-30 Thread Isaku Yamahata
This is a character device to hook page access.
A page fault in the area is propagated to another user process by
this char driver. That process then fills in the page contents and
resolves the page fault.

Cc: Andrea Arcangeli 
Cc: Avi Kivity 
Cc: Paolo Bonzini 
Signed-off-by: Isaku Yamahata 

---
Changes v4 -> v5:
- rename umem to uvmem to avoid name conflict

Changes v3 -> v4:
- simplified umem_init: kill {a,}sync_req_max
- make fault handler killable even when core-dumping
- documentation

Changes v2 -> v3:
- made fault handler killable
- allow O_LARGEFILE
- improve to handle FAULT_FLAG_ALLOW_RETRY
- smart on async fault
---
 Documentation/misc-devices/uvmem.txt |  292 
 drivers/char/Kconfig |   10 +
 drivers/char/Makefile|1 +
 drivers/char/uvmem.c |  841 ++
 include/linux/uvmem.h|   41 ++
 5 files changed, 1185 insertions(+)
 create mode 100644 Documentation/misc-devices/uvmem.txt
 create mode 100644 drivers/char/uvmem.c
 create mode 100644 include/linux/uvmem.h

diff --git a/Documentation/misc-devices/uvmem.txt 
b/Documentation/misc-devices/uvmem.txt
new file mode 100644
index 000..a9c15a2
--- /dev/null
+++ b/Documentation/misc-devices/uvmem.txt
@@ -0,0 +1,292 @@
+User process backed memory driver
+=
+
+Intro
+=
+User process backed memory driver provides /dev/uvmem device.
+This /dev/uvmem device is designed for some sort of distributed shared memory.
+Especially post-copy live migration with KVM.
+
+page fault in the area backed by this driver is propagated to (other) server
+process which serves the page contents. Usually the server process fetches
+page contents from the remote machine. Then the faulting process continues.
+
+
+Kernel-User protocol
+
+ioctl
+UVMEM_INIT: Initialize the uvmem device with some parameters.
+  IN size: the area size in bytes (which is rounded up to page size)
+  OUT shmem_fd: the file descriptor of the tmpfs file that is associated with
+this uvmem device. It serves as the backing store of this uvmem device.
+
+mmap: Mapping the initialized uvmem device provides the area which
+  is served by user process.
+  The fault in this area is propagated to uvmem device via read
+  system call.
+read: kernel notifies a process that pages are faulted by returning
+  page offset in page size in u64 format.
+  uvmem device is pollable for read.
+write: Process notifies kernel that the page is ready to access
+   by writing page offset in page size in u64 format.
+
+
+operation flow
+==
+
+|
+V
+  open(/dev/uvmem)
+|
+V
+  ioctl(UVMEM_INIT)
+|
+V
+  Here we have two file descriptors to
+  uvmem device and shmem file
+|
+|  daemon process which serves
+|  page fault
+V
+  fork()---,
+|  |
+V  V
+  close(shmem) mmap(shmem file)
+|  |
+V  V
+  mmap(uvmem device)   close(shmem file)
+|  |
+V  |
+  close(uvmem device)   |
+|  |
+  now the setup is done|
+  work on the uvmem area|
+|  |
+V  V
+  access uvmem area (poll and) read(uvmem)
+|  |
+V  V
+  page fault --> read system call returns
+  block  page offsets
+   |
+   V
+create page contents
+(usually pull the page
+ from remote)
+write the page contents
+to the shmem which was
+m

[PATCH v4 0/2] postcopy migration: uvmem: Linux char device for postcopy

2012-10-30 Thread Isaku Yamahata
This is a Linux kernel driver for qemu/kvm postcopy live migration.
It is used by the qemu/kvm postcopy live migration patches.

The user process backed memory driver provides the /dev/uvmem device.
This /dev/uvmem device is designed for some sort of distributed shared memory.
A page fault in the area backed by this driver is propagated to another server
process which serves the page contents. Usually the server process fetches
the page contents from the remote machine. Then the faulting process continues.


ioctl UVMEM_INIT: initialize the uvmem device for qemu.
  Returns a file descriptor of a tmpfs file; the serving thread writes
  page contents through this file descriptor.
mmap: The guest VM mmaps this device and uses it as guest RAM. A page fault
  on this area is propagated to the serving process.
read: returns the page offsets that the guest VM has faulted on.
write: the serving process notifies the device which pages have been served,
   then the guest VM can resume execution.
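
To make the flow concrete, a minimal page-serving daemon following the
protocol above could look like this (sketch only: struct uvmem_init and
the UVMEM_INIT ioctl number here are hypothetical stand-ins for the
real definitions in include/linux/uvmem.h):

#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

#define PAGE_SZ 4096UL

struct uvmem_init {			/* hypothetical uapi layout */
	uint64_t size;			/* IN:  area size in bytes */
	int32_t  shmem_fd;		/* OUT: backing tmpfs file */
	int32_t  padding;
};
#define UVMEM_INIT _IOWR('u', 0, struct uvmem_init)	/* hypothetical */

int main(void)
{
	struct uvmem_init init = { .size = 64 * 1024 * 1024 };
	uint64_t pgoff;
	char *backing;
	int uvmem;

	uvmem = open("/dev/uvmem", O_RDWR);
	if (uvmem < 0 || ioctl(uvmem, UVMEM_INIT, &init) < 0)
		return 1;

	/* The server maps the tmpfs backing store, not the uvmem area. */
	backing = mmap(NULL, init.size, PROT_READ | PROT_WRITE,
		       MAP_SHARED, init.shmem_fd, 0);
	if (backing == MAP_FAILED)
		return 1;

	/* read(): the kernel reports faulting page offsets (in page units). */
	while (read(uvmem, &pgoff, sizeof(pgoff)) == sizeof(pgoff)) {
		/* fetch the page (here: just zero-fill) into the backing file */
		memset(backing + pgoff * PAGE_SZ, 0, PAGE_SZ);
		/* write(): tell the kernel this page is ready; the faulting
		 * guest VM resumes. */
		if (write(uvmem, &pgoff, sizeof(pgoff)) != sizeof(pgoff))
			break;
	}
	return 0;
}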
---
Changes v3 -> v4:
- rename module name: umem -> uvmem
  avoid module name conflict

Changes v2 -> v3:
- make fault handler killable
- make use of read()/write()
- documentation

Changes version 1 -> 2:
- make ioctl structures padded to align
- un-KVM
  KVM_VMEM -> UMEM
- dropped some ioctl commands as Avi requested

Isaku Yamahata (2):
  export necessary symbols
  umem: chardevice for kvm postcopy

 Documentation/misc-devices/uvmem.txt |  292 
 drivers/char/Kconfig |   10 +
 drivers/char/Makefile|1 +
 drivers/char/uvmem.c |  841 ++
 include/linux/uvmem.h|   41 ++
 mm/memcontrol.c  |1 +
 mm/mempolicy.c   |1 +
 mm/shmem.c   |1 +
 8 files changed, 1188 insertions(+)
 create mode 100644 Documentation/misc-devices/uvmem.txt
 create mode 100644 drivers/char/uvmem.c
 create mode 100644 include/linux/uvmem.h

--
1.7.10.4



[PATCH v4 1/2] export necessary symbols

2012-10-30 Thread Isaku Yamahata
Cc: Andrea Arcangeli 
Cc: Avi Kivity 
Cc: Paolo Bonzini 
Signed-off-by: Isaku Yamahata 
---
 mm/memcontrol.c |1 +
 mm/mempolicy.c  |1 +
 mm/shmem.c  |1 +
 3 files changed, 3 insertions(+)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7acf43b..bc9fd53 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2787,6 +2787,7 @@ int mem_cgroup_newpage_charge(struct page *page,
return mem_cgroup_charge_common(page, mm, gfp_mask,
MEM_CGROUP_CHARGE_TYPE_ANON);
 }
+EXPORT_SYMBOL_GPL(mem_cgroup_cache_charge);
 
 /*
  * While swap-in, try_charge -> commit or cancel, the page is locked.
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d04a8a5..3df6cf5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1947,6 +1947,7 @@ retry_cpuset:
goto retry_cpuset;
return page;
 }
+EXPORT_SYMBOL_GPL(alloc_pages_vma);
 
 /**
  * alloc_pages_current - Allocate pages.
diff --git a/mm/shmem.c b/mm/shmem.c
index 67afba5..41eaefd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2840,6 +2840,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
vma->vm_ops = &shmem_vm_ops;
return 0;
 }
+EXPORT_SYMBOL_GPL(shmem_zero_setup);
 
 /**
  * shmem_read_mapping_page_gfp - read into page cache, using specified page 
allocation flags.
-- 
1.7.10.4



[PATCH 09/11] xen: make grant table arch portable.

2008-02-21 Thread Isaku Yamahata
Split out the x86 specific part from grant-table.c.

Signed-off-by: Isaku Yamahata <[EMAIL PROTECTED]>
---
 arch/x86/xen/Makefile  |2 +-
 arch/x86/xen/grant-table.c |   91 
 drivers/xen/grant-table.c  |   35 +---
 include/xen/grant_table.h  |6 +++
 4 files changed, 101 insertions(+), 33 deletions(-)
 create mode 100644 arch/x86/xen/grant-table.c

diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 95c5926..3d8df98 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
 obj-y  := enlighten.o setup.o multicalls.o mmu.o \
-   time.o manage.o xen-asm.o
+   time.o manage.o xen-asm.o grant-table.o
 
 obj-$(CONFIG_SMP)  += smp.o
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
new file mode 100644
index 000..49ba9b5
--- /dev/null
+++ b/arch/x86/xen/grant-table.c
@@ -0,0 +1,91 @@
+/**
+ * grant_table.c
+ * x86 specific part
+ *
+ * Granting foreign access to our memory reservation.
+ *
+ * Copyright (c) 2005-2006, Christopher Clark
+ * Copyright (c) 2004-2005, K A Fraser
+ * Copyright (c) 2008 Isaku Yamahata 
+ *VA Linux Systems Japan. Split out x86 specific part.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+#include 
+
+static int map_pte_fn(pte_t *pte, struct page *pmd_page,
+ unsigned long addr, void *data)
+{
+   unsigned long **frames = (unsigned long **)data;
+
+   set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
+   (*frames)++;
+   return 0;
+}
+
+static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
+   unsigned long addr, void *data)
+{
+
+   set_pte_at(&init_mm, addr, pte, __pte(0));
+   return 0;
+}
+
+int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
+  unsigned long max_nr_gframes,
+  struct grant_entry **__shared)
+{
+   int rc;
+   struct grant_entry *shared = *__shared;
+
+   if (shared == NULL) {
+   struct vm_struct *area =
+   xen_alloc_vm_area(PAGE_SIZE * max_nr_gframes);
+   BUG_ON(area == NULL);
+   shared = area->addr;
+   *__shared = shared;
+   }
+
+   rc = apply_to_page_range(&init_mm, (unsigned long)shared,
+PAGE_SIZE * nr_gframes,
+map_pte_fn, &frames);
+   return rc;
+}
+
+void arch_gnttab_unmap_shared(struct grant_entry *shared,
+ unsigned long nr_gframes)
+{
+   apply_to_page_range(&init_mm, (unsigned long)shared,
+   PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
+}
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 9fcde20..22f5104 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -435,24 +435,6 @@ static inline unsigned int max_nr_grant_frames(void)
return xen_max;
 }
 
-static int map_pte_fn(pte_t *pte, struct page *pmd_page,
- unsigned long addr, void *data)
-{
-   unsigned long **frames = (unsigned long **)data;
-
-   set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
-   (*frames)++;
-   return 0;
-}
-
-static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
-

[PATCH 07/11] xen: make include/xen/page.h portable moving those definitions under asm dir.

2008-02-21 Thread Isaku Yamahata
The definitions currently in include/xen/page.h are arch specific, and
ia64/xen wants to define its own version. So move them under the arch
specific directory, and keep include/xen/page.h so as not to break
compilation.

Signed-off-by: Isaku Yamahata <[EMAIL PROTECTED]>
---
 include/{ => asm-x86}/xen/page.h |0 
 include/xen/page.h   |  181 +-
 2 files changed, 1 insertions(+), 180 deletions(-)
 copy include/{ => asm-x86}/xen/page.h (100%)

diff --git a/include/xen/page.h b/include/asm-x86/xen/page.h
similarity index 100%
copy from include/xen/page.h
copy to include/asm-x86/xen/page.h
diff --git a/include/xen/page.h b/include/xen/page.h
index 031ef22..eaf85fa 100644
--- a/include/xen/page.h
+++ b/include/xen/page.h
@@ -1,180 +1 @@
-#ifndef __XEN_PAGE_H
-#define __XEN_PAGE_H
-
-#include 
-
-#include 
-#include 
-
-#include 
-
-#ifdef CONFIG_X86_PAE
-/* Xen machine address */
-typedef struct xmaddr {
-   unsigned long long maddr;
-} xmaddr_t;
-
-/* Xen pseudo-physical address */
-typedef struct xpaddr {
-   unsigned long long paddr;
-} xpaddr_t;
-#else
-/* Xen machine address */
-typedef struct xmaddr {
-   unsigned long maddr;
-} xmaddr_t;
-
-/* Xen pseudo-physical address */
-typedef struct xpaddr {
-   unsigned long paddr;
-} xpaddr_t;
-#endif
-
-#define XMADDR(x)  ((xmaddr_t) { .maddr = (x) })
-#define XPADDR(x)  ((xpaddr_t) { .paddr = (x) })
-
-/* MACHINE <-> PHYSICAL CONVERSION MACROS */
-#define INVALID_P2M_ENTRY  (~0UL)
-#define FOREIGN_FRAME_BIT  (1UL<<31)
-#define FOREIGN_FRAME(m)   ((m) | FOREIGN_FRAME_BIT)
-
-extern unsigned long *phys_to_machine_mapping;
-
-static inline unsigned long pfn_to_mfn(unsigned long pfn)
-{
-   if (xen_feature(XENFEAT_auto_translated_physmap))
-   return pfn;
-
-   return phys_to_machine_mapping[(unsigned int)(pfn)] &
-   ~FOREIGN_FRAME_BIT;
-}
-
-static inline int phys_to_machine_mapping_valid(unsigned long pfn)
-{
-   if (xen_feature(XENFEAT_auto_translated_physmap))
-   return 1;
-
-   return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
-}
-
-static inline unsigned long mfn_to_pfn(unsigned long mfn)
-{
-   unsigned long pfn;
-
-   if (xen_feature(XENFEAT_auto_translated_physmap))
-   return mfn;
-
-#if 0
-   if (unlikely((mfn >> machine_to_phys_order) != 0))
-   return max_mapnr;
-#endif
-
-   pfn = 0;
-   /*
-* The array access can fail (e.g., device space beyond end of RAM).
-* In such cases it doesn't matter what we return (we return garbage),
-* but we must handle the fault without crashing!
-*/
-   __get_user(pfn, &machine_to_phys_mapping[mfn]);
-
-   return pfn;
-}
-
-static inline xmaddr_t phys_to_machine(xpaddr_t phys)
-{
-   unsigned offset = phys.paddr & ~PAGE_MASK;
-   return XMADDR(PFN_PHYS((u64)pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
-}
-
-static inline xpaddr_t machine_to_phys(xmaddr_t machine)
-{
-   unsigned offset = machine.maddr & ~PAGE_MASK;
-   return XPADDR(PFN_PHYS((u64)mfn_to_pfn(PFN_DOWN(machine.maddr))) | 
offset);
-}
-
-/*
- * We detect special mappings in one of two ways:
- *  1. If the MFN is an I/O page then Xen will set the m2p entry
- * to be outside our maximum possible pseudophys range.
- *  2. If the MFN belongs to a different domain then we will certainly
- * not have MFN in our p2m table. Conversely, if the page is ours,
- * then we'll have p2m(m2p(MFN))==MFN.
- * If we detect a special mapping then it doesn't have a 'struct page'.
- * We force !pfn_valid() by returning an out-of-range pointer.
- *
- * NB. These checks require that, for any MFN that is not in our reservation,
- * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
- * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN.
- * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
- *
- * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
- *  use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
- *  require. In all the cases we care about, the FOREIGN_FRAME bit is
- *  masked (e.g., pfn_to_mfn()) so behaviour there is correct.
- */
-static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
-{
-   extern unsigned long max_mapnr;
-   unsigned long pfn = mfn_to_pfn(mfn);
-   if ((pfn < max_mapnr)
-   && !xen_feature(XENFEAT_auto_translated_physmap)
-   && (phys_to_machine_mapping[pfn] != mfn))
-   return max_mapnr; /* force !pfn_valid() */
-   return pfn;
-}
-
-static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
-   if (xen_feature(XENFEAT_auto_translated_physmap)) {
-   BUG_ON(pfn != mfn && mfn != INVALID_P2M_E

[PATCH 11/11] xen: import arch generic part of xencomm.

2008-02-21 Thread Isaku Yamahata
On xen/ia64 and xen/powerpc hypercall arguments are passed by pseudo
physical address (guest physical address) so that it's necessary to
convert from virtual address into pseudo physical address. The frame
work is called xencomm.
Import arch generic part of xencomm.
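
The descriptor arithmetic is the interesting part: a virtually
contiguous buffer is handed to the hypervisor as a list of per-page
guest-physical addresses. The page-count computation below mirrors the
one in xencomm_alloc() further down, pulled out into a tiny standalone
program just to show how many entries a buffer that straddles page
boundaries needs:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* Same computation as xencomm_alloc(): entries needed to cover
 * [buffer, buffer + bytes). */
static unsigned long xencomm_nr_addrs(unsigned long buffer, unsigned long bytes)
{
	unsigned long start = buffer & PAGE_MASK;
	unsigned long end = (buffer + bytes) | ~PAGE_MASK;

	return (end - start + 1) >> PAGE_SHIFT;
}

int main(void)
{
	/* A 5000-byte buffer starting 100 bytes before a page boundary
	 * spans three pages, so three per-page addresses are recorded. */
	printf("%lu entries\n", xencomm_nr_addrs(0x10000 - 100, 5000));
	return 0;
}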

Signed-off-by: Isaku Yamahata <[EMAIL PROTECTED]>
---
 drivers/xen/Makefile|1 +
 drivers/xen/xencomm.c   |  232 +++
 include/xen/interface/xencomm.h |   41 +++
 include/xen/xencomm.h   |   77 +
 4 files changed, 351 insertions(+), 0 deletions(-)
 create mode 100644 drivers/xen/xencomm.c
 create mode 100644 include/xen/interface/xencomm.h
 create mode 100644 include/xen/xencomm.h

diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 823ce78..43f014c 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,2 +1,3 @@
 obj-y  += grant-table.o features.o events.o
 obj-y  += xenbus/
+obj-$(CONFIG_XEN_XENCOMM)  += xencomm.o
diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c
new file mode 100644
index 000..797cb4e
--- /dev/null
+++ b/drivers/xen/xencomm.c
@@ -0,0 +1,232 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * Copyright (C) IBM Corp. 2006
+ *
+ * Authors: Hollis Blanchard <[EMAIL PROTECTED]>
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#ifdef __ia64__
+#include/* for is_kern_addr() */
+#endif
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include 
+#endif
+
+static int xencomm_init(struct xencomm_desc *desc,
+   void *buffer, unsigned long bytes)
+{
+   unsigned long recorded = 0;
+   int i = 0;
+
+   while ((recorded < bytes) && (i < desc->nr_addrs)) {
+   unsigned long vaddr = (unsigned long)buffer + recorded;
+   unsigned long paddr;
+   int offset;
+   int chunksz;
+
+   offset = vaddr % PAGE_SIZE; /* handle partial pages */
+   chunksz = min(PAGE_SIZE - offset, bytes - recorded);
+
+   paddr = xencomm_vtop(vaddr);
+   if (paddr == ~0UL) {
+   printk(KERN_DEBUG "%s: couldn't translate vaddr %lx\n",
+  __func__, vaddr);
+   return -EINVAL;
+   }
+
+   desc->address[i++] = paddr;
+   recorded += chunksz;
+   }
+
+   if (recorded < bytes) {
+   printk(KERN_DEBUG
+  "%s: could only translate %ld of %ld bytes\n",
+  __func__, recorded, bytes);
+   return -ENOSPC;
+   }
+
+   /* mark remaining addresses invalid (just for safety) */
+   while (i < desc->nr_addrs)
+   desc->address[i++] = XENCOMM_INVALID;
+
+   desc->magic = XENCOMM_MAGIC;
+
+   return 0;
+}
+
+static struct xencomm_desc *xencomm_alloc(gfp_t gfp_mask,
+ void *buffer, unsigned long bytes)
+{
+   struct xencomm_desc *desc;
+   unsigned long buffer_ulong = (unsigned long)buffer;
+   unsigned long start = buffer_ulong & PAGE_MASK;
+   unsigned long end = (buffer_ulong + bytes) | ~PAGE_MASK;
+   unsigned long nr_addrs = (end - start + 1) >> PAGE_SHIFT;
+   unsigned long size = sizeof(*desc) +
+   sizeof(desc->address[0]) * nr_addrs;
+
+   /*
+* slab allocator returns at least sizeof(void*) aligned pointer.
+* When sizeof(*desc) > sizeof(void*), struct xencomm_desc might
+* cross page boundary.
+*/
+   if (sizeof(*desc) > sizeof(void *)) {
+   unsigned long order = get_order(size);
+   desc = (struct xencomm_desc *)__get_free_pages(gfp_mask,
+  order);
+   if (desc == NULL)
+   return NULL;
+
+   desc->nr_addrs =
+   ((PAGE_SIZE << order) - sizeof(struct xencomm_desc)) /
+   sizeof(*desc->address);
+   } else {
+   desc = kmalloc(size, gfp_mask);
+   if (desc == NULL)
+   return NULL;
+
+   d

[PATCH 10/11] xen: import include/xen/interface/callback.h which ia64/xen needs.

2008-02-21 Thread Isaku Yamahata

Signed-off-by: Isaku Yamahata <[EMAIL PROTECTED]>
---
 include/xen/interface/callback.h |  119 ++
 1 files changed, 119 insertions(+), 0 deletions(-)
 create mode 100644 include/xen/interface/callback.h

diff --git a/include/xen/interface/callback.h b/include/xen/interface/callback.h
new file mode 100644
index 000..04c8b5d
--- /dev/null
+++ b/include/xen/interface/callback.h
@@ -0,0 +1,119 @@
+/**
+ * callback.h
+ *
+ * Register guest OS callbacks with Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2006, Ian Campbell
+ */
+
+#ifndef __XEN_PUBLIC_CALLBACK_H__
+#define __XEN_PUBLIC_CALLBACK_H__
+
+#include "xen.h"
+
+/*
+ * Prototype for this hypercall is:
+ *   long callback_op(int cmd, void *extra_args)
+ * @cmd== CALLBACKOP_??? (callback operation).
+ * @extra_args == Operation-specific extra arguments (NULL if none).
+ */
+
+/* ia64, x86: Callback for event delivery. */
+#define CALLBACKTYPE_event 0
+
+/* x86: Failsafe callback when guest state cannot be restored by Xen. */
+#define CALLBACKTYPE_failsafe  1
+
+/* x86/64 hypervisor: Syscall by 64-bit guest app ('64-on-64-on-64'). */
+#define CALLBACKTYPE_syscall   2
+
+/*
+ * x86/32 hypervisor: Only available on x86/32 when supervisor_mode_kernel
+ * feature is enabled. Do not use this callback type in new code.
+ */
+#define CALLBACKTYPE_sysenter_deprecated   3
+
+/* x86: Callback for NMI delivery. */
+#define CALLBACKTYPE_nmi   4
+
+/*
+ * x86: sysenter is only available as follows:
+ * - 32-bit hypervisor: with the supervisor_mode_kernel feature enabled
+ * - 64-bit hypervisor: 32-bit guest applications on Intel CPUs
+ *  ('32-on-32-on-64', '32-on-64-on-64')
+ *  [nb. also 64-bit guest applications on Intel CPUs
+ *   ('64-on-64-on-64'), but syscall is preferred]
+ */
+#define CALLBACKTYPE_sysenter  5
+
+/*
+ * x86/64 hypervisor: Syscall by 32-bit guest app on AMD CPUs
+ *('32-on-32-on-64', '32-on-64-on-64')
+ */
+#define CALLBACKTYPE_syscall32 7
+
+/*
+ * Disable event deliver during callback? This flag is ignored for event and
+ * NMI callbacks: event delivery is unconditionally disabled.
+ */
+#define _CALLBACKF_mask_events 0
+#define CALLBACKF_mask_events  (1U << _CALLBACKF_mask_events)
+
+/*
+ * Register a callback.
+ */
+#define CALLBACKOP_register0
+struct callback_register {
+uint16_t type;
+uint16_t flags;
+xen_callback_t address;
+};
+DEFINE_GUEST_HANDLE_STRUCT(callback_register);
+
+/*
+ * Unregister a callback.
+ *
+ * Not all callbacks can be unregistered. -EINVAL will be returned if
+ * you attempt to unregister such a callback.
+ */
+#define CALLBACKOP_unregister  1
+struct callback_unregister {
+uint16_t type;
+uint16_t _unused;
+};
+DEFINE_GUEST_HANDLE_STRUCT(callback_unregister);
+
+#if __XEN_INTERFACE_VERSION__ < 0x00030207
+#undef CALLBACKTYPE_sysenter
+#define CALLBACKTYPE_sysenter CALLBACKTYPE_sysenter_deprecated
+#endif
+
+#endif /* __XEN_PUBLIC_CALLBACK_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
-- 
1.5.3



[PATCH 08/11] xen: replace callers of alloc_vm_area()/free_vm_area() with xen_ prefixed one.

2008-02-21 Thread Isaku Yamahata
Don't use alloc_vm_area()/free_vm_area() directly; instead define
xen_alloc_vm_area()/xen_free_vm_area() and use them.

alloc_vm_area()/free_vm_area() are used to allocate/free the area used
for the grant table mapping. The Xen/x86 grant table is based on virtual
addresses, so alloc_vm_area()/free_vm_area() are suitable for it.
On the other hand, the Xen/ia64 (and Xen/powerpc) grant table is based on
pseudo physical addresses (guest physical addresses), so the allocation
should be done differently.
The original xenified Linux/IA64 tree had its own
alloc_vm_area()/free_vm_area() definitions which, contradicting their
names, did not allocate a vm area.
Now that vanilla Linux already has these definitions, IA64 can no longer
provide its own alloc_vm_area()/free_vm_area().
Instead introduce xen_alloc_vm_area()/xen_free_vm_area() and use them.
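
As an illustration of why the indirection helps (purely hypothetical
code, not part of this series): an arch whose grant table works on
pseudo physical addresses could satisfy the same interface with real
pages instead of a mapped vmalloc area, roughly like this:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Hypothetical arch-specific xen_alloc_vm_area(): hand back physically
 * allocated pages wrapped in a struct vm_struct, since the grant table
 * is registered by (pseudo) physical address on such an arch. */
static struct vm_struct *xen_pp_alloc_vm_area(unsigned long size)
{
	struct vm_struct *area = kzalloc(sizeof(*area), GFP_KERNEL);

	if (!area)
		return NULL;
	area->addr = (void *)__get_free_pages(GFP_KERNEL, get_order(size));
	if (!area->addr) {
		kfree(area);
		return NULL;
	}
	area->size = size;
	return area;
}

static void xen_pp_free_vm_area(struct vm_struct *area)
{
	free_pages((unsigned long)area->addr, get_order(area->size));
	kfree(area);
}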

Signed-off-by: Isaku Yamahata <[EMAIL PROTECTED]>
---
 drivers/xen/grant-table.c  |2 +-
 drivers/xen/xenbus/xenbus_client.c |6 +++---
 include/asm-x86/xen/hypervisor.h   |3 +++
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 95016fd..9fcde20 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -478,7 +478,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int 
end_idx)
 
if (shared == NULL) {
struct vm_struct *area;
-   area = alloc_vm_area(PAGE_SIZE * max_nr_grant_frames());
+   area = xen_alloc_vm_area(PAGE_SIZE * max_nr_grant_frames());
BUG_ON(area == NULL);
shared = area->addr;
}
diff --git a/drivers/xen/xenbus/xenbus_client.c 
b/drivers/xen/xenbus/xenbus_client.c
index 9fd2f70..0f86b0f 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -399,7 +399,7 @@ int xenbus_map_ring_valloc(struct xenbus_device *dev, int 
gnt_ref, void **vaddr)
 
*vaddr = NULL;
 
-   area = alloc_vm_area(PAGE_SIZE);
+   area = xen_alloc_vm_area(PAGE_SIZE);
if (!area)
return -ENOMEM;
 
@@ -409,7 +409,7 @@ int xenbus_map_ring_valloc(struct xenbus_device *dev, int 
gnt_ref, void **vaddr)
BUG();
 
if (op.status != GNTST_okay) {
-   free_vm_area(area);
+   xen_free_vm_area(area);
xenbus_dev_fatal(dev, op.status,
 "mapping in shared page %d from domain %d",
 gnt_ref, dev->otherend_id);
@@ -508,7 +508,7 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void 
*vaddr)
BUG();
 
if (op.status == GNTST_okay)
-   free_vm_area(area);
+   xen_free_vm_area(area);
else
xenbus_dev_error(dev, op.status,
 "unmapping page at handle %d error %d",
diff --git a/include/asm-x86/xen/hypervisor.h b/include/asm-x86/xen/hypervisor.h
index 138ee8a..31836ad 100644
--- a/include/asm-x86/xen/hypervisor.h
+++ b/include/asm-x86/xen/hypervisor.h
@@ -57,6 +57,9 @@ extern struct shared_info *HYPERVISOR_shared_info;
 extern struct start_info *xen_start_info;
 #define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
 
+#define xen_alloc_vm_area(size)alloc_vm_area(size)
+#define xen_free_vm_area(area) free_vm_area(area)
+
 /* arch/i386/mach-xen/evtchn.c */
 /* Force a proper event-channel callback from Xen. */
 extern void force_evtchn_callback(void);
-- 
1.5.3



[PATCH 06/11] xen: move arch/x86/xen/events.c undedr drivers/xen and split out arch specific part.

2008-02-21 Thread Isaku Yamahata
ia64/xen also uses events.c, so clean it up so that ia64/xen can use it.
Make ipi_to_irq globally visible; ia64/xen needs to reference it from
another file.
Introduce resend_irq_on_evtchn(), which ia64 needs.
Introduce xen_do_IRQ() to split out the arch specific code.

Signed-off-by: Isaku Yamahata <[EMAIL PROTECTED]>
---
 arch/x86/xen/Makefile  |2 +-
 drivers/xen/Makefile   |2 +-
 {arch/x86 => drivers}/xen/events.c |   34 ++
 include/asm-x86/xen/hypervisor.h   |7 +++
 include/xen/events.h   |1 +
 5 files changed, 36 insertions(+), 10 deletions(-)
 rename {arch/x86 => drivers}/xen/events.c (95%)

diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index c5e9aa4..95c5926 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
 obj-y  := enlighten.o setup.o multicalls.o mmu.o \
-   events.o time.o manage.o xen-asm.o
+   time.o manage.o xen-asm.o
 
 obj-$(CONFIG_SMP)  += smp.o
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 609fdda..823ce78 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,2 +1,2 @@
-obj-y  += grant-table.o features.o
+obj-y  += grant-table.o features.o events.o
 obj-y  += xenbus/
diff --git a/arch/x86/xen/events.c b/drivers/xen/events.c
similarity index 95%
rename from arch/x86/xen/events.c
rename to drivers/xen/events.c
index dcf613e..7474739 100644
--- a/arch/x86/xen/events.c
+++ b/drivers/xen/events.c
@@ -37,7 +37,9 @@
 #include 
 #include 
 
-#include "xen-ops.h"
+#ifdef CONFIG_X86
+# include "../arch/x86/xen/xen-ops.h"
+#endif
 
 /*
  * This lock protects updates to the following mapping and reference-count
@@ -49,7 +51,7 @@ static DEFINE_SPINLOCK(irq_mapping_update_lock);
 static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
 
 /* IRQ <-> IPI mapping */
-static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = 
-1};
+DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
 
 /* Packed IRQ information: binding type, sub-type index, and event channel. */
 struct packed_irq
@@ -455,7 +457,6 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector 
vector)
notify_remote_via_irq(irq);
 }
 
-
 /*
  * Search the CPUs pending events bitmasks.  For each one found, map
  * the event number to an irq, and feed it into do_IRQ() for
@@ -474,7 +475,10 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
 
vcpu_info->evtchn_upcall_pending = 0;
 
-   /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
+#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
+   /* Clear master flag /before/ clearing selector flag. */
+   rmb();
+#endif
pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
while (pending_words != 0) {
unsigned long pending_bits;
@@ -486,10 +490,8 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
int port = (word_idx * BITS_PER_LONG) + bit_idx;
int irq = evtchn_to_irq[port];
 
-   if (irq != -1) {
-   regs->orig_ax = ~irq;
-   do_IRQ(regs);
-   }
+   if (irq != -1)
+   xen_do_IRQ(irq, regs);
}
}
 
@@ -525,6 +527,22 @@ static void set_affinity_irq(unsigned irq, cpumask_t dest)
rebind_irq_to_cpu(irq, tcpu);
 }
 
+int resend_irq_on_evtchn(unsigned int irq)
+{
+   int masked, evtchn = evtchn_from_irq(irq);
+   struct shared_info *s = HYPERVISOR_shared_info;
+
+   if (!VALID_EVTCHN(evtchn))
+   return 1;
+
+   masked = sync_test_and_set_bit(evtchn, s->evtchn_mask);
+   sync_set_bit(evtchn, s->evtchn_pending);
+   if (!masked)
+   unmask_evtchn(evtchn);
+
+   return 1;
+}
+
 static void enable_dynirq(unsigned int irq)
 {
int evtchn = evtchn_from_irq(irq);
diff --git a/include/asm-x86/xen/hypervisor.h b/include/asm-x86/xen/hypervisor.h
index 8e15dd2..138ee8a 100644
--- a/include/asm-x86/xen/hypervisor.h
+++ b/include/asm-x86/xen/hypervisor.h
@@ -61,6 +61,13 @@ extern struct start_info *xen_start_info;
 /* Force a proper event-channel callback from Xen. */
 extern void force_evtchn_callback(void);
 
+/* macro to avoid header inclusion dependncy hell */
+#define xen_do_IRQ(irq, regs)  \
+   do {\
+   (regs)->orig_ax = ~(irq);   \
+   do_IRQ(regs);   \
+   } while (0)
+
 /* Turn jiffies into Xen system time. */
 u64 jiffies_to_st(unsigned long jiffies);
 
diff --git a/include/xen/events.h b/include/xen/events.h
index 2bde54d..574cfa4 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -37,6 +3

[PATCH 05/11] xen: move features.c from arch/x86/xen/features.c to drivers/xen.

2008-02-21 Thread Isaku Yamahata
ia64/xen uses it too, so move it into a common place.

Signed-off-by: Isaku Yamahata <[EMAIL PROTECTED]>
---
 arch/x86/xen/Makefile|2 +-
 drivers/xen/Makefile |2 +-
 {arch/x86 => drivers}/xen/features.c |0 
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename {arch/x86 => drivers}/xen/features.c (100%)

diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 343df24..c5e9aa4 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
-obj-y  := enlighten.o setup.o features.o multicalls.o mmu.o \
+obj-y  := enlighten.o setup.o multicalls.o mmu.o \
events.o time.o manage.o xen-asm.o
 
 obj-$(CONFIG_SMP)  += smp.o
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 56592f0..609fdda 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,2 +1,2 @@
-obj-y  += grant-table.o
+obj-y  += grant-table.o features.o
 obj-y  += xenbus/
diff --git a/arch/x86/xen/features.c b/drivers/xen/features.c
similarity index 100%
rename from arch/x86/xen/features.c
rename to drivers/xen/features.c
-- 
1.5.3



[PATCH 03/11] xen: add missing definitions for xen grant table which ia64/xen needs.

2008-02-21 Thread Isaku Yamahata

Signed-off-by: Isaku Yamahata <[EMAIL PROTECTED]>
---
 drivers/xen/grant-table.c   |2 +-
 include/asm-x86/xen/interface.h |   24 
 include/xen/interface/grant_table.h |   11 ---
 3 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index ea94dba..95016fd 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -466,7 +466,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int 
end_idx)
 
setup.dom= DOMID_SELF;
setup.nr_frames  = nr_gframes;
-   setup.frame_list = frames;
+   set_xen_guest_handle(setup.frame_list, frames);
 
rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
if (rc == -ENOSYS) {
diff --git a/include/asm-x86/xen/interface.h b/include/asm-x86/xen/interface.h
index 165c396..49993dd 100644
--- a/include/asm-x86/xen/interface.h
+++ b/include/asm-x86/xen/interface.h
@@ -22,6 +22,30 @@
 #define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
 #define GUEST_HANDLE(name)__guest_handle_ ## name
 
+#ifdef __XEN__
+#if defined(__i386__)
+#define set_xen_guest_handle(hnd, val) \
+   do {\
+   if (sizeof(hnd) == 8)   \
+   *(uint64_t *)&(hnd) = 0;\
+   (hnd).p = val;  \
+   } while (0)
+#elif defined(__x86_64__)
+#define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0)
+#endif
+#else
+#if defined(__i386__)
+#define set_xen_guest_handle(hnd, val) \
+   do {\
+   if (sizeof(hnd) == 8)   \
+   *(uint64_t *)&(hnd) = 0;\
+   (hnd) = val;\
+   } while (0)
+#elif defined(__x86_64__)
+#define set_xen_guest_handle(hnd, val) do { (hnd) = val; } while (0)
+#endif
+#endif
+
 #ifndef __ASSEMBLY__
 /* Guest handles for primitive C types. */
 __DEFINE_GUEST_HANDLE(uchar, unsigned char);
diff --git a/include/xen/interface/grant_table.h 
b/include/xen/interface/grant_table.h
index 2190498..39da93c 100644
--- a/include/xen/interface/grant_table.h
+++ b/include/xen/interface/grant_table.h
@@ -185,6 +185,7 @@ struct gnttab_map_grant_ref {
 grant_handle_t handle;
 uint64_t dev_bus_addr;
 };
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_map_grant_ref);
 
 /*
  * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings
@@ -206,6 +207,7 @@ struct gnttab_unmap_grant_ref {
 /* OUT parameters. */
 int16_t  status;  /* GNTST_* */
 };
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_grant_ref);
 
 /*
  * GNTTABOP_setup_table: Set up a grant table for  comprising at least
@@ -223,8 +225,9 @@ struct gnttab_setup_table {
 uint32_t nr_frames;
 /* OUT parameters. */
 int16_t  status;  /* GNTST_* */
-ulong *frame_list;
+GUEST_HANDLE(ulong) frame_list;
 };
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_setup_table);
 
 /*
  * GNTTABOP_dump_table: Dump the contents of the grant table to the
@@ -237,6 +240,7 @@ struct gnttab_dump_table {
 /* OUT parameters. */
 int16_t status;   /* GNTST_* */
 };
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_dump_table);
 
 /*
  * GNTTABOP_transfer_grant_ref: Transfer  to a foreign domain. The
@@ -255,7 +259,7 @@ struct gnttab_transfer {
 /* OUT parameters. */
 int16_t   status;
 };
-
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_transfer);
 
 /*
  * GNTTABOP_copy: Hypervisor based copy
@@ -296,6 +300,7 @@ struct gnttab_copy {
/* OUT parameters. */
int16_t   status;
 };
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_copy);
 
 /*
  * GNTTABOP_query_size: Query the current and maximum sizes of the shared
@@ -313,7 +318,7 @@ struct gnttab_query_size {
 uint32_t max_nr_frames;
 int16_t  status;  /* GNTST_* */
 };
-
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_query_size);
 
 /*
  * Bitfield values for update_pin_status.flags.
-- 
1.5.3



[PATCH 02/11] xen: add missing VIRQ_ARCH_[0-7] definitions which ia64/xen needs.

2008-02-21 Thread Isaku Yamahata

Signed-off-by: Isaku Yamahata <[EMAIL PROTECTED]>
---
 include/xen/interface/xen.h |   12 +++-
 1 files changed, 11 insertions(+), 1 deletions(-)

diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 87ad143..9b018da 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -78,8 +78,18 @@
 #define VIRQ_CONSOLE2  /* (DOM0) Bytes received on emergency console. */
 #define VIRQ_DOM_EXC3  /* (DOM0) Exceptional event for some domain.   */
 #define VIRQ_DEBUGGER   6  /* (DOM0) A domain has paused for debugging.   */
-#define NR_VIRQS8
 
+/* Architecture-specific VIRQ definitions. */
+#define VIRQ_ARCH_016
+#define VIRQ_ARCH_117
+#define VIRQ_ARCH_218
+#define VIRQ_ARCH_319
+#define VIRQ_ARCH_420
+#define VIRQ_ARCH_521
+#define VIRQ_ARCH_622
+#define VIRQ_ARCH_723
+
+#define NR_VIRQS   24
 /*
  * MMU-UPDATE REQUESTS
  *
-- 
1.5.3



[PATCH 04/11] xen: add missing definitions in include/xen/interface/vcpu.h which ia64/xen needs

2008-02-21 Thread Isaku Yamahata

Signed-off-by: Isaku Yamahata <[EMAIL PROTECTED]>
---
 include/xen/interface/vcpu.h |5 +
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h
index b05d8a6..87e6f8a 100644
--- a/include/xen/interface/vcpu.h
+++ b/include/xen/interface/vcpu.h
@@ -85,6 +85,7 @@ struct vcpu_runstate_info {
 */
uint64_t time[4];
 };
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_runstate_info);
 
 /* VCPU is currently running on a physical CPU. */
 #define RUNSTATE_running  0
@@ -119,6 +120,7 @@ struct vcpu_runstate_info {
 #define VCPUOP_register_runstate_memory_area 5
 struct vcpu_register_runstate_memory_area {
union {
+   GUEST_HANDLE(vcpu_runstate_info) h;
struct vcpu_runstate_info *v;
uint64_t p;
} addr;
@@ -134,6 +136,7 @@ struct vcpu_register_runstate_memory_area {
 struct vcpu_set_periodic_timer {
uint64_t period_ns;
 };
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_set_periodic_timer);
 
 /*
  * Set or stop a VCPU's single-shot timer. Every VCPU has one single-shot
@@ -145,6 +148,7 @@ struct vcpu_set_singleshot_timer {
uint64_t timeout_abs_ns;
uint32_t flags;/* VCPU_SSHOTTMR_??? */
 };
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_set_singleshot_timer);
 
 /* Flags to VCPUOP_set_singleshot_timer. */
  /* Require the timeout to be in the future (return -ETIME if it's passed). */
@@ -164,5 +168,6 @@ struct vcpu_register_vcpu_info {
 uint32_t offset; /* offset within page */
 uint32_t rsvd;   /* unused */
 };
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info);
 
 #endif /* __XEN_PUBLIC_VCPU_H__ */
-- 
1.5.3


