Vivek Goyal <[email protected]> writes:

> As of now the asynchronous page fault mechanism assumes the host will
> always be successful in resolving the page fault. So there are only two
> states, that is, page is not present and page is ready.
>
> If a page is backed by a file and that file has been truncated (as
> can be the case with virtio-fs), then page fault handler on host returns
> -EFAULT.
>
> As of now async page fault logic does not look at error code (-EFAULT)
> returned by get_user_pages_remote() and returns PAGE_READY to guest.
> Guest tries to access page and page fault happens again. And this
> gets kvm into an infinite loop. (Killing host process gets kvm out of
> this loop though).
>
> This patch adds another state to async page fault logic which allows
> host to return error to guest. Once guest knows that async page fault
> can't be resolved, it can send SIGBUS to the guest process (if user space
> was accessing the page in question).
>
> Signed-off-by: Vivek Goyal <[email protected]>
> ---
>  Documentation/virt/kvm/cpuid.rst     |  4 +++
>  Documentation/virt/kvm/msr.rst       | 10 +++++---
>  arch/x86/include/asm/kvm_host.h      |  3 +++
>  arch/x86/include/asm/kvm_para.h      |  8 +++---
>  arch/x86/include/uapi/asm/kvm_para.h | 10 ++++++--
>  arch/x86/kernel/kvm.c                | 34 ++++++++++++++++++++-----
>  arch/x86/kvm/cpuid.c                 |  3 ++-
>  arch/x86/kvm/mmu/mmu.c               |  2 +-
>  arch/x86/kvm/x86.c                   | 38 ++++++++++++++++++++--------
>  9 files changed, 83 insertions(+), 29 deletions(-)
>
> diff --git a/Documentation/virt/kvm/cpuid.rst 
> b/Documentation/virt/kvm/cpuid.rst
> index a7dff9186bed..a4497f3a6e7f 100644
> --- a/Documentation/virt/kvm/cpuid.rst
> +++ b/Documentation/virt/kvm/cpuid.rst
> @@ -92,6 +92,10 @@ KVM_FEATURE_ASYNC_PF_INT          14          guest checks 
> this feature bit
>                                                async pf acknowledgment msr
>                                                0x4b564d07.
>  
> +KVM_FEATURE_ASYNC_PF_ERROR        15          paravirtualized async PF error
> +                                              can be enabled by setting bit 4
> +                                              when writing to msr 0x4b564d02
> +
>  KVM_FEATURE_CLOCSOURCE_STABLE_BIT 24          host will warn if no guest-side
>                                                per-cpu warps are expeced in
>                                                kvmclock
> diff --git a/Documentation/virt/kvm/msr.rst b/Documentation/virt/kvm/msr.rst
> index e37a14c323d2..513eb8e0b0eb 100644
> --- a/Documentation/virt/kvm/msr.rst
> +++ b/Documentation/virt/kvm/msr.rst
> @@ -202,19 +202,21 @@ data:
>  
>               /* Used for 'page ready' events delivered via interrupt 
> notification */
>               __u32 token;
> -
> -             __u8 pad[56];
> +                __u32 ready_flags;
> +             __u8 pad[52];
>               __u32 enabled;
>         };
>  
> -     Bits 5-4 of the MSR are reserved and should be zero. Bit 0 is set to 1
> +     Bit 5 of the MSR is reserved and should be zero. Bit 0 is set to 1
>       when asynchronous page faults are enabled on the vcpu, 0 when disabled.
>       Bit 1 is 1 if asynchronous page faults can be injected when vcpu is in
>       cpl == 0. Bit 2 is 1 if asynchronous page faults are delivered to L1 as
>       #PF vmexits.  Bit 2 can be set only if KVM_FEATURE_ASYNC_PF_VMEXIT is
>       present in CPUID. Bit 3 enables interrupt based delivery of 'page ready'
>       events. Bit 3 can only be set if KVM_FEATURE_ASYNC_PF_INT is present in
> -     CPUID.
> +     CPUID. Bit 4 enables reporting of page fault errors if hypervisor
> +        encounters errors while faulting in page. Bit 4 can only be set if
> +        KVM_FEATURE_ASYNC_PF_ERROR is present in CPUID.

Would it actually make sense to deliver the exact error from
get_user_pages() and not a binary failure/no-failure? Is the guest
interested in how exactly we failed?

>  
>       'Page not present' events are currently always delivered as synthetic
>       #PF exception. During delivery of these events APF CR2 register contains
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 348a73106556..ff8dbc604dbe 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -779,6 +779,7 @@ struct kvm_vcpu_arch {
>               bool delivery_as_pf_vmexit;
>               bool pageready_pending;
>               gfn_t error_gfn;
> +             bool send_pf_error;
>       } apf;
>  
>       /* OSVW MSRs (AMD only) */
> @@ -1680,6 +1681,8 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
>                              struct kvm_async_pf *work);
>  void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu);
>  bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu);
> +void kvm_arch_async_page_fault_error(struct kvm_vcpu *vcpu,
> +                                  struct kvm_async_pf *work);
>  extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
>  
>  int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
> index bbc43e5411d9..6c738e91ca2c 100644
> --- a/arch/x86/include/asm/kvm_para.h
> +++ b/arch/x86/include/asm/kvm_para.h
> @@ -89,8 +89,8 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned 
> long p1,
>  bool kvm_para_available(void);
>  unsigned int kvm_arch_para_features(void);
>  unsigned int kvm_arch_para_hints(void);
> -void kvm_async_pf_task_wait_schedule(u32 token);
> -void kvm_async_pf_task_wake(u32 token);
> +void kvm_async_pf_task_wait_schedule(u32 token, bool user_mode);
> +void kvm_async_pf_task_wake(u32 token, bool is_err);
>  u32 kvm_read_and_reset_apf_flags(void);
>  void kvm_disable_steal_time(void);
>  bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token);
> @@ -120,8 +120,8 @@ static inline void kvm_spinlock_init(void)
>  #endif /* CONFIG_PARAVIRT_SPINLOCKS */
>  
>  #else /* CONFIG_KVM_GUEST */
> -#define kvm_async_pf_task_wait_schedule(T) do {} while(0)
> -#define kvm_async_pf_task_wake(T) do {} while(0)
> +#define kvm_async_pf_task_wait_schedule(T, U) do {} while(0)
> +#define kvm_async_pf_task_wake(T, I) do {} while(0)
>  
>  static inline bool kvm_para_available(void)
>  {
> diff --git a/arch/x86/include/uapi/asm/kvm_para.h 
> b/arch/x86/include/uapi/asm/kvm_para.h
> index 812e9b4c1114..b43fd2ddc949 100644
> --- a/arch/x86/include/uapi/asm/kvm_para.h
> +++ b/arch/x86/include/uapi/asm/kvm_para.h
> @@ -32,6 +32,7 @@
>  #define KVM_FEATURE_POLL_CONTROL     12
>  #define KVM_FEATURE_PV_SCHED_YIELD   13
>  #define KVM_FEATURE_ASYNC_PF_INT     14
> +#define KVM_FEATURE_ASYNC_PF_ERROR   15
>  
>  #define KVM_HINTS_REALTIME      0
>  
> @@ -85,6 +86,7 @@ struct kvm_clock_pairing {
>  #define KVM_ASYNC_PF_SEND_ALWAYS             (1 << 1)
>  #define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT   (1 << 2)
>  #define KVM_ASYNC_PF_DELIVERY_AS_INT         (1 << 3)
> +#define KVM_ASYNC_PF_SEND_ERROR                      (1 << 4)
>  
>  /* MSR_KVM_ASYNC_PF_INT */
>  #define KVM_ASYNC_PF_VEC_MASK                        GENMASK(7, 0)
> @@ -119,14 +121,18 @@ struct kvm_mmu_op_release_pt {
>  #define KVM_PV_REASON_PAGE_NOT_PRESENT 1
>  #define KVM_PV_REASON_PAGE_READY 2
>  
> +
> +/* Bit flags for ready_flags field */
> +#define KVM_PV_REASON_PAGE_ERROR 1
> +
>  struct kvm_vcpu_pv_apf_data {
>       /* Used for 'page not present' events delivered via #PF */
>       __u32 flags;
>  
>       /* Used for 'page ready' events delivered via interrupt notification */
>       __u32 token;
> -
> -     __u8 pad[56];
> +     __u32 ready_flags;
> +     __u8 pad[52];
>       __u32 enabled;
>  };
>  
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index ff95429d206b..2ee9c9d971ae 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -75,6 +75,7 @@ struct kvm_task_sleep_node {
>       struct swait_queue_head wq;
>       u32 token;
>       int cpu;
> +     bool is_err;
>  };
>  
>  static struct kvm_task_sleep_head {
> @@ -97,7 +98,14 @@ static struct kvm_task_sleep_node *_find_apf_task(struct 
> kvm_task_sleep_head *b,
>       return NULL;
>  }
>  
> -static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
> +static void handle_async_pf_error(bool user_mode)
> +{
> +     if (user_mode)
> +             send_sig_info(SIGBUS, SEND_SIG_PRIV, current);
> +}
> +
> +static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n,
> +                                 bool user_mode)
>  {
>       u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
>       struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
> @@ -107,6 +115,8 @@ static bool kvm_async_pf_queue_task(u32 token, struct 
> kvm_task_sleep_node *n)
>       e = _find_apf_task(b, token);
>       if (e) {
>               /* dummy entry exist -> wake up was delivered ahead of PF */
> +             if (e->is_err)
> +                     handle_async_pf_error(user_mode);
>               hlist_del(&e->link);
>               raw_spin_unlock(&b->lock);
>               kfree(e);
> @@ -128,14 +138,14 @@ static bool kvm_async_pf_queue_task(u32 token, struct 
> kvm_task_sleep_node *n)
>   * Invoked from the async pagefault handling code or from the VM exit page
>   * fault handler. In both cases RCU is watching.
>   */
> -void kvm_async_pf_task_wait_schedule(u32 token)
> +void kvm_async_pf_task_wait_schedule(u32 token, bool user_mode)
>  {
>       struct kvm_task_sleep_node n;
>       DECLARE_SWAITQUEUE(wait);
>  
>       lockdep_assert_irqs_disabled();
>  
> -     if (!kvm_async_pf_queue_task(token, &n))
> +     if (!kvm_async_pf_queue_task(token, &n, user_mode))
>               return;
>  
>       for (;;) {
> @@ -148,6 +158,8 @@ void kvm_async_pf_task_wait_schedule(u32 token)
>               local_irq_disable();
>       }
>       finish_swait(&n.wq, &wait);
> +     if (n.is_err)
> +             handle_async_pf_error(user_mode);
>  }
>  EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
>  
> @@ -177,7 +189,7 @@ static void apf_task_wake_all(void)
>       }
>  }
>  
> -void kvm_async_pf_task_wake(u32 token)
> +void kvm_async_pf_task_wake(u32 token, bool is_err)
>  {
>       u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
>       struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
> @@ -208,9 +220,11 @@ void kvm_async_pf_task_wake(u32 token)
>               }
>               n->token = token;
>               n->cpu = smp_processor_id();
> +             n->is_err = is_err;
>               init_swait_queue_head(&n->wq);
>               hlist_add_head(&n->link, &b->list);
>       } else {
> +             n->is_err = is_err;
>               apf_task_wake_one(n);
>       }
>       raw_spin_unlock(&b->lock);
> @@ -256,14 +270,15 @@ bool __kvm_handle_async_pf(struct pt_regs *regs, u32 
> token)
>               panic("Host injected async #PF in kernel mode\n");
>  
>       /* Page is swapped out by the host. */
> -     kvm_async_pf_task_wait_schedule(token);
> +     kvm_async_pf_task_wait_schedule(token, user_mode(regs));
>       return true;
>  }
>  NOKPROBE_SYMBOL(__kvm_handle_async_pf);
>  
>  __visible void __irq_entry kvm_async_pf_intr(struct pt_regs *regs)
>  {
> -     u32 token;
> +     u32 token, ready_flags;
> +     bool is_err;
>  
>       entering_ack_irq();
>  
> @@ -271,7 +286,9 @@ __visible void __irq_entry kvm_async_pf_intr(struct 
> pt_regs *regs)
>  
>       if (__this_cpu_read(apf_reason.enabled)) {
>               token = __this_cpu_read(apf_reason.token);
> -             kvm_async_pf_task_wake(token);
> +             ready_flags = __this_cpu_read(apf_reason.ready_flags);
> +             is_err = ready_flags & KVM_PV_REASON_PAGE_ERROR;
> +             kvm_async_pf_task_wake(token, is_err);
>               __this_cpu_write(apf_reason.token, 0);
>               wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
>       }
> @@ -335,6 +352,9 @@ static void kvm_guest_cpu_init(void)
>  
>               wrmsrl(MSR_KVM_ASYNC_PF_INT, KVM_ASYNC_PF_VECTOR);
>  
> +             if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_ERROR))
> +                     pa |= KVM_ASYNC_PF_SEND_ERROR;
> +
>               wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
>               __this_cpu_write(apf_reason.enabled, 1);
>               pr_info("KVM setup async PF for cpu %d\n", smp_processor_id());
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 253b8e875ccd..f811f36e0c24 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -712,7 +712,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array 
> *array, u32 function)
>                            (1 << KVM_FEATURE_PV_SEND_IPI) |
>                            (1 << KVM_FEATURE_POLL_CONTROL) |
>                            (1 << KVM_FEATURE_PV_SCHED_YIELD) |
> -                          (1 << KVM_FEATURE_ASYNC_PF_INT);
> +                          (1 << KVM_FEATURE_ASYNC_PF_INT) |
> +                          (1 << KVM_FEATURE_ASYNC_PF_ERROR);
>  
>               if (sched_info_on())
>                       entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index e207900ab303..634182bb07c7 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -4175,7 +4175,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 
> error_code,
>       } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
>               vcpu->arch.apf.host_apf_flags = 0;
>               local_irq_disable();
> -             kvm_async_pf_task_wait_schedule(fault_address);
> +             kvm_async_pf_task_wait_schedule(fault_address, 0);
>               local_irq_enable();
>       } else {
>               WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 4c5205434b9c..c3b2097f2eaa 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2690,8 +2690,8 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu 
> *vcpu, u64 data)
>  {
>       gpa_t gpa = data & ~0x3f;
>  
> -     /* Bits 4:5 are reserved, Should be zero */
> -     if (data & 0x30)
> +     /* Bit 5 is reserved, should be zero */
> +     if (data & 0x20)
>               return 1;
>  
>       vcpu->arch.apf.msr_en_val = data;
> @@ -2703,11 +2703,12 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu 
> *vcpu, u64 data)
>       }
>  
>       if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
> -                                     sizeof(u64)))
> +                                     sizeof(u64) + sizeof(u32)))
>               return 1;
>  
>       vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
>       vcpu->arch.apf.delivery_as_pf_vmexit = data & 
> KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
> +     vcpu->arch.apf.send_pf_error = data & KVM_ASYNC_PF_SEND_ERROR;
>  
>       kvm_async_pf_wakeup_all(vcpu);
>  
> @@ -10481,12 +10482,25 @@ static inline int apf_put_user_notpresent(struct 
> kvm_vcpu *vcpu)
>                                     sizeof(reason));
>  }
>  
> -static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
> +static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token,
> +                                  bool is_err)
>  {
>       unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
> +     int ret;
> +     u32 ready_flags = 0;
> +
> +     if (is_err && vcpu->arch.apf.send_pf_error)
> +             ready_flags = KVM_PV_REASON_PAGE_ERROR;
> +
> +     ret = kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
> +                                         &token, offset, sizeof(token));
> +     if (ret < 0)
> +             return ret;
>  
> +     offset = offsetof(struct kvm_vcpu_pv_apf_data, ready_flags);
>       return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
> -                                          &token, offset, sizeof(token));
> +                                         &ready_flags, offset,
> +                                         sizeof(ready_flags));
>  }
>  
>  static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
> @@ -10571,6 +10585,8 @@ bool kvm_arch_async_page_not_present(struct kvm_vcpu 
> *vcpu,
>  void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
>                                struct kvm_async_pf *work)
>  {
> +     bool present_injected = false;
> +
>       struct kvm_lapic_irq irq = {
>               .delivery_mode = APIC_DM_FIXED,
>               .vector = vcpu->arch.apf.vec
> @@ -10584,16 +10600,18 @@ void kvm_arch_async_page_present(struct kvm_vcpu 
> *vcpu,
>  
>       if ((work->wakeup_all || work->notpresent_injected) &&
>           kvm_pv_async_pf_enabled(vcpu) &&
> -         !apf_put_user_ready(vcpu, work->arch.token)) {
> +         !apf_put_user_ready(vcpu, work->arch.token, work->error_code)) {
>               vcpu->arch.apf.pageready_pending = true;
>               kvm_apic_set_irq(vcpu, &irq, NULL);
> +             present_injected = true;
>       }
>  
> -     if (work->error_code) {
> +     if (work->error_code &&
> +         (!present_injected || !vcpu->arch.apf.send_pf_error)) {
>               /*
> -              * Page fault failed and we don't have a way to report
> -              * errors back to guest yet. So save this pfn and force
> -              * sync page fault next time.
> +              * Page fault failed. If we did not report error back
> +              * to guest, save this pfn and force sync page fault next
> +              * time.
>                */
>               vcpu->arch.apf.error_gfn = work->cr2_or_gpa >> PAGE_SHIFT;
>       }

-- 
Vitaly

Reply via email to