Ensure Intel PT tracing is disabled before VM-Entry in Intel PT Host/Guest
mode.

Intel PT has two modes for tracing virtual machines. The default is System
mode, in which host and guest trace data are written to the host trace
buffer. The other is Host/Guest mode, in which host and guest each write to
their own trace buffers. Host/Guest mode is selected by the kvm_intel module
parameter pt_mode=1.
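For example, loading the module as "modprobe kvm-intel pt_mode=1" selects it
(assuming kvm_intel is built as a module rather than built in).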

In Host/Guest mode, the following rule applies:

        If the logical processor is operating with Intel PT enabled
        (if IA32_RTIT_CTL.TraceEn = 1) at the time of VM entry, the
        "load IA32_RTIT_CTL" VM-entry control must be 0.

However, "load IA32_RTIT_CTL" VM-entry control is always 1 in Host/Guest
mode, so IA32_RTIT_CTL.TraceEn must always be 0 at VM entry, irrespective
of whether guest IA32_RTIT_CTL.TraceEn is 1.
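
As an illustration only (not part of this patch), the invariant that the
rule imposes on the host side just before VMLAUNCH/VMRESUME could be
expressed as follows, using the existing MSR_IA32_RTIT_CTL and
RTIT_CTL_TRACEEN definitions; the helper name and the WARN are hypothetical:

        /* Sketch: host TraceEn must be 0 when "load IA32_RTIT_CTL" is 1 */
        static void pt_check_stopped_before_vm_entry(void)
        {
                u64 rtit_ctl;

                rdmsrl(MSR_IA32_RTIT_CTL, rtit_ctl);
                WARN_ON_ONCE(rtit_ctl & RTIT_CTL_TRACEEN);
        }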

Fix by always stopping host Intel PT tracing at VM entry in Host/Guest
mode. That also fixes the issue whereby the Intel PT NMI handler would
set IA32_RTIT_CTL.TraceEn back to 1 after KVM had just set it to 0.
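
In other words, the new intel_pt_vm_entry()/intel_pt_vm_exit() helpers are
meant to bracket the VM entry/exit path roughly like this (simplified
caller-side sketch; the actual KVM hook-up is in the vmx.c hunks below):

        intel_pt_vm_entry(guest_trace_enable);  /* stop host trace, stash state */
        /* ... VM entry, guest runs, VM exit ... */
        intel_pt_vm_exit(guest_trace_enable);   /* restore state, restart host trace */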

Fixes: 2ef444f1600b ("KVM: x86: Add Intel PT context switch for each vcpu")
Cc: sta...@vger.kernel.org
Signed-off-by: Adrian Hunter <adrian.hun...@intel.com>
---
 arch/x86/events/intel/pt.c      | 131 +++++++++++++++++++++++++++++++-
 arch/x86/events/intel/pt.h      |  10 +++
 arch/x86/include/asm/intel_pt.h |   4 +
 arch/x86/kvm/vmx/vmx.c          |  23 ++----
 arch/x86/kvm/vmx/vmx.h          |   1 -
 5 files changed, 147 insertions(+), 22 deletions(-)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index a087bc0c5498..d9469d2d6aa6 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -480,16 +480,20 @@ static u64 pt_config_filters(struct perf_event *event)
                 */
 
                /* avoid redundant msr writes */
-               if (pt->filters.filter[range].msr_a != filter->msr_a) {
+               if (pt->filters.filter[range].msr_a != filter->msr_a ||
+                   pt->write_filter_msrs[range]) {
                        wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a);
                        pt->filters.filter[range].msr_a = filter->msr_a;
                }
 
-               if (pt->filters.filter[range].msr_b != filter->msr_b) {
+               if (pt->filters.filter[range].msr_b != filter->msr_b ||
+                   pt->write_filter_msrs[range]) {
                        wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b);
                        pt->filters.filter[range].msr_b = filter->msr_b;
                }
 
+               pt->write_filter_msrs[range] = false;
+
                rtit_ctl |= (u64)filter->config << pt_address_ranges[range].reg_off;
        }
 
@@ -534,6 +538,11 @@ static void pt_config(struct perf_event *event)
        reg |= (event->attr.config & PT_CONFIG_MASK);
 
        event->hw.aux_config = reg;
+
+       /* Configuration is complete, it is now OK to handle an NMI */
+       barrier();
+       WRITE_ONCE(pt->handle_nmi, 1);
+
        pt_config_start(event);
 }
 
@@ -950,6 +959,7 @@ static void pt_handle_status(struct pt *pt)
                pt_buffer_advance(buf);
 
        wrmsrl(MSR_IA32_RTIT_STATUS, status);
+       pt->status = status;
 }
 
 /**
@@ -1588,7 +1598,6 @@ static void pt_event_start(struct perf_event *event, int mode)
                        goto fail_end_stop;
        }
 
-       WRITE_ONCE(pt->handle_nmi, 1);
        hwc->state = 0;
 
        pt_config_buffer(buf);
@@ -1643,6 +1652,104 @@ static void pt_event_stop(struct perf_event *event, int mode)
        }
 }
 
+#define PT_VM_NO_TRANSITION    0
+#define PT_VM_ENTRY            1
+#define PT_VM_EXIT             2
+
+void intel_pt_vm_entry(bool guest_trace_enable)
+{
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+       struct perf_event *event;
+
+       pt->restart_event = NULL;
+       pt->stashed_buf_sz = 0;
+
+       WRITE_ONCE(pt->vm_transition, PT_VM_ENTRY);
+       barrier();
+
+       if (READ_ONCE(pt->handle_nmi)) {
+               /* Must stop handler before reading pt->handle.event */
+               WRITE_ONCE(pt->handle_nmi, 0);
+               barrier();
+               event = pt->handle.event;
+               if (event && !event->hw.state) {
+                       struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+                       if (buf && buf->snapshot)
+                               pt->stashed_buf_sz = buf->nr_pages << PAGE_SHIFT;
+                       pt->restart_event = event;
+                       pt_event_stop(event, PERF_EF_UPDATE);
+               }
+       }
+
+       /*
+        * If guest_trace_enable, MSRs need to be saved, but the values are
+        * either already cached or not needed:
+        *      MSR_IA32_RTIT_CTL               event->hw.aux_config
+        *      MSR_IA32_RTIT_STATUS            pt->status
+        *      MSR_IA32_RTIT_CR3_MATCH         not used
+        *      MSR_IA32_RTIT_OUTPUT_BASE       pt->output_base
+        *      MSR_IA32_RTIT_OUTPUT_MASK       pt->output_mask
+        *      MSR_IA32_RTIT_ADDR...           pt->filters
+        */
+}
+EXPORT_SYMBOL_GPL(intel_pt_vm_entry);
+
+void intel_pt_vm_exit(bool guest_trace_enable)
+{
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
+       u64 base = pt->output_base;
+       u64 mask = pt->output_mask;
+
+       WRITE_ONCE(pt->vm_transition, PT_VM_EXIT);
+       barrier();
+
+       /*
+        * If guest_trace_enable, MSRs need to be restored, but that is handled
+        * in different ways:
+        *      MSR_IA32_RTIT_CTL               written next start
+        *      MSR_IA32_RTIT_STATUS            restored below
+        *      MSR_IA32_RTIT_CR3_MATCH         not used
+        *      MSR_IA32_RTIT_OUTPUT_BASE       written next start or restored
+        *                                      further below
+        *      MSR_IA32_RTIT_OUTPUT_MASK       written next start or restored
+        *                                      further below
+        *      MSR_IA32_RTIT_ADDR...           flagged to be written when
+        *                                      needed
+        */
+       if (guest_trace_enable) {
+               wrmsrl(MSR_IA32_RTIT_STATUS, pt->status);
+               /*
+                * Force address filter MSR writes during reconfiguration,
+                * refer pt_config_filters().
+                */
+               for (int range = 0; range < PT_FILTERS_NUM; range++)
+                       pt->write_filter_msrs[range] = true;
+       }
+
+       if (pt->restart_event) {
+               if (guest_trace_enable) {
+                       /* Invalidate to force buffer reconfiguration */
+                       pt->output_base = ~0ULL;
+                       pt->output_mask = 0;
+               }
+               pt_event_start(pt->restart_event, 0);
+               pt->restart_event = NULL;
+       }
+
+       /* If tracing wasn't started, restore buffer configuration */
+       if (guest_trace_enable && !READ_ONCE(pt->handle_nmi)) {
+               wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base);
+               wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, mask);
+               pt->output_base = base;
+               pt->output_mask = mask;
+       }
+
+       barrier();
+       WRITE_ONCE(pt->vm_transition, PT_VM_NO_TRANSITION);
+}
+EXPORT_SYMBOL_GPL(intel_pt_vm_exit);
+
 static long pt_event_snapshot_aux(struct perf_event *event,
                                  struct perf_output_handle *handle,
                                  unsigned long size)
@@ -1651,6 +1758,24 @@ static long pt_event_snapshot_aux(struct perf_event *event,
        struct pt_buffer *buf = perf_get_aux(&pt->handle);
        unsigned long from = 0, to;
        long ret;
+       int tr;
+
+       /*
+        * Special handling during VM transition. At VM-Entry stage, once
+        * tracing is stopped, as indicated by buf == NULL, snapshot using the
+        * saved head position. At VM-Exit do that also until tracing is
+        * reconfigured as indicated by handle_nmi.
+        */
+       tr = READ_ONCE(pt->vm_transition);
+       if ((tr == PT_VM_ENTRY && !buf) || (tr == PT_VM_EXIT && !READ_ONCE(pt->handle_nmi))) {
+               if (WARN_ON_ONCE(!pt->stashed_buf_sz))
+                       return 0;
+               to = pt->handle.head;
+               if (to < size)
+                       from = pt->stashed_buf_sz;
+               from += to - size;
+               return perf_output_copy_aux(&pt->handle, handle, from, to);
+       }
 
        if (WARN_ON_ONCE(!buf))
                return 0;
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index a1b6c04b7f68..0428019b92f4 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -121,6 +121,11 @@ struct pt_filters {
  * @vmx_on:            1 if VMX is ON on this cpu
  * @output_base:       cached RTIT_OUTPUT_BASE MSR value
  * @output_mask:       cached RTIT_OUTPUT_MASK MSR value
+ * @status:            cached RTIT_STATUS MSR value
+ * @vm_transition:     VM transition (snapshot_aux needs special handling)
+ * @write_filter_msrs: write address filter MSRs during configuration
+ * @stashed_buf_sz:    buffer size during VM transition
+ * @restart_event:     event to restart after VM-Exit
  */
 struct pt {
        struct perf_output_handle handle;
@@ -129,6 +134,11 @@ struct pt {
        int                     vmx_on;
        u64                     output_base;
        u64                     output_mask;
+       u64                     status;
+       int                     vm_transition;
+       bool                    write_filter_msrs[PT_FILTERS_NUM];
+       unsigned long           stashed_buf_sz;
+       struct perf_event       *restart_event;
 };
 
 #endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index c796e9bc98b6..a673ac3a825e 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -30,11 +30,15 @@ enum pt_capabilities {
 void cpu_emergency_stop_pt(void);
 extern u32 intel_pt_validate_hw_cap(enum pt_capabilities cap);
 extern u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities cap);
+extern void intel_pt_vm_entry(bool guest_trace_enable);
+extern void intel_pt_vm_exit(bool guest_trace_enable);
 extern int is_intel_pt_event(struct perf_event *event);
 #else
 static inline void cpu_emergency_stop_pt(void) {}
static inline u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) { return 0; }
static inline u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) { return 0; }
+static inline void intel_pt_vm_entry(bool guest_trace_enable) {}
+static inline void intel_pt_vm_exit(bool guest_trace_enable) {}
 static inline int is_intel_pt_event(struct perf_event *event) { return 0; }
 #endif
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index eaf4965ac6df..9998da4e774d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1220,16 +1220,10 @@ static void pt_guest_enter(struct vcpu_vmx *vmx)
        if (vmx_pt_mode_is_system())
                return;
 
-       /*
-        * GUEST_IA32_RTIT_CTL is already set in the VMCS.
-        * Save host state before VM entry.
-        */
-       rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
-       if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
-               wrmsrl(MSR_IA32_RTIT_CTL, 0);
-               pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
+       intel_pt_vm_entry(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
+
+       if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN)
                pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
-       }
 }
 
 static void pt_guest_exit(struct vcpu_vmx *vmx)
@@ -1237,17 +1231,10 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
        if (vmx_pt_mode_is_system())
                return;
 
-       if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
+       if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN)
                pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
-               pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
-       }
 
-       /*
-        * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
-        * i.e. RTIT_CTL is always cleared on VM-Exit.  Restore it if necessary.
-        */
-       if (vmx->pt_desc.host.ctl)
-               wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+       intel_pt_vm_exit(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
 }
 
 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 2325f773a20b..24ac6f6dc0ca 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -63,7 +63,6 @@ struct pt_desc {
        u64 ctl_bitmask;
        u32 num_address_ranges;
        u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
-       struct pt_ctx host;
        struct pt_ctx guest;
 };
 
-- 
2.43.0

