We need to retrieve a VM's TSC offset in order to use
the host's TSC to merge host and guest traces. This is
explained in detail in this thread:

  [Qemu-devel] [RFC] host and guest kernel trace merging
  https://lists.nongnu.org/archive/html/qemu-devel/2016-03/msg00887.html

Today, the only way to retrieve a VM's TSC offset is
by using the kvm_write_tsc_offset tracepoint. This has
a few problems. First, the tracepoint is only emitted
when the VM boots, so a VM that is already running has
to be rebooted to obtain it. Second, tracepoints are not
supposed to be a stable ABI, which makes them a poor
interface for user-space tools to consume.

This commit exports a VM's TSC offset to user-space via
debugfs. A new file called "tsc-offset" is created in
the VM's debugfs directory. For example:

  /sys/kernel/debug/kvm/51696-10/tsc-offset

This file contains one TSC offset per line, for each
vCPU. For example:

  vcpu0: 18446742405270834952
  vcpu1: 18446742405270834952
  vcpu2: 18446742405270834952
  vcpu3: 18446742405270834952

There are some important observations about this
solution:

 - While all vCPUs' TSC offsets should be equal for the
   cases we care about (i.e. stable TSC and no write to
   the TSC MSR), I chose to follow the spec and export
   each vCPU's TSC offset (might also be helpful for
   debugging)

 - The TSC offset is only useful after the VM has booted

 - We'll probably need to export the TSC multiplier too.
   However, I've been using only the TSC offset for now.
   So, let's get this merged first and do the TSC multiplier
   as a second step

Signed-off-by: Luiz Capitulino <lcapitul...@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/svm.c              |  1 +
 arch/x86/kvm/vmx.c              |  8 ++++++++
 arch/x86/kvm/x86.c              | 30 ++++++++++++++++++++++++++++++
 4 files changed, 40 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 33ae3a4..5714bbd 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -952,6 +952,7 @@ struct kvm_x86_ops {
        bool (*has_wbinvd_exit)(void);
 
        u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
+       u64 (*read_cached_tsc_offset)(struct kvm_vcpu *vcpu);
        void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
        u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index af523d8..c851477 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5065,6 +5065,7 @@ static struct kvm_x86_ops svm_x86_ops = {
        .has_wbinvd_exit = svm_has_wbinvd_exit,
 
        .read_tsc_offset = svm_read_tsc_offset,
+       .read_cached_tsc_offset = svm_read_tsc_offset,
        .write_tsc_offset = svm_write_tsc_offset,
        .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest,
        .read_l1_tsc = svm_read_l1_tsc,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5cede40..82dfe42 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -616,6 +616,7 @@ struct vcpu_vmx {
        u64 hv_deadline_tsc;
 
        u64 current_tsc_ratio;
+       u64 cached_tsc_offset;
 
        bool guest_pkru_valid;
        u32 guest_pkru;
@@ -2608,6 +2609,11 @@ static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
        return vmcs_read64(TSC_OFFSET);
 }
 
+static u64 vmx_read_cached_tsc_offset(struct kvm_vcpu *vcpu)
+{
+       return to_vmx(vcpu)->cached_tsc_offset;
+}
+
 /*
  * writes 'offset' into guest's timestamp counter offset register
  */
@@ -2632,6 +2638,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
                                           vmcs_read64(TSC_OFFSET), offset);
                vmcs_write64(TSC_OFFSET, offset);
        }
+       to_vmx(vcpu)->cached_tsc_offset = offset;
 }
 
 static void vmx_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -11275,6 +11282,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
        .read_tsc_offset = vmx_read_tsc_offset,
+       .read_cached_tsc_offset = vmx_read_cached_tsc_offset,
        .write_tsc_offset = vmx_write_tsc_offset,
        .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest,
        .read_l1_tsc = vmx_read_l1_tsc,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 18dfbac..75a8e23 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -54,6 +54,7 @@
 #include <linux/pvclock_gtod.h>
 #include <linux/kvm_irqfd.h>
 #include <linux/irqbypass.h>
+#include <linux/debugfs.h>
 #include <trace/events/kvm.h>
 
 #include <asm/debugreg.h>
@@ -7779,8 +7780,37 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        return 0;
 }
 
+static int tsc_offset_show(struct seq_file *m, void *data)
+{
+       struct kvm *kvm = m->private;
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               seq_printf(m, "vcpu%d: %llu\n",
+                               vcpu->vcpu_id, kvm_x86_ops->read_cached_tsc_offset(vcpu));
+
+       return 0;
+}
+
+static int tsc_offset_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, tsc_offset_show, inode->i_private);
+}
+
+static const struct file_operations tsc_offset_fops = {
+       .owner          =       THIS_MODULE,
+       .open           =       tsc_offset_open,
+       .read           =       seq_read,
+       .llseek         =       seq_lseek,
+       .release        =       single_release,
+};
+
 int kvm_arch_create_vm_debugfs(struct kvm *kvm)
 {
+       if (!debugfs_create_file("tsc-offset", 0444,
+                               kvm->debugfs_dentry, kvm, &tsc_offset_fops))
+               return -ENOMEM;
        return 0;
 }
 
-- 
2.5.5

Reply via email to