[PATCH V2 3/5] para virt interface of perf to support kvm guest os statistics collection in guest os

Zhang, Yanmin Mon, 21 Jun 2010 02:34:03 -0700

The 3rd patch implements para virt perf in the host kernel.

Signed-off-by: Zhang Yanmin <yanmin_zh...@linux.intel.com>


---

--- linux-2.6_tip0620/arch/x86/include/asm/kvm_para.h   2010-06-21 
15:19:38.992999849 +0800
+++ linux-2.6_tip0620perfkvm/arch/x86/include/asm/kvm_para.h    2010-06-21 
15:21:39.308999849 +0800
@@ -2,6 +2,7 @@
 #define _ASM_X86_KVM_PARA_H
 
 #include <linux/types.h>
+#include <linux/list.h>
 #include <asm/hyperv.h>
 
 /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx.  It
@@ -19,7 +20,8 @@
 /* This indicates that the new set of kvmclock msrs
  * are available. The use of 0x11 and 0x12 is deprecated
  */
-#define KVM_FEATURE_CLOCKSOURCE2        3
+#define KVM_FEATURE_CLOCKSOURCE2       3
+#define KVM_FEATURE_PV_PERF            4
 
 /* The last 8 bits are used to indicate how to interpret the flags field
  * in pvclock structure. If no bits are set, all flags are ignored.
@@ -33,7 +35,14 @@
 #define MSR_KVM_WALL_CLOCK_NEW  0x4b564d00
 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
 
-#define KVM_MAX_MMU_OP_BATCH           32
+#define KVM_MAX_MMU_OP_BATCH           32
+
+/* Operations for KVM_PERF_OP */
+#define KVM_PERF_OP_OPEN               1
+#define KVM_PERF_OP_CLOSE              2
+#define KVM_PERF_OP_ENABLE             3
+#define KVM_PERF_OP_DISABLE            4
+#define KVM_PERF_OP_READ               5
 
 /* Operations for KVM_HC_MMU_OP */
 #define KVM_MMU_OP_WRITE_PTE            1
@@ -64,6 +73,85 @@ struct kvm_mmu_op_release_pt {
 #ifdef __KERNEL__
 #include <asm/processor.h>
 
+/*
+ * Data communication area for one perf_event, shared between the
+ * host kernel and the guest kernel.
+ *
+ * count:     event count; the host fills it from perf_event->count.
+ * overflows: overflow count; the host adds the overflows it has
+ *            accumulated since the previous copy to the guest.
+ */
+struct guest_perf_event {
+       u64 count;
+       atomic_t overflows;
+};
+
+/*
+ * In the host kernel, perf_event->host_perf_shadow points to a
+ * host_perf_shadow, which records the guest-side information that
+ * belongs to that host perf_event.
+ */
+struct host_perf_shadow {
+       /* guest perf_event id passed from guest os */
+       int id;
+       /*
+        * The host kernel first saves its data into this counter.
+        * kvm later takes the data from this counter and calls kvm
+        * functions to copy or add it back to the guest os before
+        * entering the guest os the next time.
+        */
+       struct guest_perf_event counter;
+       /* guest_event_addr is a gpa_t pointing to the guest os guest_perf_event */
+       __u64 guest_event_addr;
+
+       /*
+        * Link into one bucket of kvm.kvm_arch.shadow_hash_table.
+        * vcpu is the vcpu the event is currently associated with.
+        */
+       struct list_head shadow_entry;
+       struct kvm_vcpu *vcpu;
+
+       struct perf_event *host_event;
+       /*
+        * The counter below prevents a malicious guest os from
+        * closing and enabling an event at the same time; the shadow
+        * and its host_event are released when it drops to zero.
+        */
+       atomic_t ref_counter;
+};
+
+/*
+ * In the guest kernel, perf_event->guest_shadow points to a
+ * guest_perf_shadow, which records some information
+ * about the guest event.
+ */
+struct guest_perf_shadow {
+       /* guest perf_event id passed from guest os */
+       int id;
+       /*
+        * The host kernel (kvm) saves its data into this counter.
+        */
+       struct guest_perf_event counter;
+};
+
+/*
+ * guest_perf_attr is used when the guest calls the hypercall to
+ * open a new perf_event at the host side. It is mostly a copy of
+ * perf_event_attr without the members the host kernel does not use.
+ *
+ * NOTE(review): the __u32 members (type, bp_type) are each followed by
+ * a __u64 member, so the padding after them depends on the ABI's __u64
+ * alignment. A 32-bit guest and a 64-bit host may therefore disagree on
+ * this layout -- confirm, and consider explicit padding members.
+ */
+struct guest_perf_attr {
+       __u32                   type;
+       __u64                   config;
+       __u64                   sample_period;
+       __u64                   sample_type;
+       __u64                   read_format;
+       __u64                   flags;
+       __u32                   bp_type;
+       __u64                   bp_addr;
+       __u64                   bp_len;
+};
+
+struct guest_perf_event_param {
+       __u64 attr_addr;
+       __u64 guest_event_addr;
+       /*
+        * In case there is an alignment issue, we put id as the last one.
+        *
+        * attr_addr is the guest physical address of the guest_perf_attr,
+        * guest_event_addr the guest physical address of the
+        * guest_perf_event, and id the guest perf_event id.
+        *
+        * NOTE(review): with id last, the trailing padding (and hence
+        * sizeof) may still differ between a 32-bit guest and a 64-bit
+        * host -- confirm.
+        */
+       int id;
+};
+
 extern void kvmclock_init(void);
 
 
--- linux-2.6_tip0620/arch/x86/include/asm/kvm_host.h   2010-06-21 
15:19:39.019999849 +0800
+++ linux-2.6_tip0620perfkvm/arch/x86/include/asm/kvm_host.h    2010-06-21 
15:21:39.308999849 +0800
@@ -24,6 +24,7 @@
 #include <asm/desc.h>
 #include <asm/mtrr.h>
 #include <asm/msr-index.h>
+#include <asm/perf_event.h>
 
 #define KVM_MAX_VCPUS 64
 #define KVM_MEMORY_SLOTS 32
@@ -360,6 +361,18 @@ struct kvm_vcpu_arch {
 
        /* fields used by HYPER-V emulation */
        u64 hv_vapic;
+
+       /*
+        * Fields used by PARAVIRT perf interface:
+        *
+        * kvm checks overflow_events before entering guest os,
+        * and copy data back to guest os.
+        * event_mutex is to avoid a race between NMI perf event overflow
+        * handler, event close, and enable/disable.
+        */
+       struct mutex event_mutex;
+       int overflows;
+       struct perf_event *overflow_events[X86_PMC_IDX_MAX];
 };
 
 struct kvm_mem_alias {
@@ -377,6 +390,9 @@ struct kvm_mem_aliases {
        int naliases;
 };
 
+/* Number of buckets (a power of two) in kvm_arch.shadow_hash_table */
+#define KVM_PARAVIRT_PERF_EVENT_ENTRY_BITS     (10)
+#define KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM      (1<<KVM_PARAVIRT_PERF_EVENT_ENTRY_BITS)
+
 struct kvm_arch {
        struct kvm_mem_aliases *aliases;
 
@@ -415,6 +431,15 @@ struct kvm_arch {
        /* fields used by HYPER-V emulation */
        u64 hv_guest_os_id;
        u64 hv_hypercall;
+
+       /*
+        * fields used by PARAVIRT perf interface:
+        * Used to organize all host perf_events representing guest
+        * perf_event on a specific kvm instance
+        */
+       atomic_t kvm_pv_event_num;
+       spinlock_t shadow_lock;
+       struct list_head *shadow_hash_table;
 };
 
 struct kvm_vm_stat {
@@ -561,6 +586,9 @@ int emulator_write_phys(struct kvm_vcpu 
                          const void *val, int bytes);
 int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
                  gpa_t addr, unsigned long *ret);
+int kvm_pv_perf_op(struct kvm_vcpu *vcpu, int op_code, unsigned long a1,
+                  unsigned long a2, unsigned long *result);
+
 u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 extern bool tdp_enabled;
--- linux-2.6_tip0620/include/linux/kvm_para.h  2010-06-21 15:19:53.309999849 
+0800
+++ linux-2.6_tip0620perfkvm/include/linux/kvm_para.h   2010-06-21 
15:21:39.312999849 +0800
@@ -17,6 +17,7 @@
 
 #define KVM_HC_VAPIC_POLL_IRQ          1
 #define KVM_HC_MMU_OP                  2
+#define KVM_PERF_OP                    3
 
 /*
  * hypercalls use architecture specific
--- linux-2.6_tip0620/arch/x86/kvm/vmx.c        2010-06-21 15:19:39.322999849 
+0800
+++ linux-2.6_tip0620perfkvm/arch/x86/kvm/vmx.c 2010-06-21 15:21:39.310999849 
+0800
@@ -3647,6 +3647,7 @@ static int vmx_handle_exit(struct kvm_vc
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 exit_reason = vmx->exit_reason;
        u32 vectoring_info = vmx->idt_vectoring_info;
+       int ret;
 
        trace_kvm_exit(exit_reason, vcpu);
 
@@ -3694,12 +3695,17 @@ static int vmx_handle_exit(struct kvm_vc
 
        if (exit_reason < kvm_vmx_max_exit_handlers
            && kvm_vmx_exit_handlers[exit_reason])
-               return kvm_vmx_exit_handlers[exit_reason](vcpu);
+               ret = kvm_vmx_exit_handlers[exit_reason](vcpu);
        else {
                vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
                vcpu->run->hw.hardware_exit_reason = exit_reason;
+               ret = 0;
        }
-       return 0;
+
+       /* sync paravirt perf event to guest */
+       kvm_sync_events_to_guest(vcpu);
+
+       return ret;
 }
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
--- linux-2.6_tip0620/arch/x86/kvm/x86.c        2010-06-21 15:19:39.315999849 
+0800
+++ linux-2.6_tip0620perfkvm/arch/x86/kvm/x86.c 2010-06-21 16:49:58.182999849 
+0800
@@ -6,12 +6,14 @@
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2008 Qumranet, Inc.
  * Copyright IBM Corporation, 2008
+ * Copyright Intel Corporation, 2010
  *
  * Authors:
  *   Avi Kivity   <a...@qumranet.com>
  *   Yaniv Kamay  <ya...@qumranet.com>
  *   Amit Shah    <amit.s...@qumranet.com>
  *   Ben-Ami Yassour <ben...@il.ibm.com>
+ *   Yanmin Zhang <yanmin.zh...@intel.com>
  *
  * This work is licensed under the terms of the GNU GPL, version 2.  See
  * the COPYING file in the top-level directory.
@@ -1618,6 +1620,7 @@ int kvm_dev_ioctl_check_extension(long e
        case KVM_CAP_PCI_SEGMENT:
        case KVM_CAP_DEBUGREGS:
        case KVM_CAP_X86_ROBUST_SINGLESTEP:
+       case KVM_CAP_PV_PERF:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
@@ -1993,7 +1996,9 @@ static void do_cpuid_ent(struct kvm_cpui
                entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
                             (1 << KVM_FEATURE_NOP_IO_DELAY) |
                             (1 << KVM_FEATURE_CLOCKSOURCE2) |
+                            (1 << KVM_FEATURE_PV_PERF) |
                             (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+
                entry->ebx = 0;
                entry->ecx = 0;
                entry->edx = 0;
@@ -4052,10 +4057,21 @@ static unsigned long kvm_get_guest_ip(vo
        return ip;
 }
 
+/*
+ * Inject an NMI into the vcpu recorded in this cpu's current_vcpu, if any.
+ *
+ * Returns 0 when an NMI was injected, -1 when current_vcpu is not set.
+ */
+int kvm_notify_event_overflow(void)
+{
+       if (!percpu_read(current_vcpu))
+               return -1;
+
+       kvm_inject_nmi(percpu_read(current_vcpu));
+       return 0;
+}
+
 static struct perf_guest_info_callbacks kvm_guest_cbs = {
        .is_in_guest            = kvm_is_in_guest,
        .is_user_mode           = kvm_is_user_mode,
        .get_guest_ip           = kvm_get_guest_ip,
+       .copy_event_to_shadow   = kvm_copy_event_to_shadow,
 };
 
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
@@ -4138,15 +4154,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vc
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
-static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
-                          unsigned long a1)
-{
-       if (is_long_mode(vcpu))
-               return a0;
-       else
-               return a0 | ((gpa_t)a1 << 32);
-}
-
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 {
        u64 param, ingpa, outgpa, ret;
@@ -4245,6 +4252,9 @@ int kvm_emulate_hypercall(struct kvm_vcp
        case KVM_HC_MMU_OP:
                r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
                break;
+       case KVM_PERF_OP:
+               r = kvm_pv_perf_op(vcpu, a0, a1, a2, &ret);
+               break;
        default:
                ret = -KVM_ENOSYS;
                break;
@@ -5334,6 +5344,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *
        }
        vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
+       mutex_init(&vcpu->arch.event_mutex);
+
        return 0;
 fail_free_lapic:
        kvm_free_lapic(vcpu);
@@ -5360,6 +5372,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcp
 struct  kvm *kvm_arch_create_vm(void)
 {
        struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+       struct list_head *hash_table;
+       int i;
 
        if (!kvm)
                return ERR_PTR(-ENOMEM);
@@ -5369,6 +5383,18 @@ struct  kvm *kvm_arch_create_vm(void)
                kfree(kvm);
                return ERR_PTR(-ENOMEM);
        }
+       hash_table = kmalloc(sizeof(struct list_head) *
+                       KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM,
+                       GFP_KERNEL);
+       if (!hash_table) {
+               kfree(kvm->arch.aliases);
+               kfree(kvm);
+               return ERR_PTR(-ENOMEM);
+       }
+       for (i = 0; i < KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM; i++)
+               INIT_LIST_HEAD(&hash_table[i]);
+       kvm->arch.shadow_hash_table = hash_table;
+       spin_lock_init(&kvm->arch.shadow_lock);
 
        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
        INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
@@ -5416,6 +5442,8 @@ void kvm_arch_sync_events(struct kvm *kv
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+       kvm_remove_all_perf_events(kvm);
+
        kvm_iommu_unmap_guest(kvm);
        kvm_free_pit(kvm);
        kfree(kvm->arch.vpic);
@@ -5427,6 +5455,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm
        if (kvm->arch.ept_identity_pagetable)
                put_page(kvm->arch.ept_identity_pagetable);
        cleanup_srcu_struct(&kvm->srcu);
+       kfree(kvm->arch.shadow_hash_table);
        kfree(kvm->arch.aliases);
        kfree(kvm);
 }
--- linux-2.6_tip0620/arch/x86/kvm/x86.h        2010-06-21 15:19:39.311999849 
+0800
+++ linux-2.6_tip0620perfkvm/arch/x86/kvm/x86.h 2010-06-21 15:21:39.312999849 
+0800
@@ -72,7 +72,20 @@ static inline struct kvm_mem_aliases *kv
                        || lockdep_is_held(&kvm->slots_lock));
 }
 
+/*
+ * Build a guest physical address from two hypercall arguments: in long
+ * mode a0 already holds the whole address, otherwise a0 supplies the low
+ * bits and a1 is shifted into the upper 32 bits.
+ */
+static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
+                          unsigned long a1)
+{
+       gpa_t gpa = a0;
+
+       if (!is_long_mode(vcpu))
+               gpa |= (gpa_t)a1 << 32;
+
+       return gpa;
+}
+
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
+int kvm_notify_event_overflow(void);
+void kvm_copy_event_to_shadow(struct perf_event *event, int overflows);
+void kvm_sync_events_to_guest(struct kvm_vcpu *vcpu);
+void kvm_remove_all_perf_events(struct kvm *kvm);
 
 #endif
--- linux-2.6_tip0620/arch/x86/kvm/Makefile     2010-06-21 15:19:39.311999849 
+0800
+++ linux-2.6_tip0620perfkvm/arch/x86/kvm/Makefile      2010-06-21 
15:21:39.310999849 +0800
@@ -11,7 +11,7 @@ kvm-y                 += $(addprefix ../../../virt/kvm
 kvm-$(CONFIG_IOMMU_API)        += $(addprefix ../../../virt/kvm/, iommu.o)
 
 kvm-y                  += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
-                          i8254.o timer.o
+                          i8254.o timer.o kvmperf_event.o
 kvm-intel-y            += vmx.o
 kvm-amd-y              += svm.o
 
--- linux-2.6_tip0620/arch/x86/kvm/kvmperf_event.c      1970-01-01 
08:00:00.000000000 +0800
+++ linux-2.6_tip0620perfkvm/arch/x86/kvm/kvmperf_event.c       2010-06-21 
16:49:29.509999849 +0800
@@ -0,0 +1,471 @@
+/*
+ * Performance events x86 kvm para architecture code
+ *
+ * Copyright (C) 2010 Intel Inc.
+ *     Zhang Yanmin <yanmin.zh...@intel.com>
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/file.h>
+#include <linux/syscalls.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+
+#include <asm/apic.h>
+#include <asm/stacktrace.h>
+#include <asm/nmi.h>
+#include <asm/compat.h>
+
+#include "x86.h"
+
+#define KVM_MAX_PARAVIRT_PERF_EVENT            (1024)
+
+/* Map a guest event id to its bucket index in shadow_hash_table. */
+static inline u32 shadow_hash_id(int id)
+{
+       return hash_32((u32)id, KVM_PARAVIRT_PERF_EVENT_ENTRY_BITS);
+}
+
+/*
+ * Insert host_shadow into the per-VM shadow hash table, under shadow_lock.
+ *
+ * A warning is raised for every entry already in the bucket that carries
+ * the same id; the new entry is added regardless. Always returns 0.
+ */
+static int kvm_add_host_event(struct kvm_vcpu *vcpu,
+               struct host_perf_shadow *host_shadow)
+{
+       struct kvm_arch *arch = &vcpu->kvm->arch;
+       struct list_head *bucket;
+       struct host_perf_shadow *cur;
+       long unsigned flags;
+
+       bucket = &arch->shadow_hash_table[shadow_hash_id(host_shadow->id)];
+
+       spin_lock_irqsave(&arch->shadow_lock, flags);
+       list_for_each_entry(cur, bucket, shadow_entry) {
+               WARN(cur->id == host_shadow->id, "%s called when there is an"
+                       " item with the same id [%d] in hash table,",
+                       __func__, host_shadow->id);
+       }
+       list_add(&host_shadow->shadow_entry, bucket);
+       spin_unlock_irqrestore(&arch->shadow_lock, flags);
+
+       return 0;
+}
+
+/*
+ * Look up the host perf_event registered under the guest event id.
+ *
+ * With need_delete set, the matching shadow is unlinked from the hash
+ * table and no reference is taken; otherwise its ref_counter is
+ * incremented. The lookup runs under shadow_lock.
+ *
+ * Returns the host perf_event, or NULL when the id is not in the table.
+ */
+static struct perf_event *
+kvm_find_get_host_event(struct kvm_vcpu *vcpu, int id, int need_delete)
+{
+       struct kvm_arch *arch = &vcpu->kvm->arch;
+       struct list_head *bucket;
+       struct host_perf_shadow *cur;
+       struct perf_event *event = NULL;
+       long unsigned flags;
+
+       bucket = &arch->shadow_hash_table[shadow_hash_id(id)];
+
+       spin_lock_irqsave(&arch->shadow_lock, flags);
+       list_for_each_entry(cur, bucket, shadow_entry) {
+               if (cur->id != id)
+                       continue;
+
+               if (need_delete)
+                       list_del_init(&cur->shadow_entry);
+               else
+                       atomic_inc(&cur->ref_counter);
+               event = cur->host_event;
+               break;
+       }
+       spin_unlock_irqrestore(&arch->shadow_lock, flags);
+
+       return event;
+}
+
+/*
+ * Queue event on its vcpu's overflow_events array so that
+ * kvm_sync_events_to_guest() copies it to the guest later.
+ *
+ * Only mutex_trylock() is used because this runs in the NMI handler.
+ * Missing one report to the guest os is acceptable: the overflow count
+ * stays in host_perf_shadow, so a later overflow that gets the lock
+ * pushes the saved overflows to the guest as well. The event is also
+ * dropped silently when the array already holds X86_PMC_IDX_MAX entries.
+ *
+ * NOTE(review): the mutex API documents mutex_trylock() as not usable
+ * from interrupt context; confirm that event_mutex is safe to take here.
+ */
+static void kvm_vcpu_add_event_overflow_ref(struct perf_event *event)
+{
+       struct host_perf_shadow *host_shadow = event->host_perf_shadow;
+       struct kvm_vcpu_arch *arch = &host_shadow->vcpu->arch;
+
+       if (!mutex_trylock(&arch->event_mutex))
+               return;
+
+       if (arch->overflows < X86_PMC_IDX_MAX)
+               arch->overflow_events[arch->overflows++] = event;
+
+       mutex_unlock(&arch->event_mutex);
+}
+
+/*
+ * Clear every pending overflow_events slot of shadow's vcpu that refers
+ * to shadow's host event.
+ *
+ * Returns -1 when the shadow has no vcpu or the vcpu has no pending
+ * overflows (both checked without taking event_mutex), 0 otherwise.
+ */
+static int kvm_vcpu_remove_event_overflow_ref(struct host_perf_shadow *shadow)
+{
+       struct kvm_vcpu *vcpu = shadow->vcpu;
+       int slot;
+
+       if (!vcpu || !vcpu->arch.overflows)
+               return -1;
+
+       mutex_lock(&vcpu->arch.event_mutex);
+       for (slot = 0; slot < vcpu->arch.overflows; slot++) {
+               if (vcpu->arch.overflow_events[slot] != shadow->host_event)
+                       continue;
+               vcpu->arch.overflow_events[slot] = NULL;
+       }
+       mutex_unlock(&vcpu->arch.event_mutex);
+
+       return 0;
+}
+
+/*
+ * Record the current count of event and `overflows` additional overflows
+ * in its host_perf_shadow, queue the event for syncing to the guest, and
+ * request an NMI injection into the guest os.
+ */
+void kvm_copy_event_to_shadow(struct perf_event *event, int overflows)
+{
+       struct guest_perf_event *snapshot = &event->host_perf_shadow->counter;
+
+       snapshot->count = local64_read(&event->count);
+       atomic_add(overflows, &snapshot->overflows);
+
+       kvm_vcpu_add_event_overflow_ref(event);
+       /* Notify the guest os; the return value is ignored */
+       kvm_notify_event_overflow();
+}
+
+/*
+ * Overflow handler passed to perf_event_create_kernel_counter() for the
+ * host events opened on behalf of the guest; accounts one overflow.
+ * nmi, data and regs are not used.
+ */
+static void kvm_perf_event_overflow(struct perf_event *event, int nmi,
+               struct perf_sample_data *data, struct pt_regs *regs)
+{
+       BUG_ON(!event->host_perf_shadow);
+       kvm_copy_event_to_shadow(event, 1);
+}
+
+/*
+ * Drop one reference on the shadow of host_event.
+ *
+ * When the last reference goes away, the event is detached, removed from
+ * its vcpu's pending-overflow array and released, its shadow is freed and
+ * the per-VM count of paravirt events is decremented.
+ */
+static void kvm_put_host_event(struct perf_event *host_event)
+{
+       struct host_perf_shadow *shadow = host_event->host_perf_shadow;
+       struct kvm *kvm;
+
+       if (atomic_dec_return(&shadow->ref_counter))
+               return;
+
+       /* Read what is needed from the shadow before it is freed below */
+       kvm = shadow->vcpu->kvm;
+
+       /*
+        * detach it in case guest os doesn't disable it
+        * before closing
+        */
+       perf_event_detach(host_event);
+       kvm_vcpu_remove_event_overflow_ref(shadow);
+
+       perf_event_release_kernel(host_event);
+       kfree(shadow);
+       atomic_dec(&kvm->arch.kvm_pv_event_num);
+}
+
+/*
+ * Copy the shadow's count to the guest_perf_event at guest_event_addr
+ * and add the overflows accumulated in the shadow to the guest's overflow
+ * count; the shadow's overflow count is reset to 0 in the process.
+ *
+ * Nothing is changed when the guest structure cannot be read. The result
+ * of the write back to the guest is not checked.
+ */
+static void kvm_copy_event_to_guest(struct kvm_vcpu *vcpu,
+                       struct perf_event *host_event)
+{
+       struct host_perf_shadow *shadow = host_event->host_perf_shadow;
+       struct guest_perf_event guest_copy;
+       s32 pending;
+
+       if (kvm_read_guest(vcpu->kvm, shadow->guest_event_addr,
+                               &guest_copy, sizeof(guest_copy)) < 0)
+               return;
+
+       /* Atomically take the pending overflows and reset them to 0 */
+       do {
+               pending = atomic_read(&shadow->counter.overflows);
+       } while (atomic_cmpxchg(&shadow->counter.overflows, pending, 0) !=
+                       pending);
+
+       guest_copy.count = shadow->counter.count;
+       atomic_add(pending, &guest_copy.overflows);
+
+       kvm_write_guest(vcpu->kvm, shadow->guest_event_addr,
+                       &guest_copy, sizeof(guest_copy));
+}
+
+/*
+ * Called by KVM to push both perf_event->count and the overflows of
+ * every queued event to the guest, after the host NMI handler has
+ * detected guest perf_event overflows. The queue is empty afterwards.
+ */
+void kvm_sync_events_to_guest(struct kvm_vcpu *vcpu)
+{
+       struct perf_event *event;
+       int idx;
+
+       /* Checked without the mutex: nothing has been queued */
+       if (!vcpu->arch.overflows)
+               return;
+
+       mutex_lock(&vcpu->arch.event_mutex);
+       for (idx = 0; idx < vcpu->arch.overflows; idx++) {
+               event = vcpu->arch.overflow_events[idx];
+               if (!event)
+                       continue;
+               kvm_copy_event_to_guest(vcpu, event);
+       }
+       vcpu->arch.overflows = 0;
+       mutex_unlock(&vcpu->arch.event_mutex);
+}
+EXPORT_SYMBOL_GPL(kvm_sync_events_to_guest);
+
+/*
+ * Refresh the shadow's count from perf_event->count and write only that
+ * count to the guest. The overflows are not copied to the guest.
+ */
+static void
+kvm_copy_count_to_guest(struct kvm_vcpu *vcpu, struct perf_event *host_event)
+{
+       struct host_perf_shadow *shadow = host_event->host_perf_shadow;
+       u64 *count = &shadow->counter.count;
+
+       *count = local64_read(&host_event->count);
+       kvm_write_guest(vcpu->kvm, shadow->guest_event_addr,
+                       count, sizeof(*count));
+}
+
+/*
+ * KVM_PERF_OP_OPEN: create a host perf_event on behalf of the guest.
+ *
+ * addr is the guest physical address of a guest_perf_event_param, which
+ * carries the guest event id, the address of the guest_perf_attr and the
+ * address of the guest_perf_event that receives the results. The host
+ * event is created disabled and is registered in the per-VM shadow hash
+ * table under the guest id.
+ *
+ * Returns 0 on success. Failures: -ENOENT when the VM already has
+ * KVM_MAX_PARAVIRT_PERF_EVENT events, -ENOMEM, -EEXIST when the id is
+ * already registered, the kvm_read_guest() error, or -1 when the host
+ * event cannot be created. Every failure path releases the temporary
+ * buffers and gives the reserved event slot back.
+ */
+static int
+kvm_pv_perf_op_open(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+       int ret = 0;
+       struct perf_event *host_event = NULL;
+       struct host_perf_shadow *shadow = NULL;
+       struct guest_perf_event_param param;
+       struct guest_perf_attr *guest_attr = NULL;
+       struct perf_event_attr *attr = NULL;
+       atomic_t *event_num = &vcpu->kvm->arch.kvm_pv_event_num;
+
+       /*
+        * Reserve a slot first. Checking and incrementing in one atomic
+        * step keeps concurrent opens from exceeding the limit.
+        */
+       if (atomic_inc_return(event_num) > KVM_MAX_PARAVIRT_PERF_EVENT) {
+               atomic_dec(event_num);
+               WARN_ONCE(1, "guest os wants to open more than %d events\n",
+                       KVM_MAX_PARAVIRT_PERF_EVENT);
+               return -ENOENT;
+       }
+
+       attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+       if (!attr) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       guest_attr = kzalloc(sizeof(*guest_attr), GFP_KERNEL);
+       if (!guest_attr) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       ret = kvm_read_guest(vcpu->kvm, addr, &param, sizeof(param));
+       if (ret < 0)
+               goto out;
+
+       host_event = kvm_find_get_host_event(vcpu, param.id, 0);
+       if (host_event) {
+               /* The id is taken; drop the reference the lookup took */
+               kvm_put_host_event(host_event);
+               ret = -EEXIST;
+               goto out;
+       }
+
+       ret = kvm_read_guest(vcpu->kvm, param.attr_addr,
+                            guest_attr, sizeof(*guest_attr));
+       if (ret < 0)
+               goto out;
+
+       /*
+        * NOTE(review): guest_attr->sample_type is not propagated to the
+        * host attr -- confirm that this is intended.
+        */
+       attr->type = guest_attr->type;
+       attr->config = guest_attr->config;
+       attr->sample_period = guest_attr->sample_period;
+       attr->read_format = guest_attr->read_format;
+       attr->flags = guest_attr->flags;
+       attr->bp_type = guest_attr->bp_type;
+       attr->bp_addr = guest_attr->bp_addr;
+       attr->bp_len = guest_attr->bp_len;
+       /*
+        * By default, we disable the host event. Later on, guest os
+        * triggers a perf_event_attach to enable it
+        */
+       attr->disabled = 1;
+       attr->inherit = 0;
+       attr->enable_on_exec = 0;
+       /*
+        * We don't support exclude mode of user and kernel for guest os,
+        * which means we always collect both user and kernel for guest os
+        */
+       attr->exclude_user = 0;
+       attr->exclude_kernel = 0;
+
+       shadow = kzalloc(sizeof(*shadow), GFP_KERNEL);
+       if (!shadow) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       shadow->id = param.id;
+       shadow->guest_event_addr = param.guest_event_addr;
+       shadow->vcpu = vcpu;
+       INIT_LIST_HEAD(&shadow->shadow_entry);
+
+       /* The host event is created with cpu == -1 and pid == current->pid */
+       host_event = perf_event_create_kernel_counter(attr, -1,
+                               current->pid, kvm_perf_event_overflow);
+       if (IS_ERR(host_event)) {
+               ret = -1;
+               goto out;
+       }
+       host_event->host_perf_shadow = shadow;
+       shadow->host_event = host_event;
+       atomic_set(&shadow->ref_counter, 1);
+       kvm_add_host_event(vcpu, shadow);
+       ret = 0;
+
+out:
+       if (ret) {
+               /* Nothing was registered: free the shadow, return the slot */
+               kfree(shadow);
+               atomic_dec(event_num);
+       }
+       kfree(attr);
+       kfree(guest_attr);
+
+       return ret;
+}
+
+/*
+ * KVM_PERF_OP_CLOSE: unlink the event with the given id from the hash
+ * table and drop a reference on it.
+ *
+ * Returns 0, or -1 when no event with that id is registered.
+ */
+static int kvm_pv_perf_op_close(struct kvm_vcpu *vcpu, int id)
+{
+       struct perf_event *victim;
+
+       /* need_delete == 1: the lookup also removes it from the hashtable */
+       victim = kvm_find_get_host_event(vcpu, id, 1);
+       if (victim == NULL)
+               return -1;
+
+       kvm_put_host_event(victim);
+       return 0;
+}
+
+/*
+ * KVM_PERF_OP_ENABLE: attach the host event registered under id.
+ *
+ * When the event was last associated with a different vcpu, its pending
+ * overflow references are removed from that vcpu and the shadow is
+ * re-pointed at the calling vcpu first.
+ *
+ * Returns 0, or -1 when no event with that id is registered.
+ */
+static int kvm_pv_perf_op_enable(struct kvm_vcpu *vcpu, int id)
+{
+       struct perf_event *host_event = kvm_find_get_host_event(vcpu, id, 0);
+       struct host_perf_shadow *shadow;
+
+       if (host_event == NULL)
+               return -1;
+
+       shadow = host_event->host_perf_shadow;
+       if (shadow->vcpu != vcpu) {
+               kvm_vcpu_remove_event_overflow_ref(shadow);
+               shadow->vcpu = vcpu;
+       }
+
+       perf_event_attach(host_event);
+       /* Drop the reference taken by the lookup */
+       kvm_put_host_event(host_event);
+
+       return 0;
+}
+
+/*
+ * KVM_PERF_OP_DISABLE: detach the host event registered under id and
+ * write its current count to the guest.
+ *
+ * Returns 0, or -1 when no event with that id is registered.
+ */
+static int kvm_pv_perf_op_disable(struct kvm_vcpu *vcpu, int id)
+{
+       struct perf_event *event;
+
+       event = kvm_find_get_host_event(vcpu, id, 0);
+       if (event == NULL)
+               return -1;
+
+       perf_event_detach(event);
+       /* The guest count update is delayed, so sync the count now */
+       kvm_copy_count_to_guest(vcpu, event);
+       /* Drop the reference taken by the lookup */
+       kvm_put_host_event(event);
+
+       return 0;
+}
+
+/*
+ * KVM_PERF_OP_READ: write the current count of the host event registered
+ * under id to the guest. An active event is read through
+ * perf_event_read_value() first; the enabled/running times it reports
+ * are not used.
+ *
+ * Returns 0, or -1 when no event with that id is registered.
+ */
+static int kvm_pv_perf_op_read(struct kvm_vcpu *vcpu, int id)
+{
+       struct perf_event *event;
+       u64 enabled, running;
+
+       event = kvm_find_get_host_event(vcpu, id, 0);
+       if (event == NULL)
+               return -1;
+
+       if (event->state == PERF_EVENT_STATE_ACTIVE)
+               perf_event_read_value(event, &enabled, &running);
+       kvm_copy_count_to_guest(vcpu, event);
+       /* Drop the reference taken by the lookup */
+       kvm_put_host_event(event);
+
+       return 0;
+}
+
+/*
+ * Dispatch a KVM_PERF_OP hypercall.
+ *
+ * For KVM_PERF_OP_OPEN, a1/a2 form the guest physical address of the
+ * open parameters; for all other operations a1 is the guest event id.
+ * The status of the operation (-KVM_ENOSYS for an unknown op_code) is
+ * stored in *result. Always returns 0.
+ */
+int kvm_pv_perf_op(struct kvm_vcpu *vcpu, int op_code, unsigned long a1,
+               unsigned long a2, unsigned long *result)
+{
+       int id = (int) a1;
+       unsigned long status;
+
+       switch (op_code) {
+       case KVM_PERF_OP_OPEN:
+               status = (unsigned long)
+                       kvm_pv_perf_op_open(vcpu, hc_gpa(vcpu, a1, a2));
+               break;
+       case KVM_PERF_OP_CLOSE:
+               status = kvm_pv_perf_op_close(vcpu, id);
+               break;
+       case KVM_PERF_OP_ENABLE:
+               status = kvm_pv_perf_op_enable(vcpu, id);
+               break;
+       case KVM_PERF_OP_DISABLE:
+               status = kvm_pv_perf_op_disable(vcpu, id);
+               break;
+       case KVM_PERF_OP_READ:
+               status = kvm_pv_perf_op_read(vcpu, id);
+               break;
+       default:
+               status = -KVM_ENOSYS;
+               break;
+       }
+
+       *result = status;
+       return 0;
+}
+
+/*
+ * Drop a reference on every host event registered for this VM.
+ *
+ * All shadows are first moved from the hash table onto a private list
+ * under shadow_lock; the references are dropped after the lock has been
+ * released.
+ */
+void kvm_remove_all_perf_events(struct kvm *kvm)
+{
+       struct kvm_arch *arch = &kvm->arch;
+       struct host_perf_shadow *shadow, *next;
+       LIST_HEAD(doomed);
+       long unsigned flags;
+       int i;
+
+       spin_lock_irqsave(&arch->shadow_lock, flags);
+       for (i = 0; i < KVM_PARAVIRT_PERF_EVENT_ENTRY_NUM; i++) {
+               list_for_each_entry_safe(shadow, next,
+                               &arch->shadow_hash_table[i], shadow_entry)
+                       list_move(&shadow->shadow_entry, &doomed);
+       }
+       spin_unlock_irqrestore(&arch->shadow_lock, flags);
+
+       list_for_each_entry_safe(shadow, next, &doomed, shadow_entry) {
+               list_del(&shadow->shadow_entry);
+               kvm_put_host_event(shadow->host_event);
+       }
+}
+
--- linux-2.6_tip0620/include/linux/kvm.h       2010-06-21 15:19:52.605999849 
+0800
+++ linux-2.6_tip0620perfkvm/include/linux/kvm.h        2010-06-21 
15:21:39.312999849 +0800
@@ -524,6 +524,7 @@ struct kvm_enable_cap {
 #define KVM_CAP_PPC_OSI 52
 #define KVM_CAP_PPC_UNSET_IRQ 53
 #define KVM_CAP_ENABLE_CAP 54
+#define KVM_CAP_PV_PERF 57
 
 #ifdef KVM_CAP_IRQ_ROUTING
 


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH V2 3/5] para virt interface of perf to support kvm guest os statistics collection in guest os

Reply via email to