On Fri, Jun 18, 2021 at 04:48:14AM +0000, Jing Zhang wrote:
> This commit defines the API for userspace and prepare the common
> functionalities to support per VM/VCPU binary stats data readings.
> 
> The KVM stats now is only accessible by debugfs, which has some
> shortcomings this change series are supposed to fix:
> 1. The current debugfs stats solution in KVM could be disabled
>    when kernel Lockdown mode is enabled, which is a potential
>    rick for production.
> 2. The current debugfs stats solution in KVM is organized as "one
>    stats per file", it is good for debugging, but not efficient
>    for production.
> 3. The stats read/clear in current debugfs solution in KVM are
>    protected by the global kvm_lock.
> 
> Besides that, there are some other benefits with this change:
> 1. All KVM VM/VCPU stats can be read out in a bulk by one copy
>    to userspace.
> 2. A schema is used to describe KVM statistics. From userspace's
>    perspective, the KVM statistics are self-describing.
> 3. With the fd-based solution, a separate telemetry would be able
>    to read KVM stats in a less privileged environment.
> 4. After the initial setup by reading in stats descriptors, a
>    telemetry only needs to read the stats data itself, no more
>    parsing or setup is needed.
> 
> Reviewed-by: David Matlack <dmatl...@google.com>
> Reviewed-by: Ricardo Koller <ricar...@google.com>
> Reviewed-by: Krish Sadhukhan <krish.sadhuk...@oracle.com>
> Reviewed-by: Fuad Tabba <ta...@google.com>
> Tested-by: Fuad Tabba <ta...@google.com> #arm64
> Signed-off-by: Jing Zhang <jingzhan...@google.com>
> ---
>  arch/arm64/kvm/Makefile   |   2 +-
>  arch/mips/kvm/Makefile    |   2 +-
>  arch/powerpc/kvm/Makefile |   2 +-
>  arch/s390/kvm/Makefile    |   3 +-
>  arch/x86/kvm/Makefile     |   2 +-
>  include/linux/kvm_host.h  | 145 ++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/kvm.h  |  42 +++++++++++
>  virt/kvm/binary_stats.c   | 130 ++++++++++++++++++++++++++++++++++
>  8 files changed, 323 insertions(+), 5 deletions(-)
>  create mode 100644 virt/kvm/binary_stats.c
> 
> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> index 589921392cb1..989bb5dad2c8 100644
> --- a/arch/arm64/kvm/Makefile
> +++ b/arch/arm64/kvm/Makefile
> @@ -11,7 +11,7 @@ obj-$(CONFIG_KVM) += kvm.o
>  obj-$(CONFIG_KVM) += hyp/
>  
>  kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
> -      $(KVM)/vfio.o $(KVM)/irqchip.o \
> +      $(KVM)/vfio.o $(KVM)/irqchip.o $(KVM)/binary_stats.o \
>        arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
>        inject_fault.o va_layout.o handle_exit.o \
>        guest.o debug.o reset.o sys_regs.o \
> diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile
> index 30cc060857c7..c67250a956b8 100644
> --- a/arch/mips/kvm/Makefile
> +++ b/arch/mips/kvm/Makefile
> @@ -2,7 +2,7 @@
>  # Makefile for KVM support for MIPS
>  #
>  
> -common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o 
> eventfd.o)
> +common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o 
> eventfd.o binary_stats.o)
>  
>  EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm
>  
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index 2bfeaa13befb..b347d043b932 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -6,7 +6,7 @@
>  ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
>  KVM := ../../../virt/kvm
>  
> -common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o
> +common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/binary_stats.o
>  common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
>  common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
>  
> diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
> index 12decca22e7c..b3aaadc60ead 100644
> --- a/arch/s390/kvm/Makefile
> +++ b/arch/s390/kvm/Makefile
> @@ -4,7 +4,8 @@
>  # Copyright IBM Corp. 2008
>  
>  KVM := ../../../virt/kvm
> -common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o 
> $(KVM)/irqchip.o $(KVM)/vfio.o
> +common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o \
> +           $(KVM)/irqchip.o $(KVM)/vfio.o $(KVM)/binary_stats.o
>  
>  ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
>  
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index 83331376b779..75dfd27b6e8a 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -11,7 +11,7 @@ KVM := ../../../virt/kvm
>  
>  kvm-y                        += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
>                               $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o 
> \
> -                             $(KVM)/dirty_ring.o
> +                             $(KVM)/dirty_ring.o $(KVM)/binary_stats.o
>  kvm-$(CONFIG_KVM_ASYNC_PF)   += $(KVM)/async_pf.o
>  
>  kvm-y                        += x86.o emulate.o i8259.o irq.o lapic.o \
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 5a31e0696360..2f0d12064ae7 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1272,6 +1272,12 @@ struct kvm_stats_debugfs_item {
>       int mode;
>  };
>  
> +#define KVM_STATS_NAME_LEN   48
> +struct _kvm_stats_desc {
> +     struct kvm_stats_desc desc;
> +     char name[KVM_STATS_NAME_LEN];
> +};
> +
>  #define KVM_DBGFS_GET_MODE(dbgfs_item)                                       
>   \
>       ((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644)
>  
> @@ -1285,8 +1291,147 @@ struct kvm_stats_debugfs_item {
>       { n, offsetof(struct kvm_vcpu, stat.generic.x),                        \
>         KVM_STAT_VCPU, ## __VA_ARGS__ }
>  
> +#define STATS_DESC_COMMON(type, unit, base, exp)                            \
> +     .flags = type | unit | base |                                          \
> +         BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) |                   \
> +         BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) |                   \
> +         BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK),                    \
> +     .exponent = exp,                                                       \
> +     .size = 1
> +
> +#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp)                  \
> +     {                                                                      \
> +             {                                                              \
> +                     STATS_DESC_COMMON(type, unit, base, exp),              \
> +                     .offset = offsetof(struct kvm_vm_stat, generic.stat)   \
> +             },                                                             \
> +             .name = #stat,                                                 \
> +     }
> +#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp)                \
> +     {                                                                      \
> +             {                                                              \
> +                     STATS_DESC_COMMON(type, unit, base, exp),              \
> +                     .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \
> +             },                                                             \
> +             .name = #stat,                                                 \
> +     }
> +#define VM_STATS_DESC(stat, type, unit, base, exp)                          \
> +     {                                                                      \
> +             {                                                              \
> +                     STATS_DESC_COMMON(type, unit, base, exp),              \
> +                     .offset = offsetof(struct kvm_vm_stat, stat)           \
> +             },                                                             \
> +             .name = #stat,                                                 \
> +     }
> +#define VCPU_STATS_DESC(stat, type, unit, base, exp)                        \
> +     {                                                                      \
> +             {                                                              \
> +                     STATS_DESC_COMMON(type, unit, base, exp),              \
> +                     .offset = offsetof(struct kvm_vcpu_stat, stat)         \
> +             },                                                             \
> +             .name = #stat,                                                 \
> +     }
> +/* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */
> +#define STATS_DESC(SCOPE, stat, type, unit, base, exp)                       
>        \
> +     SCOPE##_STATS_DESC(stat, type, unit, base, exp)
> +
> +#define STATS_DESC_CUMULATIVE(SCOPE, name, unit, base, exponent)            \
> +     STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE,                     \
> +                   unit, base, exponent)
> +#define STATS_DESC_INSTANT(SCOPE, name, unit, base, exponent)                
>        \
> +     STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, unit, base, exponent)  \
> +
> +/* Cumulative counter */
> +#define STATS_DESC_COUNTER(SCOPE, name)                                      
>        \
> +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_NONE,                \
> +             KVM_STATS_BASE_POW10, 0)
> +/* Instantaneous counter */
> +#define STATS_DESC_ICOUNTER(SCOPE, name)                                    \
> +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_NONE,                   \
> +             KVM_STATS_BASE_POW10, 0)
> +
> +/* Cumulative clock cycles */
> +#define STATS_DESC_CYCLE(SCOPE, name)                                        
>        \
> +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_CYCLES,              \
> +             KVM_STATS_BASE_POW10, 0)
> +/* Instantaneous clock cycles */
> +#define STATS_DESC_ICYCLE(SCOPE, name)                                       
>        \
> +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_CYCLES,                 \
> +             KVM_STATS_BASE_POW10, 0)
> +
> +/* Cumulative memory size in Byte */
> +#define STATS_DESC_SIZE_BYTE(SCOPE, name)                                   \
> +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_BYTES,               \
> +             KVM_STATS_BASE_POW2, 0)
> +/* Cumulative memory size in KiByte */
> +#define STATS_DESC_SIZE_KBYTE(SCOPE, name)                                  \
> +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_BYTES,               \
> +             KVM_STATS_BASE_POW2, 10)
> +/* Cumulative memory size in MiByte */
> +#define STATS_DESC_SIZE_MBYTE(SCOPE, name)                                  \
> +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_BYTES,               \
> +             KVM_STATS_BASE_POW2, 20)
> +/* Cumulative memory size in GiByte */
> +#define STATS_DESC_SIZE_GBYTE(SCOPE, name)                                  \
> +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_BYTES,               \
> +             KVM_STATS_BASE_POW2, 30)
> +
> +/* Instantaneous memory size in Byte */
> +#define STATS_DESC_ISIZE_BYTE(SCOPE, name)                                  \
> +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BYTES,                  \
> +             KVM_STATS_BASE_POW2, 0)
> +/* Instantaneous memory size in KiByte */
> +#define STATS_DESC_ISIZE_KBYTE(SCOPE, name)                                 \
> +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BYTES,                  \
> +             KVM_STATS_BASE_POW2, 10)
> +/* Instantaneous memory size in MiByte */
> +#define STATS_DESC_ISIZE_MBYTE(SCOPE, name)                                 \
> +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BYTES,                  \
> +             KVM_STATS_BASE_POW2, 20)
> +/* Instantaneous memory size in GiByte */
> +#define STATS_DESC_ISIZE_GBYTE(SCOPE, name)                                 \
> +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BYTES,                  \
> +             KVM_STATS_BASE_POW2, 30)
> +
> +/* Cumulative time in second */
> +#define STATS_DESC_TIME_SEC(SCOPE, name)                                    \
> +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS,             \
> +             KVM_STATS_BASE_POW10, 0)
> +/* Cumulative time in millisecond */
> +#define STATS_DESC_TIME_MSEC(SCOPE, name)                                   \
> +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS,             \
> +             KVM_STATS_BASE_POW10, -3)
> +/* Cumulative time in microsecond */
> +#define STATS_DESC_TIME_USEC(SCOPE, name)                                   \
> +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS,             \
> +             KVM_STATS_BASE_POW10, -6)
> +/* Cumulative time in nanosecond */
> +#define STATS_DESC_TIME_NSEC(SCOPE, name)                                   \
> +     STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS,             \
> +             KVM_STATS_BASE_POW10, -9)
> +
> +/* Instantaneous time in second */
> +#define STATS_DESC_ITIME_SEC(SCOPE, name)                                   \
> +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_SECONDS,                \
> +             KVM_STATS_BASE_POW10, 0)
> +/* Instantaneous time in millisecond */
> +#define STATS_DESC_ITIME_MSEC(SCOPE, name)                                  \
> +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_SECONDS,                \
> +             KVM_STATS_BASE_POW10, -3)
> +/* Instantaneous time in microsecond */
> +#define STATS_DESC_ITIME_USEC(SCOPE, name)                                  \
> +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_SECONDS,                \
> +             KVM_STATS_BASE_POW10, -6)
> +/* Instantaneous time in nanosecond */
> +#define STATS_DESC_ITIME_NSEC(SCOPE, name)                                  \
> +     STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_SECONDS,                \
> +             KVM_STATS_BASE_POW10, -9)
> +
>  extern struct kvm_stats_debugfs_item debugfs_entries[];
>  extern struct dentry *kvm_debugfs_dir;
> +ssize_t kvm_stats_read(char *id, struct kvm_stats_header *header,
> +             struct _kvm_stats_desc *desc, void *stats, size_t size_stats,
> +             char __user *user_buffer, size_t size, loff_t *offset);
>  
>  #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
>  static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 9febe1412f7a..ab73e905105c 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1086,6 +1086,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_HYPERV_ENFORCE_CPUID 199
>  #define KVM_CAP_SREGS2 200
>  #define KVM_CAP_EXIT_HYPERCALL 201
> +#define KVM_CAP_BINARY_STATS_FD 202
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -1905,4 +1906,45 @@ struct kvm_dirty_gfn {
>  #define KVM_BUS_LOCK_DETECTION_OFF             (1 << 0)
>  #define KVM_BUS_LOCK_DETECTION_EXIT            (1 << 1)
>  
> +#define KVM_STATS_ID_MAXLEN          64
> +
> +struct kvm_stats_header {
> +     __u32 name_size;
> +     __u32 count;
> +     __u32 desc_offset;
> +     __u32 data_offset;
> +     char id[];
> +};

You mentioned before that the size of this really is the size of the
structure + KVM_STATS_ID_MAXLEN, right?  Or is it - KVM_STATS_ID_MAXLEN?

If so, why not put that value explicitly in:
        char id[THE_REST_OF_THE_HEADER_SPACE];

As this is not a variable header size at all, and you can not change it
going forward, so the variable length array here feels disingenuous.



> +
> +#define KVM_STATS_TYPE_SHIFT         0
> +#define KVM_STATS_TYPE_MASK          (0xF << KVM_STATS_TYPE_SHIFT)
> +#define KVM_STATS_TYPE_CUMULATIVE    (0x0 << KVM_STATS_TYPE_SHIFT)
> +#define KVM_STATS_TYPE_INSTANT               (0x1 << KVM_STATS_TYPE_SHIFT)
> +#define KVM_STATS_TYPE_MAX           KVM_STATS_TYPE_INSTANT
> +
> +#define KVM_STATS_UNIT_SHIFT         4
> +#define KVM_STATS_UNIT_MASK          (0xF << KVM_STATS_UNIT_SHIFT)
> +#define KVM_STATS_UNIT_NONE          (0x0 << KVM_STATS_UNIT_SHIFT)
> +#define KVM_STATS_UNIT_BYTES         (0x1 << KVM_STATS_UNIT_SHIFT)
> +#define KVM_STATS_UNIT_SECONDS               (0x2 << KVM_STATS_UNIT_SHIFT)
> +#define KVM_STATS_UNIT_CYCLES                (0x3 << KVM_STATS_UNIT_SHIFT)
> +#define KVM_STATS_UNIT_MAX           KVM_STATS_UNIT_CYCLES
> +
> +#define KVM_STATS_BASE_SHIFT         8
> +#define KVM_STATS_BASE_MASK          (0xF << KVM_STATS_BASE_SHIFT)
> +#define KVM_STATS_BASE_POW10         (0x0 << KVM_STATS_BASE_SHIFT)
> +#define KVM_STATS_BASE_POW2          (0x1 << KVM_STATS_BASE_SHIFT)
> +#define KVM_STATS_BASE_MAX           KVM_STATS_BASE_POW2
> +
> +struct kvm_stats_desc {
> +     __u32 flags;
> +     __s16 exponent;
> +     __u16 size;
> +     __u32 offset;
> +     __u32 unused;
> +     char name[];
> +};

What is the max length of name?

Why aren't these structures defined here in kerneldoc so that we can
understand them better?  Putting them in a .rst file guarantees they
will get out of sync, and you can always directly import the kerneldoc
into the .rst file.

thanks,

greg k-h
_______________________________________________
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm

Reply via email to