I had originally hoped to get this in for 2.6.20.  It now looks like .20 
will have a shorter cycle than usual, and the mmu took a bit longer than 
expected, so it's more realistic to aim for 2.6.21.

The current kvm userspace interface has several deficiencies:

- open("/dev/kvm") returns a different object (a new vm) per invocation; 
this is "unusual" by Linux standards
- all vcpus share the same inode and struct file, which can cause 
scalability problems on very large smps.  This isn't a problem for 
current hardware, which has moderate core counts and huge vmexit 
latencies, not to mention a limit of one vcpu per vm, but I'd like to 
future-proof the interface.
- the KVM_VCPU_RUN ioctl() copies a needless chuck of data back and forth
- the PIO handlers communicate by means of registers (for single I/O) or 
virtual addresses (for string I/O).  Instead the values should be 
explicit fields in some structure, and physical addresses should be used 
to remove the need to translate addresses in userspace.
- the interrupt code still needs work to properly support the local apic 
with Windows guests.
- userspace must rely on delivered signals, which are slow, and cannot 
use queued signals (a la pselect()/ppoll()).

I propose the following as the new, stable, kvm api:

// open a handle to the kvm interface.  does not create a vm.
int kvm_fd = open("/dev/kvm", O_RDWR);

// the kvm interface supports just three ioctls:
ioctl(kvm_fd, KVM_GET_API_VERSION, 0);
ioctl(kvm_fd, KVM_GET_MSR_LIST, &msr_list);
int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);

// vm ioctls:
ioctl(vm_fd, KVM_VM_CREATE_MEMORY_REGION, &slot);
ioctl(vm_fd, KVM_VM_GET_DIRTY_LOG, &dirty_log);
int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, vcpu_slot_number);

// each vcpu is a separate fd/inode.  this ensures no cacheline bouncing
// when the kernel refcounts the inodes on syscalls.

// kvm_vcpu_area contains the exit reasons and associated data, and
// results returned by userspace to resolve the exit reasons.
struct kvm_vcpu_area *vcpu_area = mmap(NULL, PAGE_SIZE, ..., vcpu_fd, 0);

struct kvm_vcpu_area {
    u32 vcpu_area_size;
    u32 exit_reason;

    sigset_t sigmask;  // for use during vcpu execution

    union {
        struct kvm_pio pio;
        struct kvm_mmio mmio;
        struct kvm_cpuid cpuid;
        // etc.
        char padding[...];
    };

    struct kvm_irq irq; // acks from vm; injection from userspace
};


// vcpu ioctls

ioctl(vcpu_fd, KVM_VCPU_RUN, 0); // all comms through mmap()ed  vcpu_area
ioctl(vcpu_fd, KVM_VCPU_GET_REGS, &regs);
ioctl(vcpu_fd, KVM_VCPU_SET_REGS, &regs);
ioctl(vcpu_fd, KVM_VCPU_GET_SREGS, &sregs);
ioctl(vcpu_fd, KVM_VCPU_SET_SREGS, &sregs);
ioctl(vcpu_fd, KVM_VCPU_GET_MSRS, &msrs);
ioctl(vcpu_fd, KVM_VCPU_SET_MSRS, &msrs);
ioctl(vcpu_fd, KVM_VCPU_DEBUG_GUEST, &debug);


/* for KVM_VM_CREATE_MEMORY_REGION */
struct kvm_memory_region {
        __u32 slot;
        __u32 flags;
        __u64 guest_phys_addr;
        __u64 memory_size; /* bytes */
};

/* for kvm_memory_region::flags */
#define KVM_MEM_LOG_DIRTY_PAGES  1UL


#define KVM_EXIT_TYPE_FAIL_ENTRY 1
#define KVM_EXIT_TYPE_VM_EXIT    2

enum kvm_exit_reason {
        KVM_EXIT_UNKNOWN          = 0,
        KVM_EXIT_EXCEPTION        = 1,
        KVM_EXIT_IO               = 2,
        KVM_EXIT_CPUID            = 3,
        KVM_EXIT_DEBUG            = 4,
        KVM_EXIT_HLT              = 5,
        KVM_EXIT_MMIO             = 6,
        KVM_EXIT_IRQ_WINDOW_OPEN  = 7,
        KVM_EXIT_HYPERCALL        = 8,
};


/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs {
        // note: no vcpu!

        /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
        __u64 rax, rbx, rcx, rdx;
        __u64 rsi, rdi, rsp, rbp;
        __u64 r8,  r9,  r10, r11;
        __u64 r12, r13, r14, r15;
        __u64 rip, rflags;
};

struct kvm_segment {
        __u64 base;
        __u32 limit;
        __u16 selector;
        __u8  type;
        __u8  present, dpl, db, s, l, g, avl;
        __u8  unusable;
        __u8  padding;
};

struct kvm_dtable {
        __u64 base;
        __u16 limit;
        __u16 padding[3];
};

/* for KVM_VCPU_GET_SREGS and KVM_VCPU_SET_SREGS */
struct kvm_sregs {
        /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
        struct kvm_segment cs, ds, es, fs, gs, ss;
        struct kvm_segment tr, ldt;
        struct kvm_dtable gdt, idt;
        __u64 cr0, cr2, cr3, cr4, cr8;
};

struct kvm_msr_entry {
        __u32 index;
        __u32 reserved;
        __u64 data;
};

/* for KVM_VCPU_GET_MSRS and KVM_VCPU_SET_MSRS */
struct kvm_msrs {
        __u32 nmsrs; /* number of msrs in entries */
        __u32 padding;

        struct kvm_msr_entry entries[0];
};

/* for KVM_GET_MSR_INDEX_LIST */
struct kvm_msr_list {
        __u32 nmsrs; /* number of msrs in entries */
        __u32 indices[0];
};

struct kvm_breakpoint {
        __u32 enabled;
        __u32 padding;
        __u64 address;
};

/* for KVM_VCPU_DEBUG_GUEST */
struct kvm_debug_guest {
        __u32 enabled;
        __u32 singlestep;
        struct kvm_breakpoint breakpoints[4];
};

/* for KVM_VM_GET_DIRTY_LOG */
struct kvm_dirty_log {
        __u32 slot;
        __u32 padding;
        union {
                void __user *dirty_bitmap; /* one bit per page */
                __u64 padding;
        };
};


Comments and questions are welcome.


Thanks to Arnd Bergmann for his contributions and advice on this issue.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel

Reply via email to