On Friday 27 April 2007, Carsten Otte wrote:
> Add interface which allows a process to start a virtual machine.
 
Looks pretty good to me already.

As far as the kernel interfaces are concerned, it seems a lot closer
to lguest than to kvm at the moment (guest real address == host
process address, state is attached to the thread_info, ...).

Now for the nitpicking:

> +static DEFINE_MUTEX(s390host_init_mutex);
> +
> +static void s390host_get_data(struct s390host_data *data)
> +{
> +     atomic_inc(&data->count);
> +}
> +
> +void s390host_put_data(struct s390host_data *data)
> +{
> +     int cpu;
> +
> +     if (atomic_dec_return(&data->count))
> +             return;
> +
> +     for (cpu = 0; cpu < S390HOST_MAX_CPUS; cpu++)
> +             if (data->sie_io[cpu])
> +                     free_page((unsigned long)data->sie_io[cpu]);
> +
> +     if (data->sca_block)
> +             free_page((unsigned long)data->sca_block);
> +
> +     kfree(data);
> +}

These should probably use a struct kref instead of an atomic_t
to count the references.
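Untested sketch of what I mean, assuming the atomic_t is replaced by a
"struct kref kref" member that gets kref_init()ed where the data is
allocated:

static void s390host_release_data(struct kref *kref)
{
	struct s390host_data *data =
		container_of(kref, struct s390host_data, kref);
	int cpu;

	for (cpu = 0; cpu < S390HOST_MAX_CPUS; cpu++)
		if (data->sie_io[cpu])
			free_page((unsigned long)data->sie_io[cpu]);

	if (data->sca_block)
		free_page((unsigned long)data->sca_block);

	kfree(data);
}

static void s390host_get_data(struct s390host_data *data)
{
	kref_get(&data->kref);
}

void s390host_put_data(struct s390host_data *data)
{
	kref_put(&data->kref, s390host_release_data);
}

That also gets rid of the open-coded atomic_dec_return() test, and the
cleanup code is only reachable through kref_put().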

> +static unsigned long
> +s390host_create_io_area(unsigned long addr, unsigned long flags,
> +                     unsigned long io_addr, struct s390host_data *data)
> +{
> +     struct mm_struct *mm = current->mm;
> +     struct vm_area_struct *vma;
> +     unsigned long ret;
> +
> +     flags &= MAP_FIXED;
> +     addr = get_unmapped_area(NULL, addr, 2 * PAGE_SIZE, 0, flags);
> +
> +     if (addr & ~PAGE_MASK)
> +             return addr;
> +
> +     vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
> +
> +     if (!vma)
> +             return -ENOMEM;
> +
> +     vma->vm_mm = mm;
> +     vma->vm_start = addr;
> +     vma->vm_end = addr + 2 * PAGE_SIZE;
> +     vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED;
> +     vma->vm_flags |= VM_SHARED | VM_MAYSHARE | VM_DONTCOPY;
> +
> +#if 1        /* FIXME: write access until sys_s390host_sie interface is final */
> +     vma->vm_flags |= VM_WRITE | VM_MAYWRITE;
> +#endif
> +
> +     vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
> +     vma->vm_private_data = data;
> +     vma->vm_ops = &s390host_vmops;
> +
> +     down_write(&mm->mmap_sem);
> +     ret = insert_vm_struct(mm, vma);
> +     if (ret) {
> +             kmem_cache_free(vm_area_cachep, vma);
> +             goto out;
> +     }
> +     s390host_get_data(data);
> +     mm->total_vm += 2;
> +     vm_insert_page(vma, addr, virt_to_page(io_addr));
> +
> +     ret = split_vma(mm, vma, addr + PAGE_SIZE, 0);
> +     if (ret)
> +             goto out;
> +     s390host_get_data(data);
> +
> +     vma = find_vma(mm, addr + PAGE_SIZE);
> +     vma->vm_flags |= VM_WRITE | VM_MAYWRITE;
> +     vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
> +     vm_insert_page(vma, addr + PAGE_SIZE,
> +                    virt_to_page(io_addr + PAGE_SIZE));
> +     ret = addr;
> +out:
> +     up_write(&mm->mmap_sem);
> +     return ret;
> +}

This open-coded mmap looks a bit scary. I can't see anything wrong with it,
but it may conflict with a user application that does other strange
things with its memory maps.

Maybe you can either make this a real call to mmap(), or have it automatically
mapped using the vdso mechanism.
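For the mmap() route, a rough sketch of what I have in mind -- the names
s390host_mmap and s390host_fops are made up, and this assumes the io area
gets exposed through a character device or similar file:

static int s390host_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct s390host_data *data = file->private_data;

	if (vma->vm_end - vma->vm_start != 2 * PAGE_SIZE)
		return -EINVAL;

	vma->vm_flags |= VM_IO | VM_RESERVED | VM_DONTCOPY;
	vma->vm_private_data = data;
	vma->vm_ops = &s390host_vmops;
	s390host_get_data(data);

	return 0;
}

static const struct file_operations s390host_fops = {
	.owner	= THIS_MODULE,
	.mmap	= s390host_mmap,
};

The core mm code then takes care of setting up, splitting and tearing down
the vmas for you, and the read-only/read-write halves could simply be two
mmap() calls with different offsets from user space.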

> +long sys_s390host_add_cpu(unsigned long addr, unsigned long flags,
> +                       struct sie_block __user *sie_template)
> +{
> +     struct sie_block *sie_block;
> +     struct sie_io *sie_io;
> +     struct sca_block *sca_block;
> +     struct s390host_data *data = NULL;
> +     unsigned long ret;
> +     __u16 cpu;

__u16 is a type for user space interfaces. In the kernel, use u16.

> +     if (current_thread_info()->sie_cpu != -1)
> +             return -EINVAL;

-EBUSY?

> +     if (copy_from_user(&cpu, &sie_template->icpua, sizeof(u16)))
> +             return -EFAULT;

get_user()?
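I.e., assuming icpua is a 16-bit member:

	if (get_user(cpu, &sie_template->icpua))
		return -EFAULT;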

> +     if (cpu >= S390HOST_MAX_CPUS)
> +             return -EINVAL;
> +
> +     mutex_lock(&s390host_init_mutex);
> +
> +     data = get_s390host_context();
> +     if (!data) {
> +             ret = -ENOMEM;
> +             goto out_err;
> +     }
> +
> +     sca_block = data->sca_block;
> +     if (sca_block->mcn & (1UL << (S390HOST_MAX_CPUS - 1 - cpu))) {

test_bit()?
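E.g., assuming mcn is (or can safely be treated as) an unsigned long --
the bit numbering comes out the same as in your open-coded shift:

	if (test_bit(S390HOST_MAX_CPUS - 1 - cpu,
		     (unsigned long *)&sca_block->mcn)) {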

> +             ret = -EINVAL;

-EBUSY?

> +int sys_s390host_remove_cpu(void)
> +{
> +     struct sca_block *sca_block;
> +     int cpu;
> +
> +     cpu = current_thread_info()->sie_cpu;
> +     if (cpu == -1)
> +             return -EINVAL;
> +
> +     mutex_lock(&s390host_init_mutex);
> +     sca_block = current_thread_info()->s390host_data->sca_block;
> +     sca_block->mcn &= ~(1UL << (S390HOST_MAX_CPUS - 1 - cpu));
> +     current_thread_info()->sie_cpu = -1;
> +     mutex_unlock(&s390host_init_mutex);
> +     return 0;
> +}

Shouldn't this free the resources that were allocated by add_cpu?
If not, this syscall seems pretty useless and I'd simply leave it
out -- just wait until exit() to clean up.

> +     sie_io = current_thread_info()->s390host_data->sie_io[cpu];
> +
> +     if (action)
> +             ret = s390host_do_action(action, sie_io);
> +     if (ret)
> +             goto out_err;
> +     sie_kernel = &sie_io->sie_kernel;
> +     sie_user = &sie_io->sie_user;
> +
> +     save_fp_regs(&sie_kernel->host_fpregs);
> +     save_access_regs(sie_kernel->host_acrs);
> +     sie_user->guest_fpregs.fpc &= FPC_VALID_MASK;
> +     restore_fp_regs(&sie_user->guest_fpregs);
> +     restore_access_regs(sie_user->guest_acrs);
> +     memcpy(&sie_kernel->sie_block.gg14, &sie_user->gprs[14], 16);
> +again:
> +     if (need_resched())
> +             schedule();
> +
> +     sie_kernel->sie_block.icptcode = 0;
> +     ret = sie64a(sie_kernel);
> +     if (ret)
> +             goto out;
> +
> +     if (signal_pending(current)) {
> +             ret = -EINTR;
> +             goto out;
> +     }

Can't you handle interrupted system calls? I think it would be nicer
to return -ERESTARTSYS here so you don't have to go back to user
space after every signal.
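I.e., assuming the call can simply be reissued with the same arguments
once the signal has been dealt with:

	if (signal_pending(current)) {
		ret = -ERESTARTSYS;
		goto out;
	}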

> +struct sie_kernel {
> +     struct sie_block        sie_block;
> +     s390_fp_regs    host_fpregs;
> +     int             host_acrs[NUM_ACRS];
> +} __attribute__((packed,aligned(4096)));

Why packed?

> +#define SIE_UPDATE_PSW               (1UL << 0)
> +#define SIE_FLUSH_TLB                (1UL << 1)
> +#define SIE_ISKE             (1UL << 2)
> +#define SIE_SSKE             (1UL << 3)
> +#define SIE_BLOCK_UPDATE     (1UL << 4)
> +#define SIE_VSMXM_LOCAL_UPDATE       (1UL << 5)
> +#define SIE_VSMXM_DIST_UPDATE   (1UL << 6)

I find it much more readable to define these like

#define SIE_UPDATE_PSW          1
#define SIE_FLUSH_TLB           2
#define SIE_ISKE                4

especially when trying to interpret values in memory.

> +struct sie_user {
> +     struct sie_block        sie_block;
> +     psw_t                   psw;
> +     unsigned long           gprs[NUM_GPRS];
> +     s390_fp_regs            guest_fpregs;
> +     int                     guest_acrs[NUM_ACRS];
> +     struct sie_skey_parm    iske_parm;
> +     struct sie_skey_parm    sske_parm;
> +     int                     vsmxm_or_local;
> +     int                     vsmxm_and_local;
> +     int                     vsmxm_or;
> +     int                     vsmxm_and;
> +     int                     vsmxm_cpuid;
> +} __attribute__((packed,aligned(4096)));

Is this data structure extensible? If it is, you probably need
some sort of versioning information to make sure that user space
doesn't rely on fields that the kernel doesn't know about.
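One hypothetical way to do it, without claiming this is the right layout:
a version word that the kernel fills in when it sets up the area and that
user space checks before relying on newer fields, e.g.

#define SIE_USER_VERSION	1

struct sie_user {
	__u32			version;	/* set to SIE_USER_VERSION */
	struct sie_block	sie_block;
	/* ... rest as above ... */
} __attribute__((aligned(4096)));

(or put the version word at the end of the page if sie_block needs to
stay at offset 0).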

> +struct sca_entry {
> +     atomic_t scn;
> +     __u64   reserved;
> +     __u64   sda;
> +     __u64   reserved2[2];
> +}__attribute__((packed));

Is this a hardware data structure? If not, you shouldn't pack it,
because the first member is only 32 bits wide and consequently all
of the following 64-bit members end up unaligned.
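Just to illustrate what the compiler does with and without packed
(assuming a 4-byte atomic_t and the usual 8-byte alignment for __u64):

	struct sca_entry {		/* packed:    natural:  */
		atomic_t scn;		/* offset  0  offset  0 */
		__u64	reserved;	/* offset  4  offset  8 */
		__u64	sda;		/* offset 12  offset 16 */
		__u64	reserved2[2];	/* offset 20  offset 24 */
	};				/* size   36  size   40 */

so with packed, every __u64 member sits on a 4-byte boundary.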

> +/* function definitions */
> +extern int sie64a(struct sie_kernel *);
> +extern void s390host_put_data(struct s390host_data *);

These should be inside of '#ifdef __KERNEL__', like all declarations that user
space doesn't need to see.
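I.e.:

#ifdef __KERNEL__
/* function definitions */
extern int sie64a(struct sie_kernel *);
extern void s390host_put_data(struct s390host_data *);
#endif /* __KERNEL__ */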

> @@ -52,6 +53,8 @@ struct thread_info {
>       unsigned int            cpu;            /* current CPU */
>       int                     preempt_count;  /* 0 => preemptable, <0 => BUG */
>       struct restart_block    restart_block;
> +     struct s390host_data    *s390host_data; /* s390host data */
> +     int                     sie_cpu;        /* sie cpu number */
>  };

What happens to these variables on execve?

        Arnd <><
