Carsten Otte wrote:
> From: Carsten Otte <[EMAIL PROTECTED]>
>
> This driver provides access to virtual block devices. It uses its own
> make_request function, which passes the bio to a workqueue thread. The
> workqueue thread uses the diagnose hypervisor call to call the hosting Linux.
> The hypervisor code in host userspace uses aio_submit to initiate the IO.
> Once the IO is done, the host uses io_getevents and then generates an
> interrupt to the guest. The interrupt handler calls bio_endio.
> This device driver is currently architecture dependent. We intend to move the
> host API to hypercall instead of the diagnose instruction. Please review.
>
> Signed-off-by: Carsten Otte <[EMAIL PROTECTED]>
>
> +struct vdisk_device * vdisk_get_device_by_fd(int fd)
> +{
> + struct device *dev;
> + struct vdev *vdev;
> + struct vdisk_device *vdisk;
> +
> + dev = driver_find_device(&vdisk_driver.driver, NULL, (void*)(long)fd,
> __find_fd);
> + if (!dev)
> + return NULL;
> + vdev = to_vdev(dev);
> + vdisk = (struct vdisk_device *)vdev->drv_private;
> + return vdisk;
> +}
>
Is this the host file descriptor? If so, we want to use something more
abstract (if the host side is in the kernel, there will be no fd, and there
is no single fd if the device is implemented using >1 files (or <1 files)).
> +
> +#define VDISK_WRITE 1
> +#define VDISK_READ 0
> +
> +struct vdisk_request {
> + unsigned long buf;
> + unsigned long count;
> +};
> +
> +typedef struct vdisk_request (*vdisk_req_t)[VDISK_NR_REQ];
> +
> +struct vdisk_response {
> + unsigned long intparm;
> + unsigned long count;
> + unsigned long failed;
> +};
> +
> +typedef struct vdisk_response (*vdisk_irq_t)[VDISK_NR_RES];
> +
> +struct vdisk_device {
> + struct list_head head;
> + int blocksize;
> + long size;
> + int read_only;
> + struct gendisk *gd;
> + struct vdev *vdev;
> + spinlock_t lock;
> + struct rw_semaphore pump_sem;
> + int open_count;
> + int vfd;
> + struct vdisk_request (*submit_page)[VDISK_NR_REQ];
>
> + struct workqueue_struct *wq;
> + vdisk_irq_t irq_page;
> + wait_queue_head_t wait;
> +};
> +
> +struct vdisk_work {
> + struct work_struct work;
> + struct bio* bio;
> +};
> +
> +struct vdisk_elem {
> + unsigned int fd;
> + unsigned int command;
> + unsigned long offset;
> + unsigned long buffer;
> + unsigned long nbytes;
>
We'll want scatter/gather here.
> +};
> +
> +struct vdisk_iocb_container {
> + struct iocb iocb;
> + struct bio *bio;
> + struct vdisk_device *dev;
> + int ctx_index;
> + unsigned long context;
> + struct list_head list;
> +};
> +
> +// from aio_abi.h
> +typedef enum io_iocb_cmd {
> + IO_CMD_PREAD = 0,
> + IO_CMD_PWRITE = 1,
> +
> + IO_CMD_FSYNC = 2,
> + IO_CMD_FDSYNC = 3,
> +
> + IO_CMD_POLL = 5,
> + IO_CMD_NOOP = 6,
> +} io_iocb_cmd_t;
>
Our own commands, please. We need READV, WRITEV, and a barrier for
journalling filesystems. FDSYNC should work as a barrier, but is
wasteful. The FSYNC/FDSYNC distinction is meaningless. POLL/NOOP are
irrelevant.
> +static void vdisk_pump_bvecs(struct vdisk_device *dev, int op,
> + loff_t start_offset, int requestno,
> + struct bio* bio, struct bio_vec *(vectors[256]))
> +{
> + int i, rc;
> + loff_t offset = start_offset;
> + int nr_done = 0;
> + long size;
> + long flags=0;
> + DEFINE_WAIT(wait);
> +
> + spin_lock_irqsave(&dev->lock, flags);
> + prepare_to_wait_exclusive(&dev->wait, &wait,
> + TASK_UNINTERRUPTIBLE);
> +
> + while (nr_done < requestno) {
> + memset(dev->submit_page, 0, PAGE_SIZE);
> + for (i=nr_done; i<requestno; i++) {
> + (*dev->submit_page)[i-nr_done].buf =
> + (unsigned
> long)page_address(vectors[i]->bv_page) +
> + vectors[i]->bv_offset;
> + (*dev->submit_page)[i-nr_done].count =
> vectors[i]->bv_len;
> + }
> +
> + rc = diag_vdisk_submit_request(dev->vfd,
> + dev->submit_page,
> + op, offset,
> + requestno-nr_done, bio);
> +
> + if (rc < 0) {
> + // error case
> + size = 0;
> + for (i=0; i<(requestno-nr_done); i++)
> + size += (*dev->submit_page)[i].count;
> + bio_io_error(bio, size);
> + break;
> + }
> +
> + if (rc == requestno - nr_done)
> + // everything was submitted propper
> + break;
> +
> + if (rc) {
> + //request was partly submitted
> + for (i=0; i<rc; i++)
> + offset += (*dev->submit_page)[i].count;
> + nr_done += rc;
> + }
> + // we need to throttle IO, and retry submission later
> + spin_unlock_irqrestore(&dev->lock, flags);
> + io_schedule();
> + spin_lock_irqsave(&dev->lock, flags);
> + }
> + finish_wait(&dev->wait, &wait);
> + spin_unlock_irqrestore(&dev->lock, flags);
> + return;
> +}
>
We want to amortize the hypercall over multiple bios (but maybe you're
doing that -- I'm not 100% up to speed on the block layer).
> +
> +static void vdisk_pump_bio(struct work_struct *zw)
> +{
> + struct vdisk_work *work =
> + container_of(zw, struct vdisk_work, work);
> +
> + struct bio *bio = work->bio;
> + struct bio_vec *bvec;
> + struct bio_vec *(vectors[256]);
> + struct vdisk_device *dev = bio->bi_bdev->bd_disk->private_data;
> + int i, op, requestno=0;
> + loff_t start_offset, offset;
> +
> + BUG_ON(!dev);
> +
> + kfree (zw);
> +
> + if (bio_data_dir(bio))
> + op = VDISK_WRITE;
> + else
> + op = VDISK_READ;
> +
> + offset = start_offset = ((loff_t)bio->bi_sector)<<SECTOR_SHIFT;
> +
> + bio_for_each_segment(bvec, bio, i) {
> + if (bvec->bv_len & (dev->blocksize - 1)) //FIXME: Zugriff auf
> dev ohne lock
> + goto out;
> +
> + vectors[requestno] = bvec;
> + offset += bvec->bv_len;
> + requestno++;
> + if (requestno == 255) {
> + vdisk_pump_bvecs(dev, op, start_offset, requestno,
> + bio, vectors);
> + start_offset = offset;
> + requestno = 0;
> + }
> + }
> +
> + if (requestno)
> + vdisk_pump_bvecs(dev, op, start_offset, requestno, bio,
> vectors);
> +
> +out:
> + return;
> +}
> +
> +static int vdisk_make_request(request_queue_t *q, struct bio *bio)
> +{
> + struct vdisk_device *dev = bio->bi_bdev->bd_disk->private_data;
> + struct vdisk_work *work;
> + int rc;
> +
> + if (!dev) {
> + rc = -ENODEV;
> + goto out;
> + }
> +
> + if (bio_barrier(bio)) {
> + rc = -EOPNOTSUPP;
> + goto out;
> + }
> +
> + work = kmalloc(sizeof(struct vdisk_work), GFP_KERNEL);
> + if (!work) {
> + rc = -ENOMEM;
> + goto out;
> + }
> +
> + work->bio = bio;
> +
> + INIT_WORK(&work->work, vdisk_pump_bio);
> +
> + if (!queue_work(dev->wq, &work->work)) {
> + rc = -EIO;
> + kfree(work);
> + } else
> + rc = 0;
>
Any reason not to perform the work directly?
--
error compiling committee.c: too many arguments to function
-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
kvm-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/kvm-devel