This is another proposed patch that lets lguest block devices use mmap
instead of file I/O.
I use this several ways:
1. regression testing. I can boot a guest, have it mess around,
literally rm -rf all of / with no harm done
2. lots of guests. This saves overhead as lots of mmap'ed pages are shared
3. lockups. I'm getting a deadlock or lockup when I have lots of
guests doing block I/O; I want to see if this fixes it.
So, comments on this are welcome, and if somebody wants to do it better, I
have no problems with that.
ron
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 3be8ab2..7ac7ca8 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -42,7 +42,6 @@
#include "linux/virtio_blk.h"
#include "linux/virtio_console.h"
#include "linux/virtio_ring.h"
-#include "asm-x86/bootparam.h"
/*L:110 We can ignore the 39 include files we need for this program, but I do
* want to draw attention to the use of kernel-style types.
*
@@ -55,6 +54,7 @@ typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
/*:*/
+#include "asm-x86/bootparam.h"
#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
#define NET_PEERNUM 1
@@ -1405,38 +1405,102 @@ struct vblk_info
/* IO thread writes to this file descriptor to mark it done, then
* Launcher triggers interrupt to Guest. */
int done_fd;
+
+ /* If the file is mmap'ed, this is the pointer to the data */
+ void *map;
};
/*L:210
- * The Disk
+ * Mmap File IO
*
- * Remember that the block device is handled by a separate I/O thread. We head
- * straight into the core of that thread here:
+ * This variant serves the block device from a private mmap'ed copy of the file.
+ * Very useful when you're running lots of guests, or don't want guest
+ * changes making their way back to the underlying file system.
*/
-static bool service_io(struct device *dev)
+
+/* Serve one request from the private mapping; returns the used length handed to add_used(). */
+static int mmapfileio(struct vblk_info *vblk, struct virtio_blk_outhdr *out, struct iovec *iov, int out_num, int in_num, u8 *in)
{
- struct vblk_info *vblk = dev->priv;
- unsigned int head, out_num, in_num, wlen;
- int ret;
- u8 *in;
- struct virtio_blk_outhdr *out;
- struct iovec iov[dev->vq->vring.num];
+ unsigned int wlen = sizeof(*in);
off64_t off;
+ u8 *data = vblk->map;
+ int i;
+ ssize_t iolen;
+ int iswrite = out->type & VIRTIO_BLK_T_OUT;
- /* See if there's a request waiting. If not, nothing to do. */
- head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
- if (head == dev->vq->vring.num)
- return false;
- /* Every block request should contain at least one output buffer
- * (detailing the location on disk and the type of request) and one
- * input buffer (to hold the result). */
- if (out_num == 0 || in_num == 0)
- errx(1, "Bad virtblk cmd %u out=%u in=%u",
- head, out_num, in_num);
+ off = out->sector * 512;
+
+ /* The starting offset must lie inside the image; errno is not involved, so errx */
+ if (off >= vblk->len)
+ errx(1, "Bad offset %llu vs %llu", off, vblk->len);
+
+ data += off;
+
+ verbose("mmapfileio: %s at offset %llu(%p), ", iswrite ? "WRITE" : "READ",
+ off, data);
+
+ /* if we do no IO whatsoever we'll call it an error */
+ *in = VIRTIO_BLK_S_IOERR;
+
+ if (iswrite) {
+ for(i = 1; i < out_num; i++){
+ iolen = iov[i].iov_len;
+ /* A write may end exactly at the end of the image, but not beyond it */
+ if ((off + iolen) > vblk->len)
+ errx(1, "Write past end of private block %llu + %zd",
+ off, iolen);
+ verbose("%p, %p, %zd, ", data, iov[i].iov_base, iolen);
+ memcpy(data, iov[i].iov_base, iolen);
+ off += iolen;
+ data += iolen;
+ /* wlen stays at sizeof(*in): a write puts only the status byte in Guest memory */
+ /* If you were out of range, you're dead before you get here */
+ *in = VIRTIO_BLK_S_OK;
+ }
+ } else {
+ for(i = 1; i < in_num; i++){
+ iolen = iov[i].iov_len;
+ /* read past end?
+ * Just trim the read iolen and we're done
+ */
+ if ((off + iolen) > vblk->len){
+ warnx("Read past end of private block %llu + %zd", off, iolen);
+ iolen = vblk->len - off;
+ }
+ if (iolen <= 0)
+ break;
+ verbose("%p, %p, %zd, ", iov[i].iov_base, data, iolen);
+ memcpy(iov[i].iov_base, data, iolen);
+ off += iolen;
+ data += iolen;
+ wlen += iolen;
+ /* if we do even one IO, we're happy */
+ *in = VIRTIO_BLK_S_OK;
+ }
+ }
+
+ verbose("mmapfileio returns %u\n", wlen);
+ return wlen;
+
+}
+/*L:210
+ * Block File IO
+ *
+ * This block device does direct file I/O to the file backing the virtual block device.
+ * This IO is persistent, i.e. when the guest is done, the device is changed.
+ * If two guests are running and one changes a block, and the other later reads it,
+ * the other guest will see changes made by the first. This property is very exciting
+ * for metadata.
+ */
+
+static int
+blockfileio(struct vblk_info *vblk, struct virtio_blk_outhdr *out, struct iovec *iov, int out_num, int in_num, u8 *in)
+{
+ unsigned int wlen;
+ int ret;
+ off64_t off;
- out = convert(&iov[0], struct virtio_blk_outhdr);
- in = convert(&iov[out_num+in_num-1], u8);
off = out->sector * 512;
/* The block device implements "barriers", where the Guest indicates
@@ -1446,15 +1510,8 @@ static bool service_io(struct device *dev)
if (out->type & VIRTIO_BLK_T_BARRIER)
fdatasync(vblk->fd);
- /* In general the virtio block driver is allowed to try SCSI commands.
- * It'd be nice if we supported eject, for example, but we don't. */
- if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
- fprintf(stderr, "Scsi commands unsupported\n");
- *in = VIRTIO_BLK_S_UNSUPP;
- wlen = sizeof(*in);
- } else if (out->type & VIRTIO_BLK_T_OUT) {
- /* Write */
-
+ /* Write */
+ if (out->type & VIRTIO_BLK_T_OUT) {
/* Move to the right location in the block file. This can fail
* if they try to write past end. */
if (lseek64(vblk->fd, off, SEEK_SET) != off)
@@ -1492,7 +1549,51 @@ static bool service_io(struct device *dev)
*in = VIRTIO_BLK_S_IOERR;
}
}
+ printf("block io returns %d\n", wlen);
+ return wlen;
+}
+
+/*L:210
+ * The Disk
+ *
+ * Remember that the block device is handled by a separate I/O thread. We head
+ * straight into the core of that thread here:
+ */
+static bool service_io(struct device *dev)
+{
+ struct vblk_info *vblk = dev->priv;
+ unsigned int head, out_num, in_num, wlen;
+ u8 *in;
+ struct virtio_blk_outhdr *out;
+ struct iovec iov[dev->vq->vring.num];
+
+ /* See if there's a request waiting. If not, nothing to do. */
+ head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
+ if (head == dev->vq->vring.num)
+ return false;
+
+ /* Every block request should contain at least one output buffer
+ * (detailing the location on disk and the type of request) and one
+ * input buffer (to hold the result). */
+ if (out_num == 0 || in_num == 0)
+ errx(1, "Bad virtblk cmd %u out=%u in=%u",
+ head, out_num, in_num);
+
+ out = convert(&iov[0], struct virtio_blk_outhdr);
+ in = convert(&iov[out_num+in_num-1], u8);
+ /* In general the virtio block driver is allowed to try SCSI commands.
+ * It'd be nice if we supported eject, for example, but we don't. */
+ if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
+ fprintf(stderr, "Scsi commands unsupported\n");
+ *in = VIRTIO_BLK_S_UNSUPP;
+ wlen = sizeof(*in);
+ } else if (! vblk->map) {
+ wlen = blockfileio(vblk, out, iov, out_num, in_num, in);
+ } else {
+ wlen = mmapfileio(vblk, out, iov, out_num, in_num, in);
+ }
+
/* We can't trigger an IRQ, because we're not the Launcher. It does
* that when we tell it we're done. */
add_used(dev->vq, head, wlen);
@@ -1554,7 +1655,7 @@ static void handle_virtblk_output(int fd, struct virtqueue *vq)
}
/*L:198 This actually sets up a virtual block device. */
-static void setup_block_file(const char *filename)
+static void setup_block_file(const char *filename, int mapfile)
{
int p[2];
struct device *dev;
@@ -1578,6 +1679,22 @@ static void setup_block_file(const char *filename)
vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
vblk->len = lseek64(vblk->fd, 0, SEEK_END);
+ /* If we've been asked to mmap it, do so. We're not going to close
+ * the fd, however, as it's always possible we'll want it later.
+ * Why mmap?
+ * 1. You're running lots and lots of guests, and want to share a lot.
+ * 2. You want to run the guest but not really modify the disk image.
+ */
+ if (mapfile) {
+ vblk->map = mmap(NULL, page_align(vblk->len),
+ PROT_READ|PROT_WRITE, MAP_PRIVATE, vblk->fd, 0);
+ /* We don't want to fall back to IO if mmap fails. The semantics
+ * are utterly different.
+ */
+ if (vblk->map == MAP_FAILED)
+ err(1, "Mmaping physical block device");
+ }
+
/* We support barriers. */
add_feature(dev, VIRTIO_BLK_F_BARRIER);
@@ -1683,6 +1800,7 @@ static struct option opts[] = {
{ "verbose", 0, NULL, 'v' },
{ "tunnet", 1, NULL, 't' },
{ "block", 1, NULL, 'b' },
+ { "pblock", 1, NULL, 'p' },
{ "initrd", 1, NULL, 'i' },
{ NULL },
};
@@ -1755,7 +1873,10 @@ int main(int argc, char *argv[])
setup_tun_net(optarg);
break;
case 'b':
- setup_block_file(optarg);
+ setup_block_file(optarg, 0);
+ break;
+ case 'p':
+ setup_block_file(optarg, 1);
break;
case 'i':
initrd_name = optarg;
_______________________________________________
Lguest mailing list
[email protected]
https://ozlabs.org/mailman/listinfo/lguest